diff options
Diffstat (limited to 'tools')
706 files changed, 32003 insertions, 12461 deletions
diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index ccc13e991376..cd8420e8c3ad 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -701,18 +701,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) diff --git a/tools/arch/x86/dell-uart-backlight-emulator/.gitignore b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore new file mode 100644 index 000000000000..5c8cad8d72b9 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore @@ -0,0 +1 @@ +dell-uart-backlight-emulator diff --git a/tools/arch/x86/dell-uart-backlight-emulator/Makefile b/tools/arch/x86/dell-uart-backlight-emulator/Makefile new file mode 100644 index 000000000000..6ea1d9fd534b --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for Intel Software Defined Silicon provisioning tool + +dell-uart-backlight-emulator: dell-uart-backlight-emulator.c + +BINDIR ?= /usr/bin + +override CFLAGS += -O2 -Wall + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +.PHONY : clean +clean : + @rm -f dell-uart-backlight-emulator + +install : dell-uart-backlight-emulator + install -d $(DESTDIR)$(BINDIR) + install -m 755 -p dell-uart-backlight-emulator $(DESTDIR)$(BINDIR)/dell-uart-backlight-emulator diff --git a/tools/arch/x86/dell-uart-backlight-emulator/README b/tools/arch/x86/dell-uart-backlight-emulator/README new file mode 100644 index 000000000000..c0d8e52046ee --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/README @@ -0,0 +1,46 @@ +Emulator for DELL0501 UART attached backlight controller +-------------------------------------------------------- + +Dell All In One (AIO) models released after 2017 use a backlight controller +board connected to an UART. + +In DSDT this uart port will be defined as: + + Name (_HID, "DELL0501") + Name (_CID, EisaId ("PNP0501") + +With the DELL0501 indicating that we are dealing with an UART with +the backlight controller board attached. + +This small emulator allows testing +the drivers/platform/x86/dell/dell-uart-backlight.c driver without access +to an actual Dell All In One. + +This requires: +1. A (desktop) PC with a 16550 UART on the motherboard and a standard DB9 + connector connected to this UART. +2. A DB9 NULL modem cable. +3. A second DB9 serial port, this can e.g. be a USB to serial converter + with a DB9 connector plugged into the same desktop PC. +4. A DSDT overlay for the desktop PC replacing the _HID of the 16550 UART + ACPI Device()Â with "DELL0501" and adding a _CID of "PNP0501", see + DSDT.patch for an example of the necessary DSDT changes. + +With everything setup and the NULL modem cable connected between +the 2 serial ports run: + +./dell-uart-backlight-emulator <path-to-/dev/tty*S#-for-second-port> + +For example when using an USB to serial converter for the second port: + +./dell-uart-backlight-emulator /dev/ttyUSB0 + +And then (re)load the dell-uart-backlight driver: + +sudo rmmod dell-uart-backlight; sudo modprobe dell-uart-backlight dyndbg + +After this check "dmesg" to see if the driver correctly received +the firmware version string from the emulator. If this works there +should be a /sys/class/backlight/dell_uart_backlight/ directory now +and writes to the brightness or bl_power files should be reflected +by matching output from the emulator. diff --git a/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c new file mode 100644 index 000000000000..655b6c96d8cf --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dell AIO Serial Backlight board emulator for testing + * the Linux dell-uart-backlight driver. + * + * Copyright (C) 2024 Hans de Goede <hansg@kernel.org> + */ +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> +#include <termios.h> +#include <unistd.h> + +int serial_fd; +int brightness = 50; + +static unsigned char dell_uart_checksum(unsigned char *buf, int len) +{ + unsigned char val = 0; + + while (len-- > 0) + val += buf[len]; + + return val ^ 0xff; +} + +/* read() will return -1 on SIGINT / SIGTERM causing the mainloop to cleanly exit */ +void signalhdlr(int signum) +{ +} + +int main(int argc, char *argv[]) +{ + struct sigaction sigact = { .sa_handler = signalhdlr }; + unsigned char buf[4], csum, response[32]; + const char *version_str = "PHI23-V321"; + struct termios tty, saved_tty; + int ret, idx, len = 0; + + if (argc != 2) { + fprintf(stderr, "Invalid or missing arguments\n"); + fprintf(stderr, "Usage: %s <serial-port>\n", argv[0]); + return 1; + } + + serial_fd = open(argv[1], O_RDWR | O_NOCTTY); + if (serial_fd == -1) { + fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno)); + return 1; + } + + ret = tcgetattr(serial_fd, &tty); + if (ret == -1) { + fprintf(stderr, "Error getting tcattr: %s\n", strerror(errno)); + goto out_close; + } + saved_tty = tty; + + cfsetspeed(&tty, 9600); + cfmakeraw(&tty); + tty.c_cflag &= ~CSTOPB; + tty.c_cflag &= ~CRTSCTS; + tty.c_cflag |= CLOCAL | CREAD; + + ret = tcsetattr(serial_fd, TCSANOW, &tty); + if (ret == -1) { + fprintf(stderr, "Error setting tcattr: %s\n", strerror(errno)); + goto out_restore; + } + + sigaction(SIGINT, &sigact, 0); + sigaction(SIGTERM, &sigact, 0); + + idx = 0; + while (read(serial_fd, &buf[idx], 1) == 1) { + if (idx == 0) { + switch (buf[0]) { + /* 3 MSB bits: cmd-len + 01010 SOF marker */ + case 0x6a: len = 3; break; + case 0x8a: len = 4; break; + default: + fprintf(stderr, "Error unexpected first byte: 0x%02x\n", buf[0]); + continue; /* Try to sync up with sender */ + } + } + + /* Process msg when len bytes have been received */ + if (idx != (len - 1)) { + idx++; + continue; + } + + /* Reset idx for next command */ + idx = 0; + + csum = dell_uart_checksum(buf, len - 1); + if (buf[len - 1] != csum) { + fprintf(stderr, "Error checksum mismatch got 0x%02x expected 0x%02x\n", + buf[len - 1], csum); + continue; + } + + switch ((buf[0] << 8) | buf[1]) { + case 0x6a06: /* cmd = 0x06, get version */ + len = strlen(version_str); + strcpy((char *)&response[2], version_str); + printf("Get version, reply: %s\n", version_str); + break; + case 0x8a0b: /* cmd = 0x0b, set brightness */ + if (buf[2] > 100) { + fprintf(stderr, "Error invalid brightness param: %d\n", buf[2]); + continue; + } + + len = 0; + brightness = buf[2]; + printf("Set brightness %d\n", brightness); + break; + case 0x6a0c: /* cmd = 0x0c, get brightness */ + len = 1; + response[2] = brightness; + printf("Get brightness, reply: %d\n", brightness); + break; + case 0x8a0e: /* cmd = 0x0e, set backlight power */ + if (buf[2] != 0 && buf[2] != 1) { + fprintf(stderr, "Error invalid set power param: %d\n", buf[2]); + continue; + } + + len = 0; + printf("Set power %d\n", buf[2]); + break; + default: + fprintf(stderr, "Error unknown cmd 0x%04x\n", + (buf[0] << 8) | buf[1]); + continue; + } + + /* Respond with <total-len> <cmd> <data...> <csum> */ + response[0] = len + 3; /* response length in bytes */ + response[1] = buf[1]; /* ack cmd */ + csum = dell_uart_checksum(response, len + 2); + response[len + 2] = csum; + ret = write(serial_fd, response, response[0]); + if (ret != (response[0])) + fprintf(stderr, "Error writing %d bytes: %d\n", + response[0], ret); + } + + ret = 0; +out_restore: + tcsetattr(serial_fd, TCSANOW, &saved_tty); +out_close: + close(serial_fd); + return ret; +} diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index a61051400311..253690eb3c26 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -35,6 +35,8 @@ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ +/* x86-64 REX2 prefix */ +#define INAT_PFX_REX2 16 /* 0xD5 */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -50,7 +52,7 @@ /* Legacy prefix */ #define INAT_PFX_OFFS 0 -#define INAT_PFX_BITS 4 +#define INAT_PFX_BITS 5 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ @@ -77,6 +79,9 @@ #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) +#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) +#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) +#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -128,6 +133,11 @@ static inline int inat_is_rex_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } +static inline int inat_is_rex2_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX2; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) @@ -227,4 +237,9 @@ static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } + +static inline int inat_evex_scalable(insn_attr_t attr) +{ + return attr & INAT_EVEX_SCALABLE; +} #endif diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 65c0d9ce1e29..0e5abd896ad4 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -112,10 +112,15 @@ struct insn { #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) -#define X86_REX_W(rex) ((rex) & 8) -#define X86_REX_R(rex) ((rex) & 4) -#define X86_REX_X(rex) ((rex) & 2) -#define X86_REX_B(rex) ((rex) & 1) +#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ +#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ +#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ +#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ + +#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ +#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ +#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ +#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ @@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); +static inline int insn_is_rex2(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return insn->rex_prefix.nbytes == 2; +} + +static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) +{ + return X86_REX2_M(insn->rex_prefix.bytes[1]); +} + static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_vex_w_bit(struct insn *insn) +{ + if (insn->vex_prefix.nbytes < 3) + return 0; + return X86_VEX_W(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index 2cd92761f171..766a5d26f534 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -43,7 +43,7 @@ #define METER_CERT_MAX_SIZE 4096 #define STATE_MAX_NUM_LICENSES 16 #define STATE_MAX_NUM_IN_BUNDLE (uint32_t)8 -#define METER_MAX_NUM_BUNDLES 8 +#define FEAT_LEN 5 /* 4 plus NUL terminator */ #define __round_mask(x, y) ((__typeof__(x))((y) - 1)) #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) @@ -154,11 +154,12 @@ struct bundle_encoding { }; struct meter_certificate { - uint32_t block_signature; - uint32_t counter_unit; + uint32_t signature; + uint32_t version; uint64_t ppin; + uint32_t counter_unit; uint32_t bundle_length; - uint32_t reserved; + uint64_t reserved; uint32_t mmrc_encoding; uint32_t mmrc_counter; }; @@ -167,6 +168,11 @@ struct bundle_encoding_counter { uint32_t encoding; uint32_t counter; }; +#define METER_BUNDLE_SIZE sizeof(struct bundle_encoding_counter) +#define BUNDLE_COUNT(length) ((length) / METER_BUNDLE_SIZE) +#define METER_MAX_NUM_BUNDLES \ + ((METER_CERT_MAX_SIZE - sizeof(struct meter_certificate)) / \ + sizeof(struct bundle_encoding_counter)) struct sdsi_dev { struct sdsi_regs regs; @@ -179,6 +185,7 @@ struct sdsi_dev { enum command { CMD_SOCKET_INFO, CMD_METER_CERT, + CMD_METER_CURRENT_CERT, CMD_STATE_CERT, CMD_PROV_AKC, CMD_PROV_CAP, @@ -316,24 +323,27 @@ static char *content_type(uint32_t type) } } -static void get_feature(uint32_t encoding, char *feature) +static void get_feature(uint32_t encoding, char feature[5]) { char *name = (char *)&encoding; + feature[4] = '\0'; feature[3] = name[0]; feature[2] = name[1]; feature[1] = name[2]; feature[0] = name[3]; } -static int sdsi_meter_cert_show(struct sdsi_dev *s) +static int sdsi_meter_cert_show(struct sdsi_dev *s, bool show_current) { char buf[METER_CERT_MAX_SIZE] = {0}; struct bundle_encoding_counter *bec; struct meter_certificate *mc; uint32_t count = 0; FILE *cert_ptr; + char *cert_fname; int ret, size; + char name[FEAT_LEN]; ret = sdsi_update_registers(s); if (ret) @@ -341,7 +351,6 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) if (!s->regs.en_features.sdsi) { fprintf(stderr, "SDSi feature is present but not enabled.\n"); - fprintf(stderr, " Unable to read meter certificate\n"); return -1; } @@ -356,15 +365,17 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) return ret; } - cert_ptr = fopen("meter_certificate", "r"); + cert_fname = show_current ? "meter_current" : "meter_certificate"; + cert_ptr = fopen(cert_fname, "r"); + if (!cert_ptr) { - perror("Could not open 'meter_certificate' file"); + fprintf(stderr, "Could not open '%s' file: %s", cert_fname, strerror(errno)); return -1; } size = fread(buf, 1, sizeof(buf), cert_ptr); if (!size) { - fprintf(stderr, "Could not read 'meter_certificate' file\n"); + fprintf(stderr, "Could not read '%s' file\n", cert_fname); fclose(cert_ptr); return -1; } @@ -375,32 +386,39 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("\n"); printf("Meter certificate for device %s\n", s->dev_name); printf("\n"); - printf("Block Signature: 0x%x\n", mc->block_signature); - printf("Count Unit: %dms\n", mc->counter_unit); - printf("PPIN: 0x%lx\n", mc->ppin); - printf("Feature Bundle Length: %d\n", mc->bundle_length); - printf("MMRC encoding: %d\n", mc->mmrc_encoding); - printf("MMRC counter: %d\n", mc->mmrc_counter); - if (mc->bundle_length % 8) { + + get_feature(mc->signature, name); + printf("Signature: %s\n", name); + + printf("Version: %d\n", mc->version); + printf("Count Unit: %dms\n", mc->counter_unit); + printf("PPIN: 0x%lx\n", mc->ppin); + printf("Feature Bundle Length: %d\n", mc->bundle_length); + + get_feature(mc->mmrc_encoding, name); + printf("MMRC encoding: %s\n", name); + + printf("MMRC counter: %d\n", mc->mmrc_counter); + if (mc->bundle_length % METER_BUNDLE_SIZE) { fprintf(stderr, "Invalid bundle length\n"); return -1; } - if (mc->bundle_length > METER_MAX_NUM_BUNDLES * 8) { - fprintf(stderr, "More than %d bundles: %d\n", - METER_MAX_NUM_BUNDLES, mc->bundle_length / 8); + if (mc->bundle_length > METER_MAX_NUM_BUNDLES * METER_BUNDLE_SIZE) { + fprintf(stderr, "More than %ld bundles: actual %ld\n", + METER_MAX_NUM_BUNDLES, BUNDLE_COUNT(mc->bundle_length)); return -1; } - bec = (void *)(mc) + sizeof(mc); + bec = (struct bundle_encoding_counter *)(mc + 1); - printf("Number of Feature Counters: %d\n", mc->bundle_length / 8); - while (count++ < mc->bundle_length / 8) { - char feature[5]; + printf("Number of Feature Counters: %ld\n", BUNDLE_COUNT(mc->bundle_length)); + while (count < BUNDLE_COUNT(mc->bundle_length)) { + char feature[FEAT_LEN]; - feature[4] = '\0'; get_feature(bec[count].encoding, feature); printf(" %s: %d\n", feature, bec[count].counter); + ++count; } return 0; @@ -480,7 +498,7 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) sizeof(*lki) + // size of the license key info offset; // offset to this blob content struct bundle_encoding *bundle = (void *)(lbc) + sizeof(*lbc); - char feature[5]; + char feature[FEAT_LEN]; uint32_t i; printf(" Blob %d:\n", count - 1); @@ -493,8 +511,6 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) printf(" Blob revision ID: %u\n", lbc->rev_id); printf(" Number of Features: %u\n", lbc->num_bundles); - feature[4] = '\0'; - for (i = 0; i < min(lbc->num_bundles, STATE_MAX_NUM_IN_BUNDLE); i++) { get_feature(bundle[i].encoding, feature); printf(" Feature %d: %s\n", i, feature); @@ -725,7 +741,7 @@ static void sdsi_free_dev(struct sdsi_dev *s) static void usage(char *prog) { - printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m] [-a FILE] [-c FILE]]\n", prog); + printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m | -C] [-a FILE] [-c FILE]\n", prog); } static void show_help(void) @@ -734,8 +750,9 @@ static void show_help(void) printf(" %-18s\t%s\n", "-l, --list", "list available On Demand devices"); printf(" %-18s\t%s\n", "-d, --devno DEVNO", "On Demand device number"); printf(" %-18s\t%s\n", "-i, --info", "show socket information"); - printf(" %-18s\t%s\n", "-s, --state", "show state certificate"); - printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate"); + printf(" %-18s\t%s\n", "-s, --state", "show state certificate data"); + printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate data"); + printf(" %-18s\t%s\n", "-C, --meter_current", "show live unattested meter data"); printf(" %-18s\t%s\n", "-a, --akc FILE", "provision socket with AKC FILE"); printf(" %-18s\t%s\n", "-c, --cap FILE>", "provision socket with CAP FILE"); } @@ -751,21 +768,22 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { - {"akc", required_argument, 0, 'a'}, - {"cap", required_argument, 0, 'c'}, - {"devno", required_argument, 0, 'd'}, - {"help", no_argument, 0, 'h'}, - {"info", no_argument, 0, 'i'}, - {"list", no_argument, 0, 'l'}, - {"meter", no_argument, 0, 'm'}, - {"state", no_argument, 0, 's'}, - {0, 0, 0, 0 } + {"akc", required_argument, 0, 'a'}, + {"cap", required_argument, 0, 'c'}, + {"devno", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"info", no_argument, 0, 'i'}, + {"list", no_argument, 0, 'l'}, + {"meter", no_argument, 0, 'm'}, + {"meter_current", no_argument, 0, 'C'}, + {"state", no_argument, 0, 's'}, + {0, 0, 0, 0 } }; progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+a:c:d:hilms", long_options, + while ((opt = getopt_long_only(argc, argv, "+a:c:d:hilmCs", long_options, &option_index)) != -1) { switch (opt) { case 'd': @@ -781,6 +799,9 @@ int main(int argc, char *argv[]) case 'm': command = CMD_METER_CERT; break; + case 'C': + command = CMD_METER_CURRENT_CERT; + break; case 's': command = CMD_STATE_CERT; break; @@ -819,7 +840,10 @@ int main(int argc, char *argv[]) ret = sdsi_read_reg(s); break; case CMD_METER_CERT: - ret = sdsi_meter_cert_show(s); + ret = sdsi_meter_cert_show(s, false); + break; + case CMD_METER_CURRENT_CERT: + ret = sdsi_meter_cert_show(s, true); break; case CMD_STATE_CERT: ret = sdsi_state_cert_show(s); diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index ada4b4a79dd4..a43b37346a22 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -185,6 +185,17 @@ found: if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; + } else if (inat_is_rex2_prefix(attr)) { + insn_set_byte(&insn->rex_prefix, 0, b); + b = peek_nbyte_next(insn_byte_t, insn, 1); + insn_set_byte(&insn->rex_prefix, 1, b); + insn->rex_prefix.nbytes = 2; + insn->next_byte += 2; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + insn->rex_prefix.got = 1; + goto vex_end; } } insn->rex_prefix.got = 1; @@ -283,6 +294,10 @@ int insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); + /* SCALABLE EVEX uses p bits to encode operand size */ + if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && + p == INAT_PFX_OPNDSZ) + insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { @@ -294,6 +309,20 @@ int insn_get_opcode(struct insn *insn) goto end; } + /* Check if there is REX2 prefix or not */ + if (insn_is_rex2(insn)) { + if (insn_rex2_m_bit(insn)) { + /* map 1 is escape 0x0f */ + insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); + + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); + } else { + insn->attr = inat_get_opcode_attribute(op); + } + goto end; + } + insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 12af572201a2..caedb3ef6688 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -23,6 +23,7 @@ # # AVX Superscripts # (ev): this opcode requires EVEX prefix. +# (es): this opcode requires EVEX prefix and is SCALABALE. # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. @@ -33,6 +34,10 @@ # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. +# +# REX2 Prefix +# - (!REX2): REX2 is not allowed +# - (REX2): REX2 variant e.g. JMPABS Table: one byte opcode Referrer: @@ -148,7 +153,7 @@ AVXcode: 65: SEG=GS (Prefix) 66: Operand-Size (Prefix) 67: Address-Size (Prefix) -68: PUSH Iz (d64) +68: PUSH Iz 69: IMUL Gv,Ev,Iz 6a: PUSH Ib (d64) 6b: IMUL Gv,Ev,Ib @@ -157,22 +162,22 @@ AVXcode: 6e: OUTS/OUTSB DX,Xb 6f: OUTS/OUTSW/OUTSD DX,Xz # 0x70 - 0x7f -70: JO Jb -71: JNO Jb -72: JB/JNAE/JC Jb -73: JNB/JAE/JNC Jb -74: JZ/JE Jb -75: JNZ/JNE Jb -76: JBE/JNA Jb -77: JNBE/JA Jb -78: JS Jb -79: JNS Jb -7a: JP/JPE Jb -7b: JNP/JPO Jb -7c: JL/JNGE Jb -7d: JNL/JGE Jb -7e: JLE/JNG Jb -7f: JNLE/JG Jb +70: JO Jb (!REX2) +71: JNO Jb (!REX2) +72: JB/JNAE/JC Jb (!REX2) +73: JNB/JAE/JNC Jb (!REX2) +74: JZ/JE Jb (!REX2) +75: JNZ/JNE Jb (!REX2) +76: JBE/JNA Jb (!REX2) +77: JNBE/JA Jb (!REX2) +78: JS Jb (!REX2) +79: JNS Jb (!REX2) +7a: JP/JPE Jb (!REX2) +7b: JNP/JPO Jb (!REX2) +7c: JL/JNGE Jb (!REX2) +7d: JNL/JGE Jb (!REX2) +7e: JLE/JNG Jb (!REX2) +7f: JNLE/JG Jb (!REX2) # 0x80 - 0x8f 80: Grp1 Eb,Ib (1A) 81: Grp1 Ev,Iz (1A) @@ -208,24 +213,24 @@ AVXcode: 9e: SAHF 9f: LAHF # 0xa0 - 0xaf -a0: MOV AL,Ob -a1: MOV rAX,Ov -a2: MOV Ob,AL -a3: MOV Ov,rAX -a4: MOVS/B Yb,Xb -a5: MOVS/W/D/Q Yv,Xv -a6: CMPS/B Xb,Yb -a7: CMPS/W/D Xv,Yv -a8: TEST AL,Ib -a9: TEST rAX,Iz -aa: STOS/B Yb,AL -ab: STOS/W/D/Q Yv,rAX -ac: LODS/B AL,Xb -ad: LODS/W/D/Q rAX,Xv -ae: SCAS/B AL,Yb +a0: MOV AL,Ob (!REX2) +a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64) +a2: MOV Ob,AL (!REX2) +a3: MOV Ov,rAX (!REX2) +a4: MOVS/B Yb,Xb (!REX2) +a5: MOVS/W/D/Q Yv,Xv (!REX2) +a6: CMPS/B Xb,Yb (!REX2) +a7: CMPS/W/D Xv,Yv (!REX2) +a8: TEST AL,Ib (!REX2) +a9: TEST rAX,Iz (!REX2) +aa: STOS/B Yb,AL (!REX2) +ab: STOS/W/D/Q Yv,rAX (!REX2) +ac: LODS/B AL,Xb (!REX2) +ad: LODS/W/D/Q rAX,Xv (!REX2) +ae: SCAS/B AL,Yb (!REX2) # Note: The May 2011 Intel manual shows Xv for the second parameter of the # next instruction but Yv is correct -af: SCAS/W/D/Q rAX,Yv +af: SCAS/W/D/Q rAX,Yv (!REX2) # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A) d2: Grp2 Eb,CL (1A) d3: Grp2 Ev,CL (1A) d4: AAM Ib (i64) -d5: AAD Ib (i64) +d5: AAD Ib (i64) | REX2 (Prefix),(o64) d6: d7: XLAT/XLATB d8: ESC @@ -281,26 +286,26 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) -e1: LOOPE/LOOPZ Jb (f64) -e2: LOOP Jb (f64) -e3: JrCXZ Jb (f64) -e4: IN AL,Ib -e5: IN eAX,Ib -e6: OUT Ib,AL -e7: OUT Ib,eAX +e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) +e1: LOOPE/LOOPZ Jb (f64) (!REX2) +e2: LOOP Jb (f64) (!REX2) +e3: JrCXZ Jb (f64) (!REX2) +e4: IN AL,Ib (!REX2) +e5: IN eAX,Ib (!REX2) +e6: OUT Ib,AL (!REX2) +e7: OUT Ib,eAX (!REX2) # With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) -e9: JMP-near Jz (f64) -ea: JMP-far Ap (i64) -eb: JMP-short Jb (f64) -ec: IN AL,DX -ed: IN eAX,DX -ee: OUT DX,AL -ef: OUT DX,eAX +e8: CALL Jz (f64) (!REX2) +e9: JMP-near Jz (f64) (!REX2) +ea: JMP-far Ap (i64) (!REX2) +eb: JMP-short Jb (f64) (!REX2) +ec: IN AL,DX (!REX2) +ed: IN eAX,DX (!REX2) +ee: OUT DX,AL (!REX2) +ef: OUT DX,eAX (!REX2) # 0xf0 - 0xff f0: LOCK (Prefix) f1: @@ -386,14 +391,14 @@ AVXcode: 1 2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) 2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f -30: WRMSR -31: RDTSC -32: RDMSR -33: RDPMC -34: SYSENTER -35: SYSEXIT +30: WRMSR (!REX2) +31: RDTSC (!REX2) +32: RDMSR (!REX2) +33: RDPMC (!REX2) +34: SYSENTER (!REX2) +35: SYSEXIT (!REX2) 36: -37: GETSEC +37: GETSEC (!REX2) 38: escape # 3-byte escape 1 39: 3a: escape # 3-byte escape 2 @@ -473,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) -81: JNO Jz (f64) -82: JB/JC/JNAE Jz (f64) -83: JAE/JNB/JNC Jz (f64) -84: JE/JZ Jz (f64) -85: JNE/JNZ Jz (f64) -86: JBE/JNA Jz (f64) -87: JA/JNBE Jz (f64) -88: JS Jz (f64) -89: JNS Jz (f64) -8a: JP/JPE Jz (f64) -8b: JNP/JPO Jz (f64) -8c: JL/JNGE Jz (f64) -8d: JNL/JGE Jz (f64) -8e: JLE/JNG Jz (f64) -8f: JNLE/JG Jz (f64) +80: JO Jz (f64) (!REX2) +81: JNO Jz (f64) (!REX2) +82: JB/JC/JNAE Jz (f64) (!REX2) +83: JAE/JNB/JNC Jz (f64) (!REX2) +84: JE/JZ Jz (f64) (!REX2) +85: JNE/JNZ Jz (f64) (!REX2) +86: JBE/JNA Jz (f64) (!REX2) +87: JA/JNBE Jz (f64) (!REX2) +88: JS Jz (f64) (!REX2) +89: JNS Jz (f64) (!REX2) +8a: JP/JPE Jz (f64) (!REX2) +8b: JNP/JPO Jz (f64) (!REX2) +8c: JL/JNGE Jz (f64) (!REX2) +8d: JNL/JGE Jz (f64) (!REX2) +8e: JLE/JNG Jz (f64) (!REX2) +8f: JNLE/JG Jz (f64) (!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -698,17 +703,17 @@ AVXcode: 2 4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev) 4e: vrsqrt14ps/d Vpd,Wpd (66),(ev) 4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev) -50: vpdpbusd Vx,Hx,Wx (66),(ev) -51: vpdpbusds Vx,Hx,Wx (66),(ev) -52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) -53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) +50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v) +51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v) +52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) +53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) 54: vpopcntb/w Vx,Wx (66),(ev) 55: vpopcntd/q Vx,Wx (66),(ev) 58: vpbroadcastd Vx,Wx (66),(v) 59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo) 5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo) 5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev) -5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) +5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64) # Skip 0x5d 5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1) # Skip 0x5f-0x61 @@ -718,10 +723,12 @@ AVXcode: 2 65: vblendmps/d Vx,Hx,Wx (66),(ev) 66: vpblendmb/w Vx,Hx,Wx (66),(ev) 68: vp2intersectd/q Kx,Hx,Wx (F2),(ev) -# Skip 0x69-0x6f +# Skip 0x69-0x6b +6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | TCMMRLFP16PS Vt,Wt,Ht (v1),(o64) +# Skip 0x6d-0x6f 70: vpshldvw Vx,Hx,Wx (66),(ev) 71: vpshldvd/q Vx,Hx,Wx (66),(ev) -72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev) +72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev) 73: vpshrdvd/q Vx,Hx,Wx (66),(ev) 75: vpermi2b/w Vx,Hx,Wx (66),(ev) 76: vpermi2d/q Vx,Hx,Wx (66),(ev) @@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -b4: vpmadd52luq Vx,Hx,Wx (66),(ev) -b5: vpmadd52huq Vx,Hx,Wx (66),(ev) +b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v) +b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v) +b4: vpmadd52luq Vx,Hx,Wx (66) +b5: vpmadd52huq Vx,Hx,Wx (66) b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) @@ -796,15 +805,35 @@ c7: Grp19 (1A) c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev) c9: sha1msg1 Vdq,Wdq ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev) -cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) -cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) -cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) +cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v) +cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v) +cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v) cf: vgf2p8mulb Vx,Wx (66) +d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | vpdpwuud Vx,Hx,Wx (v) +d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v) +d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B) +da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v) db: VAESIMC Vdq,Wdq (66),(v1) -dc: vaesenc Vx,Hx,Wx (66) -dd: vaesenclast Vx,Hx,Wx (66) -de: vaesdec Vx,Hx,Wx (66) -df: vaesdeclast Vx,Hx,Wx (66) +dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3) +dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3) +de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3) +df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3) +e0: CMPOXADD My,Gy,By (66),(v1),(o64) +e1: CMPNOXADD My,Gy,By (66),(v1),(o64) +e2: CMPBXADD My,Gy,By (66),(v1),(o64) +e3: CMPNBXADD My,Gy,By (66),(v1),(o64) +e4: CMPZXADD My,Gy,By (66),(v1),(o64) +e5: CMPNZXADD My,Gy,By (66),(v1),(o64) +e6: CMPBEXADD My,Gy,By (66),(v1),(o64) +e7: CMPNBEXADD My,Gy,By (66),(v1),(o64) +e8: CMPSXADD My,Gy,By (66),(v1),(o64) +e9: CMPNSXADD My,Gy,By (66),(v1),(o64) +ea: CMPPXADD My,Gy,By (66),(v1),(o64) +eb: CMPNPXADD My,Gy,By (66),(v1),(o64) +ec: CMPLXADD My,Gy,By (66),(v1),(o64) +ed: CMPNLXADD My,Gy,By (66),(v1),(o64) +ee: CMPLEXADD My,Gy,By (66),(v1),(o64) +ef: CMPNLEXADD My,Gy,By (66),(v1),(o64) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) @@ -812,8 +841,11 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) -f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B) f9: MOVDIRI My,Gy +fa: ENCODEKEY128 Ew,Ew (F3) +fb: ENCODEKEY256 Ew,Ew (F3) +fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev) cc: sha1rnds4 Vdq,Wdq,Ib ce: vgf2p8affineqb Vx,Wx,Ib (66) cf: vgf2p8affineinvqb Vx,Wx,Ib (66) +de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1) df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B) EndTable +Table: EVEX map 4 +Referrer: +AVXcode: 4 +00: ADD Eb,Gb (ev) +01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es) +02: ADD Gb,Eb (ev) +03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es) +08: OR Eb,Gb (ev) +09: OR Ev,Gv (es) | OR Ev,Gv (66),(es) +0a: OR Gb,Eb (ev) +0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es) +10: ADC Eb,Gb (ev) +11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es) +12: ADC Gb,Eb (ev) +13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es) +18: SBB Eb,Gb (ev) +19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es) +1a: SBB Gb,Eb (ev) +1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es) +20: AND Eb,Gb (ev) +21: AND Ev,Gv (es) | AND Ev,Gv (66),(es) +22: AND Gb,Eb (ev) +23: AND Gv,Ev (es) | AND Gv,Ev (66),(es) +24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es) +28: SUB Eb,Gb (ev) +29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es) +2a: SUB Gb,Eb (ev) +2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es) +2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es) +30: XOR Eb,Gb (ev) +31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es) +32: XOR Gb,Eb (ev) +33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es) +# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE, +# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ +38: CCMPSCC Eb,Gb (ev) +39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es) +3a: CCMPSCC Gv,Ev (ev) +3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es) +40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev) +41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev) +42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev) +43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev) +44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev) +45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev) +46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev) +47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev) +48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev) +49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev) +4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev) +4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev) +4c: CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev) +4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev) +4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev) +4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev) +60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es) +61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es) +65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev) +66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev) +69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es) +6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es) +80: Grp1 Eb,Ib (1A),(ev) +81: Grp1 Ev,Iz (1A),(es) +83: Grp1 Ev,Ib (1A),(es) +# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, +# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ +84: CTESTSCC (ev) +85: CTESTSCC (es) | CTESTSCC (66),(es) +88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) +8f: POP2 Bq,Rq (000),(11B),(ev) +a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) +ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es) +af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es) +c0: Grp2 Eb,Ib (1A),(ev) +c1: Grp2 Ev,Ib (1A),(es) +d0: Grp2 Eb,1 (1A),(ev) +d1: Grp2 Ev,1 (1A),(es) +d2: Grp2 Eb,CL (1A),(ev) +d3: Grp2 Ev,CL (1A),(es) +f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev) +f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev) +f2: INVPCID Gy,Mdq (F3),(ev) +f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es) +f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es) +f6: Grp3_1 Eb (1A),(ev) +f7: Grp3_2 Ev (1A),(es) +f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev) +f9: MOVDIRI My,Gy (ev) +fe: Grp4 (1A),(ev) +ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev) +EndTable + Table: EVEX map 5 Referrer: AVXcode: 5 @@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev) d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev) EndTable +Table: VEX map 7 +Referrer: +AVXcode: 7 +f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1051,7 +1182,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms @@ -1137,6 +1268,8 @@ GrpTable: Grp16 1: prefetch T0 2: prefetch T1 3: prefetch T2 +6: prefetch IT1 +7: prefetch IT0 EndTable GrpTable: Grp17 diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index af38469afd14..5770c8097f32 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -64,7 +64,9 @@ BEGIN { modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" - rex_expr = "^REX(\\.[XRWB]+)*" + rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))" + rex2_expr = "\\(REX2\\)" + no_rex2_expr = "\\(!REX2\\)" fpu_expr = "^ESC" # TODO lprefix1_expr = "\\((66|!F3)\\)" @@ -81,6 +83,8 @@ BEGIN { vexonly_expr = "\\(v\\)" # All opcodes with (ev) superscript supports *only* EVEX prefix evexonly_expr = "\\(ev\\)" + # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size + evex_scalable_expr = "\\(es\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -99,6 +103,7 @@ BEGIN { prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" + prefix_num["REX2"] = "INAT_PFX_REX2" clear_vars() } @@ -314,6 +319,10 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, force64_expr)) flags = add_flags(flags, "INAT_FORCE64") + # check REX2 not allowed + if (match(ext, no_rex2_expr)) + flags = add_flags(flags, "INAT_NO_REX2") + # check REX prefix if (match(opcode, rex_expr)) flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") @@ -325,6 +334,8 @@ function convert_operands(count,opnd, i,j,imm,mod) # check VEX codes if (match(ext, evexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY") + else if (match(ext, evex_scalable_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY | INAT_EVEX_SCALABLE") else if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) @@ -351,6 +362,8 @@ function convert_operands(count,opnd, i,j,imm,mod) lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" } + if (match(ext, rex2_expr)) + table[idx] = add_flags(table[idx], "INAT_REX2_VARIANT") if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index ac8487dcff1d..4315652678b9 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -31,9 +31,9 @@ see_also = $(subst " ",, \ "\n" \ "SEE ALSO\n" \ "========\n" \ - "\t**bpf**\ (2),\n" \ - "\t**bpf-helpers**\\ (7)" \ - $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "**bpf**\ (2),\n" \ + "**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ (8)") \ "\n") $(OUTPUT)%.8: %.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 342716f74ec4..eaba24320fb2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -14,82 +14,76 @@ tool for inspection of BTF data SYNOPSIS ======== - **bpftool** [*OPTIONS*] **btf** *COMMAND* +**bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } - *COMMANDS* := { **dump** | **help** } +*COMMANDS* := { **dump** | **help** } BTF COMMANDS ============= -| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] -| **bpftool** **btf help** +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** | -| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } DESCRIPTION =========== - **bpftool btf { show | list }** [**id** *BTF_ID*] - Show information about loaded BTF objects. If a BTF ID is - specified, show information only about given BTF object, - otherwise list all BTF objects currently loaded on the - system. +bpftool btf { show | list } [id *BTF_ID*] + Show information about loaded BTF objects. If a BTF ID is specified, show + information only about given BTF object, otherwise list all BTF objects + currently loaded on the system. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BTF - objects. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BTF objects. On such kernels + bpftool will automatically emit this information as well. - **bpftool btf dump** *BTF_SRC* - Dump BTF entries from a given *BTF_SRC*. +bpftool btf dump *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. - When **id** is specified, BTF object with that ID will be - loaded and all its BTF types emitted. + When **id** is specified, BTF object with that ID will be loaded and all + its BTF types emitted. - When **map** is provided, it's expected that map has - associated BTF object with BTF types describing key and - value. It's possible to select whether to dump only BTF - type(s) associated with key (**key**), value (**value**), - both key and value (**kv**), or all BTF types present in - associated BTF object (**all**). If not specified, **kv** - is assumed. + When **map** is provided, it's expected that map has associated BTF object + with BTF types describing key and value. It's possible to select whether to + dump only BTF type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in associated BTF + object (**all**). If not specified, **kv** is assumed. - When **prog** is provided, it's expected that program has - associated BTF object with BTF types. + When **prog** is provided, it's expected that program has associated BTF + object with BTF types. - When specifying *FILE*, an ELF file is expected, containing - .BTF section with well-defined BTF binary format data, - typically produced by clang or pahole. + When specifying *FILE*, an ELF file is expected, containing .BTF section + with well-defined BTF binary format data, typically produced by clang or + pahole. - **format** option can be used to override default (raw) - output format. Raw (**raw**) or C-syntax (**c**) output - formats are supported. + **format** option can be used to override default (raw) output format. Raw + (**raw**) or C-syntax (**c**) output formats are supported. - **bpftool btf help** - Print short help message. +bpftool btf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -B, --base-btf *FILE* - Pass a base BTF object. Base BTF objects are typically used - with BTF objects for kernel modules. To avoid duplicating - all kernel symbols required by modules, BTF objects for - modules are "split", they are built incrementally on top of - the kernel (vmlinux) BTF object. So the base BTF reference - should usually point to the kernel BTF. - - When the main BTF object to process (for example, the - module BTF to dump) is passed as a *FILE*, bpftool attempts - to autodetect the path for the base object, and passing - this option is optional. When the main BTF object is passed - through other handles, this option becomes necessary. +.. include:: common_options.rst + +-B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used with BTF + objects for kernel modules. To avoid duplicating all kernel symbols + required by modules, BTF objects for modules are "split", they are + built incrementally on top of the kernel (vmlinux) BTF object. So the + base BTF reference should usually point to the kernel BTF. + + When the main BTF object to process (for example, the module BTF to + dump) is passed as a *FILE*, bpftool attempts to autodetect the path + for the base object, and passing this option is optional. When the main + BTF object is passed through other handles, this option becomes + necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2ce900f66d6e..e8185596a759 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -14,134 +14,125 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **cgroup** *COMMAND* +**bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } - *COMMANDS* := - { **show** | **list** | **tree** | **attach** | **detach** | **help** } +*COMMANDS* := +{ **show** | **list** | **tree** | **attach** | **detach** | **help** } CGROUP COMMANDS =============== -| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] -| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] -| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] -| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* -| **bpftool** **cgroup help** +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | -| **cgroup_inet_sock_create** | **cgroup_sock_ops** | -| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | -| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | -| **cgroup_inet4_connect** | **cgroup_inet6_connect** | -| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | -| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | -| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | -| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | -| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | -| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | -| **cgroup_unix_recvmsg** | **cgroup_sysctl** | -| **cgroup_getsockopt** | **cgroup_setsockopt** | -| **cgroup_inet_sock_release** } -| *ATTACH_FLAGS* := { **multi** | **override** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | +| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | +| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_unix_recvmsg** | **cgroup_sysctl** | +| **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== - **bpftool cgroup { show | list }** *CGROUP* [**effective**] - List all programs attached to the cgroup *CGROUP*. - - Output will start with program ID followed by attach type, - attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] - Iterate over all cgroups in *CGROUP_ROOT* and list all - attached programs. If *CGROUP_ROOT* is not specified, - bpftool uses cgroup v2 mountpoint. - - The output is similar to the output of cgroup show/list - commands: it starts with absolute cgroup path, followed by - program ID, attach type, attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] - Attach program *PROG* to the cgroup *CGROUP* with attach type - *ATTACH_TYPE* and optional *ATTACH_FLAGS*. - - *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs - some bpf program, the program in this cgroup yields to sub-cgroup - program; **multi** if a sub-cgroup installs some bpf program, - that cgroup program gets run in addition to the program in this - cgroup. - - Only one program is allowed to be attached to a cgroup with - no attach flags or the **override** flag. Attaching another - program will release old program and attach the new one. - - Multiple programs are allowed to be attached to a cgroup with - **multi**. They are executed in FIFO order (those that were - attached first, run first). - - Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 - and later. - - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for - an unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). - - **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* - Detach *PROG* from the cgroup *CGROUP* and attach type - *ATTACH_TYPE*. - - **bpftool prog help** - Print short help message. +bpftool cgroup { show | list } *CGROUP* [effective] + List all programs attached to the cgroup *CGROUP*. + + Output will start with program ID followed by attach type, attach flags and + program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup tree [*CGROUP_ROOT*] [effective] + Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. + If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. + + The output is similar to the output of cgroup show/list commands: it starts + with absolute cgroup path, followed by program ID, attach type, attach + flags and program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* + and optional *ATTACH_FLAGS*. + + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some + bpf program, the program in this cgroup yields to sub-cgroup program; + **multi** if a sub-cgroup installs some bpf program, that cgroup program + gets run in addition to the program in this cgroup. + + Only one program is allowed to be attached to a cgroup with no attach flags + or the **override** flag. Attaching another program will release old + program and attach the new one. + + Multiple programs are allowed to be attached to a cgroup with **multi**. + They are executed in FIFO order (those that were attached first, run + first). + + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. + + *ATTACH_TYPE* can be one of: + + - **ingress** ingress path of the inet socket (since 4.10) + - **egress** egress path of the inet socket (since 4.10) + - **sock_create** opening of an inet socket (since 4.10) + - **sock_ops** various socket operations (since 4.12) + - **device** device access (since 4.15) + - **bind4** call to bind(2) for an inet4 socket (since 4.17) + - **bind6** call to bind(2) for an inet6 socket (since 4.17) + - **post_bind4** return from bind(2) for an inet4 socket (since 4.17) + - **post_bind6** return from bind(2) for an inet6 socket (since 4.17) + - **connect4** call to connect(2) for an inet4 socket (since 4.17) + - **connect6** call to connect(2) for an inet6 socket (since 4.17) + - **connect_unix** call to connect(2) for a unix socket (since 6.7) + - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18) + - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18) + - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7) + - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2) + - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2) + - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7) + - **sysctl** sysctl access (since 5.2) + - **getsockopt** call to getsockopt (since 5.3) + - **setsockopt** call to setsockopt (since 5.3) + - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8) + - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8) + - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7) + - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8) + - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8) + - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7) + - **sock_release** closing a userspace inet socket (since 5.9) + +bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. + +bpftool prog help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned programs. +-f, --bpffs + Show file names of pinned programs. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index e44039f89be7..c7f837898bc7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device SYNOPSIS ======== - **bpftool** [*OPTIONS*] **feature** *COMMAND* +**bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **probe** | **help** } +*COMMANDS* := { **probe** | **help** } FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list_builtins** *GROUP* -| **bpftool** **feature help** +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** | -| *COMPONENT* := { **kernel** | **dev** *NAME* } -| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] - Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf**\ () system call, - JIT status, eBPF program types availability, eBPF helper - functions availability, and more. - - By default, bpftool **does not run probes** for - **bpf_probe_write_user**\ () and **bpf_trace_printk**\() - helpers which print warnings to kernel logs. To enable them - and run all probes, the **full** keyword should be used. - - If the **macros** keyword (but not the **-j** option) is - passed, a subset of the output is dumped as a list of - **#define** macros that are ready to be included in a C - header file, for example. If, additionally, **prefix** is - used to define a *PREFIX*, the provided string will be used - as a prefix to the names of the macros: this can be used to - avoid conflicts on macro names when including the output of - this command as a header file. - - Keyword **kernel** can be omitted. If no probe target is - specified, probing the kernel is the default behaviour. - - When the **unprivileged** keyword is used, bpftool will dump - only the features available to a user who does not have the - **CAP_SYS_ADMIN** capability set. The features available in - that case usually represent a small subset of the parameters - supported by the system. Unprivileged users MUST use the - **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. This - keyword is unavailable if bpftool was compiled without - libcap. - - **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] - Probe network device for supported eBPF features and dump - results to the console. - - The keywords **full**, **macros** and **prefix** have the - same role as when probing the kernel. - - **bpftool feature list_builtins** *GROUP* - List items known to bpftool. These can be BPF program types - (**prog_types**), BPF map types (**map_types**), attach types - (**attach_types**), link types (**link_types**), or BPF helper - functions (**helpers**). The command does not probe the system, but - simply lists the elements that bpftool knows from compilation time, - as provided from libbpf (for all object types) or from the BPF UAPI - header (list of helpers). This can be used in scripts to iterate over - BPF types or helpers. - - **bpftool feature help** - Print short help message. +bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related parameters, such + as availability of the **bpf**\ () system call, JIT status, eBPF program + types availability, eBPF helper functions availability, and more. + + By default, bpftool **does not run probes** for **bpf_probe_write_user**\ + () and **bpf_trace_printk**\() helpers which print warnings to kernel logs. + To enable them and run all probes, the **full** keyword should be used. + + If the **macros** keyword (but not the **-j** option) is passed, a subset + of the output is dumped as a list of **#define** macros that are ready to + be included in a C header file, for example. If, additionally, **prefix** + is used to define a *PREFIX*, the provided string will be used as a prefix + to the names of the macros: this can be used to avoid conflicts on macro + names when including the output of this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is specified, probing + the kernel is the default behaviour. + + When the **unprivileged** keyword is used, bpftool will dump only the + features available to a user who does not have the **CAP_SYS_ADMIN** + capability set. The features available in that case usually represent a + small subset of the parameters supported by the system. Unprivileged users + MUST use the **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This keyword is + unavailable if bpftool was compiled without libcap. + +bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]] + Probe network device for supported eBPF features and dump results to the + console. + + The keywords **full**, **macros** and **prefix** have the same role as when + probing the kernel. + +bpftool feature list_builtins *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper functions + (**helpers**). The command does not probe the system, but simply lists the + elements that bpftool knows from compilation time, as provided from libbpf + (for all object types) or from the BPF UAPI header (list of helpers). This + can be used in scripts to iterate over BPF types or helpers. + +bpftool feature help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5e60825818dd..c768e6d4ae09 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,199 +14,177 @@ tool for BPF code-generation SYNOPSIS ======== - **bpftool** [*OPTIONS*] **gen** *COMMAND* +**bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } - *COMMAND* := { **object** | **skeleton** | **help** } +*COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] -| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] -| **bpftool** **gen help** +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** DESCRIPTION =========== - **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] - Statically link (combine) together one or more *INPUT_FILE*'s - into a single resulting *OUTPUT_FILE*. All the files involved - are BPF ELF object files. - - The rules of BPF static linking are mostly the same as for - user-space object files, but in addition to combining data - and instruction sections, .BTF and .BTF.ext (if present in - any of the input files) data are combined together. .BTF - data is deduplicated, so all the common types across - *INPUT_FILE*'s will only be represented once in the resulting - BTF information. - - BPF static linking allows to partition BPF source code into - individually compiled files that are then linked into - a single resulting BPF object file, which can be used to - generated BPF skeleton (with **gen skeleton** command) or - passed directly into **libbpf** (using **bpf_object__open()** - family of APIs). - - **bpftool gen skeleton** *FILE* - Generate BPF skeleton C header file for a given *FILE*. - - BPF skeleton is an alternative interface to existing libbpf - APIs for working with BPF objects. Skeleton code is intended - to significantly shorten and simplify code to load and work - with BPF programs from userspace side. Generated code is - tailored to specific input BPF object *FILE*, reflecting its - structure by listing out available maps, program, variables, - etc. Skeleton eliminates the need to lookup mentioned - components by name. Instead, if skeleton instantiation - succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., **struct bpf_map** pointer) and can be - passed to existing generic libbpf APIs. - - In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (**struct - bpf_link**) for each BPF program within BPF object. When - requested, supported BPF programs will be automatically - attached and resulting BPF links stored for further use by - user in pre-allocated fields in skeleton struct. For BPF - programs that can't be automatically attached by libbpf, - user can attach them manually, but store resulting BPF link - in per-program link field. All such set up links will be - automatically destroyed on BPF skeleton destruction. This - eliminates the need for users to manage links manually and - rely on libbpf support to detach programs and free up - resources. - - Another facility provided by BPF skeleton is an interface to - global variables of all supported kinds: mutable, read-only, - as well as extern ones. This interface allows to pre-setup - initial values of variables before BPF object is loaded and - verified by kernel. For non-read-only variables, the same - interface can be used to fetch values of global variables on - userspace side, even if they are modified by BPF code. - - During skeleton generation, contents of source BPF object - *FILE* is embedded within generated code and is thus not - necessary to keep around. This ensures skeleton and BPF - object file are matching 1-to-1 and always stay in sync. - Generated code is dual-licensed under LGPL-2.1 and - BSD-2-Clause licenses. - - It is a design goal and guarantee that skeleton interfaces - are interoperable with generic libbpf APIs. User should - always be able to use skeleton API to create and load BPF - object, and later use libbpf APIs to keep working with - specific maps, programs, etc. - - As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name. Object name can - either be derived from object file name, i.e., if BPF object - file name is **example.o**, BPF object name will be - **example**. Object name can be also specified explicitly - through **name** *OBJECT_NAME* parameter. The following - custom functions are provided (assuming **example** as - the object name): - - - **example__open** and **example__open_opts**. - These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open**\ () API. - **_opts** variants accepts extra **bpf_object_open_opts** - options. - - - **example__load**. - This function creates maps, loads and verifies BPF - programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load**\ () API. - - - **example__open_and_load** combines **example__open** and - **example__load** invocations in one commonly used - operation. - - - **example__attach** and **example__detach** - This pair of functions allow to attach and detach, - correspondingly, already loaded BPF object. Only BPF - programs of types supported by libbpf for auto-attachment - will be auto-attached and their corresponding BPF links - instantiated. For other BPF programs, user can manually - create a BPF link and assign it to corresponding fields in - skeleton struct. **example__detach** will detach both - links created automatically, as well as those populated by - user manually. - - - **example__destroy** - Detach and unload BPF programs, free up all the resources - used by skeleton and BPF object. - - If BPF object has global variables, corresponding structs - with memory layout corresponding to global data data section - layout will be created. Currently supported ones are: *.data*, - *.bss*, *.rodata*, and *.kconfig* structs/data sections. - These data sections/structs can be used to set up initial - values of variables, if set before **example__load**. - Afterwards, if target kernel supports memory-mapped BPF - arrays, same structs can be used to fetch and update - (non-read-only) data from userspace, with same simplicity - as for BPF side. - - **bpftool gen subskeleton** *FILE* - Generate BPF subskeleton C header file for a given *FILE*. - - Subskeletons are similar to skeletons, except they do not own - the corresponding maps, programs, or global variables. They - require that the object file used to generate them is already - loaded into a *bpf_object* by some other means. - - This functionality is useful when a library is included into a - larger BPF program. A subskeleton for the library would have - access to all objects and globals defined in it, without - having to know about the larger program. - - Consequently, there are only two functions defined - for subskeletons: - - - **example__open(bpf_object\*)** - Instantiates a subskeleton from an already opened (but not - necessarily loaded) **bpf_object**. - - - **example__destroy()** - Frees the storage for the subskeleton but *does not* unload - any BPF programs or maps. - - **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] - Generate a minimum BTF file as *OUTPUT*, derived from a given - *INPUT* BTF file, containing all needed BTF types so one, or - more, given eBPF objects CO-RE relocations may be satisfied. - - When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, - libbpf, when loading an eBPF object, has to rely on external - BTF files to be able to calculate CO-RE relocations. - - Usually, an external BTF file is built from existing kernel - DWARF data using pahole. It contains all the types used by - its respective kernel image and, because of that, is big. - - The min_core_btf feature builds smaller BTF files, customized - to one or multiple eBPF objects, so they can be distributed - together with an eBPF CO-RE based application, turning the - application portable to different kernel versions. - - Check examples bellow for more information how to use it. - - **bpftool gen help** - Print short help message. +bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s into a single + resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for user-space + object files, but in addition to combining data and instruction sections, + .BTF and .BTF.ext (if present in any of the input files) data are combined + together. .BTF data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting BTF + information. + + BPF static linking allows to partition BPF source code into individually + compiled files that are then linked into a single resulting BPF object + file, which can be used to generated BPF skeleton (with **gen skeleton** + command) or passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + +bpftool gen skeleton *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf APIs for + working with BPF objects. Skeleton code is intended to significantly + shorten and simplify code to load and work with BPF programs from userspace + side. Generated code is tailored to specific input BPF object *FILE*, + reflecting its structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned components by name. + Instead, if skeleton instantiation succeeds, they are populated in skeleton + structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can + be passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and programs, skeleton + provides a storage for BPF links (**struct bpf_link**) for each BPF program + within BPF object. When requested, supported BPF programs will be + automatically attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF programs that + can't be automatically attached by libbpf, user can attach them manually, + but store resulting BPF link in per-program link field. All such set up + links will be automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and rely on libbpf + support to detach programs and free up resources. + + Another facility provided by BPF skeleton is an interface to global + variables of all supported kinds: mutable, read-only, as well as extern + ones. This interface allows to pre-setup initial values of variables before + BPF object is loaded and verified by kernel. For non-read-only variables, + the same interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object *FILE* is + embedded within generated code and is thus not necessary to keep around. + This ensures skeleton and BPF object file are matching 1-to-1 and always + stay in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces are + interoperable with generic libbpf APIs. User should always be able to use + skeleton API to create and load BPF object, and later use libbpf APIs to + keep working with specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. Each of them is + prefixed with object name. Object name can either be derived from object + file name, i.e., if BPF object file name is **example.o**, BPF object name + will be **example**. Object name can be also specified explicitly through + **name** *OBJECT_NAME* parameter. The following custom functions are + provided (assuming **example** as the object name): + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It corresponds to + libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra + **bpf_object_open_opts** options. + + - **example__load**. + This function creates maps, loads and verifies BPF programs, initializes + global data maps. It corresponds to libppf's **bpf_object__load**\ () + API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used operation. + + - **example__attach** and **example__detach**. + This pair of functions allow to attach and detach, correspondingly, + already loaded BPF object. Only BPF programs of types supported by libbpf + for auto-attachment will be auto-attached and their corresponding BPF + links instantiated. For other BPF programs, user can manually create a + BPF link and assign it to corresponding fields in skeleton struct. + **example__detach** will detach both links created automatically, as well + as those populated by user manually. + + - **example__destroy**. + Detach and unload BPF programs, free up all the resources used by + skeleton and BPF object. + + If BPF object has global variables, corresponding structs with memory + layout corresponding to global data data section layout will be created. + Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig* + structs/data sections. These data sections/structs can be used to set up + initial values of variables, if set before **example__load**. Afterwards, + if target kernel supports memory-mapped BPF arrays, same structs can be + used to fetch and update (non-read-only) data from userspace, with same + simplicity as for BPF side. + +bpftool gen subskeleton *FILE* + Generate BPF subskeleton C header file for a given *FILE*. + + Subskeletons are similar to skeletons, except they do not own the + corresponding maps, programs, or global variables. They require that the + object file used to generate them is already loaded into a *bpf_object* by + some other means. + + This functionality is useful when a library is included into a larger BPF + program. A subskeleton for the library would have access to all objects and + globals defined in it, without having to know about the larger program. + + Consequently, there are only two functions defined for subskeletons: + + - **example__open(bpf_object\*)**. + Instantiates a subskeleton from an already opened (but not necessarily + loaded) **bpf_object**. + + - **example__destroy()**. + Frees the storage for the subskeleton but *does not* unload any BPF + programs or maps. + +bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] + Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF + file, containing all needed BTF types so one, or more, given eBPF objects + CO-RE relocations may be satisfied. + + When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf, when + loading an eBPF object, has to rely on external BTF files to be able to + calculate CO-RE relocations. + + Usually, an external BTF file is built from existing kernel DWARF data + using pahole. It contains all the types used by its respective kernel image + and, because of that, is big. + + The min_core_btf feature builds smaller BTF files, customized to one or + multiple eBPF objects, so they can be distributed together with an eBPF + CO-RE based application, turning the application portable to different + kernel versions. + + Check examples bellow for more information how to use it. + +bpftool gen help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -L, --use-loader - For skeletons, generate a "light" skeleton (also known as "loader" - skeleton). A light skeleton contains a loader eBPF program. It does - not use the majority of the libbpf infrastructure, and does not need - libelf. +-L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does not use + the majority of the libbpf infrastructure, and does not need libelf. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 84839d488621..2e5d81c906dc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -14,50 +14,46 @@ tool to create BPF iterators SYNOPSIS ======== - **bpftool** [*OPTIONS*] **iter** *COMMAND* +**bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **pin** | **help** } +*COMMANDS* := { **pin** | **help** } ITER COMMANDS -=================== +============= -| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] -| **bpftool** **iter help** +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** | -| *OBJ* := /a/file/of/bpf_iter_target.o -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] - A bpf iterator combines a kernel iterating of - particular kernel data (e.g., tasks, bpf_maps, etc.) - and a bpf program called for each kernel data object - (e.g., one task, one bpf_map, etc.). User space can - *read* kernel iterator output through *read()* syscall. - - The *pin* command creates a bpf iterator from *OBJ*, - and pin it to *PATH*. The *PATH* should be located - in *bpffs* mount. It must not contain a dot - character ('.'), which is reserved for future extensions - of *bpffs*. - - Map element bpf iterator requires an additional parameter - *MAP* so bpf program can iterate over map elements for - that map. User can have a bpf program in kernel to run - with each map element, do checking, filtering, aggregation, - etc. without copying data to user space. - - User can then *cat PATH* to see the bpf iterator output. - - **bpftool iter help** - Print short help message. +bpftool iter pin *OBJ* *PATH* [map *MAP*] + A bpf iterator combines a kernel iterating of particular kernel data (e.g., + tasks, bpf_maps, etc.) and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator + output through *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ*, and pin it to *PATH*. + The *PATH* should be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + + Map element bpf iterator requires an additional parameter *MAP* so bpf + program can iterate over map elements for that map. User can have a bpf + program in kernel to run with each map element, do checking, filtering, + aggregation, etc. without copying data to user space. + + User can then *cat PATH* to see the bpf iterator output. + +bpftool iter help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 52a4eee4af54..6f09d4405ed8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links SYNOPSIS ======== - **bpftool** [*OPTIONS*] **link** *COMMAND* +**bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := { **show** | **list** | **pin** | **help** } +*COMMANDS* := { **show** | **list** | **pin** | **help** } LINK COMMANDS ============= -| **bpftool** **link { show | list }** [*LINK*] -| **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach** *LINK* -| **bpftool** **link help** +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** | -| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool link { show | list }** [*LINK*] - Show information about active links. If *LINK* is - specified show information only about given link, - otherwise list all links currently active on the system. +bpftool link { show | list } [*LINK*] + Show information about active links. If *LINK* is specified show + information only about given link, otherwise list all links currently + active on the system. - Output will start with link ID followed by link type and - zero or more named attributes, some of which depend on type - of link. + Output will start with link ID followed by link type and zero or more named + attributes, some of which depend on type of link. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - links. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF links. On such kernels + bpftool will automatically emit this information as well. - **bpftool link pin** *LINK* *FILE* - Pin link *LINK* as *FILE*. +bpftool link pin *LINK* *FILE* + Pin link *LINK* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool link detach** *LINK* - Force-detach link *LINK*. BPF link and its underlying BPF - program will stay valid, but they will be detached from the - respective BPF hook and BPF link will transition into - a defunct state until last open file descriptor for that - link is closed. +bpftool link detach *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF program will stay + valid, but they will be detached from the respective BPF hook and BPF link + will transition into a defunct state until last open file descriptor for + that link is closed. - **bpftool link help** - Print short help message. +bpftool link help + Print short help message. OPTIONS ======= - .. include:: common_options.rst + .. include:: common_options.rst - -f, --bpffs - When showing BPF links, show file names of pinned - links. + -f, --bpffs + When showing BPF links, show file names of pinned links. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. + -n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 9d6a314dfd7a..252e4c538edb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] **map** *COMMAND* +**bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **help** } +*COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **help** } MAP COMMANDS ============= -| **bpftool** **map** { **show** | **list** } [*MAP*] -| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ -| [**offload_dev** *NAME*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* [**key** *DATA*] -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* -| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] -| **bpftool** **map peek** *MAP* -| **bpftool** **map push** *MAP* **value** *VALUE* -| **bpftool** **map pop** *MAP* -| **bpftool** **map enqueue** *MAP* **value** *VALUE* -| **bpftool** **map dequeue** *MAP* -| **bpftool** **map freeze** *MAP* -| **bpftool** **map help** +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } -| *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *VALUE* := { *DATA* | *MAP* | *PROG* } -| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } -| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** -| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** -| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** -| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== - **bpftool map { show | list }** [*MAP*] - Show information about loaded maps. If *MAP* is specified - show information only about given maps, otherwise list all - maps currently loaded on the system. In case of **name**, - *MAP* may match several maps which will all be shown. +bpftool map { show | list } [*MAP*] + Show information about loaded maps. If *MAP* is specified show information + only about given maps, otherwise list all maps currently loaded on the + system. In case of **name**, *MAP* may match several maps which will all + be shown. - Output will start with map ID followed by map type and - zero or more named attributes (depending on kernel version). + Output will start with map ID followed by map type and zero or more named + attributes (depending on kernel version). - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - maps. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF maps. On such kernels + bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] - Create a new map with given parameters and pin it to *bpffs* - as *FILE*. +bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE* entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*] + Create a new map with given parameters and pin it to *bpffs* as *FILE*. - *FLAGS* should be an integer which is the combination of - desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h - UAPI header for existing flags). + *FLAGS* should be an integer which is the combination of desired flags, + e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing + flags). - To create maps of type array-of-maps or hash-of-maps, the - **inner_map** keyword must be used to pass an inner map. The - kernel needs it to collect metadata related to the inner maps - that the new map will work with. + To create maps of type array-of-maps or hash-of-maps, the **inner_map** + keyword must be used to pass an inner map. The kernel needs it to collect + metadata related to the inner maps that the new map will work with. - Keyword **offload_dev** expects a network interface name, - and is used to request hardware offload for the map. + Keyword **offload_dev** expects a network interface name, and is used to + request hardware offload for the map. - **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. In case of **name**, - *MAP* may match several maps which will all be dumped. +bpftool map dump *MAP* + Dump all entries in a given *MAP*. In case of **name**, *MAP* may match + several maps which will all be dumped. - **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] - Update map entry for a given *KEY*. +bpftool map update *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. - *UPDATE_FLAGS* can be one of: **any** update existing entry - or add if doesn't exit; **exist** update only if entry already - exists; **noexist** update only if entry doesn't exist. + *UPDATE_FLAGS* can be one of: **any** update existing entry or add if + doesn't exit; **exist** update only if entry already exists; **noexist** + update only if entry doesn't exist. - If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadecimal values, even if - no "0x" prefix is added. If the keyword is not provided, then - the bytes are parsed as decimal values, unless a "0x" prefix - (for hexadecimal) or a "0" prefix (for octal) is provided. + If the **hex** keyword is provided in front of the bytes sequence, the + bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If + the keyword is not provided, then the bytes are parsed as decimal values, + unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is + provided. - **bpftool map lookup** *MAP* [**key** *DATA*] - Lookup **key** in the map. +bpftool map lookup *MAP* [key *DATA*] + Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** *DATA*] - Get next key. If *key* is not specified, get first key. +bpftool map getnext *MAP* [key *DATA*] + Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** *DATA* - Remove entry from the map. +bpftool map delete *MAP* key *DATA* + Remove entry from the map. - **bpftool map pin** *MAP* *FILE* - Pin map *MAP* as *FILE*. +bpftool map pin *MAP* *FILE* + Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. +bpftool map event_pipe *MAP* [cpu *N* index *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. - Install perf rings into a perf event array map and dump - output of any **bpf_perf_event_output**\ () call in the kernel. - By default read the number of CPUs on the system and - install perf ring for each CPU in the corresponding index - in the array. + Install perf rings into a perf event array map and dump output of any + **bpf_perf_event_output**\ () call in the kernel. By default read the + number of CPUs on the system and install perf ring for each CPU in the + corresponding index in the array. - If **cpu** and **index** are specified, install perf ring - for given **cpu** at **index** in the array (single ring). + If **cpu** and **index** are specified, install perf ring for given **cpu** + at **index** in the array (single ring). - Note that installing a perf ring into an array will silently - replace any existing ring. Any other application will stop - receiving events if it installed its rings earlier. + Note that installing a perf ring into an array will silently replace any + existing ring. Any other application will stop receiving events if it + installed its rings earlier. - **bpftool map peek** *MAP* - Peek next value in the queue or stack. +bpftool map peek *MAP* + Peek next value in the queue or stack. - **bpftool map push** *MAP* **value** *VALUE* - Push *VALUE* onto the stack. +bpftool map push *MAP* value *VALUE* + Push *VALUE* onto the stack. - **bpftool map pop** *MAP* - Pop and print value from the stack. +bpftool map pop *MAP* + Pop and print value from the stack. - **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue *VALUE* into the queue. +bpftool map enqueue *MAP* value *VALUE* + Enqueue *VALUE* into the queue. - **bpftool map dequeue** *MAP* - Dequeue and print value from the queue. +bpftool map dequeue *MAP* + Dequeue and print value from the queue. - **bpftool map freeze** *MAP* - Freeze the map as read-only from user space. Entries from a - frozen map can not longer be updated or deleted with the - **bpf**\ () system call. This operation is not reversible, - and the map remains immutable from user space until its - destruction. However, read and write permissions for BPF - programs to the map remain unchanged. +bpftool map freeze *MAP* + Freeze the map as read-only from user space. Entries from a frozen map can + not longer be updated or deleted with the **bpf**\ () system call. This + operation is not reversible, and the map remains immutable from user space + until its destruction. However, read and write permissions for BPF programs + to the map remain unchanged. - **bpftool map help** - Print short help message. +bpftool map help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned maps. +-f, --bpffs + Show file names of pinned maps. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index dd3f9469765b..348812881297 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **net** *COMMAND* +**bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **attach** | **detach** | **help** } +*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } NET COMMANDS ============ -| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] -| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* -| **bpftool** **net help** +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION =========== - **bpftool net { show | list }** [ **dev** *NAME* ] - List bpf program attachments in the kernel networking subsystem. - - Currently, device driver xdp attachments, tcx, netkit and old-style tc - classifier/action attachments, flow_dissector as well as netfilter - attachments are implemented, i.e., for - program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**, - **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**, - **BPF_PROG_TYPE_NETFILTER**. - - For programs attached to a particular cgroup, e.g., - **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, - **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, - users can use **bpftool cgroup** to dump cgroup attachments. - For sk_{filter, skb, msg, reuseport} and lwt/seg6 - bpf programs, users should consult other tools, e.g., iproute2. - - The current output will start with all xdp program attachments, followed by - all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector - and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are - ordered based on ifindex number. If multiple bpf programs attached - to the same networking device through **tc**, the order will be first - all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs - attached to non clsact qdiscs, and finally all bpf programs attached - to root and clsact qdisc. - - **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] - Attach bpf program *PROG* to network interface *NAME* with - type specified by *ATTACH_TYPE*. Previously attached bpf program - can be replaced by the command used with **overwrite** option. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - *ATTACH_TYPE* can be of: - **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; - **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; - **xdpdrv** - Native XDP. runs earliest point in driver's receive path; - **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; - - **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* - Detach bpf program attached to network interface *NAME* with - type specified by *ATTACH_TYPE*. To detach bpf program, same - *ATTACH_TYPE* previously used for attach must be specified. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - **bpftool net help** - Print short help message. +bpftool net { show | list } [ dev *NAME* ] + List bpf program attachments in the kernel networking subsystem. + + Currently, device driver xdp attachments, tcx, netkit and old-style tc + classifier/action attachments, flow_dissector as well as netfilter + attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**, + **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**, + **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**. + + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users + can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb, + msg, reuseport} and lwt/seg6 bpf programs, users should consult other + tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tcx, netkit, then tc class/qdisc bpf program attachments, then + flow_dissector and finally netfilter programs. Both xdp programs and + tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc**, the order + will be first all bpf programs attached to tcx, netkit, then tc classes, + then all bpf programs attached to non clsact qdiscs, and finally all bpf + programs attached to root and clsact qdisc. + +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] + Attach bpf program *PROG* to network interface *NAME* with type specified + by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the + command used with **overwrite** option. Currently, only XDP-related modes + are supported for *ATTACH_TYPE*. + + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + +bpftool net detach *ATTACH_TYPE* dev *NAME* + Detach bpf program attached to network interface *NAME* with type specified + by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used + for attach must be specified. Currently, only XDP-related modes are + supported for *ATTACH_TYPE*. + +bpftool net help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 5fea633a82f1..8c1ae55be596 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **perf** *COMMAND* +**bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **help** } +*COMMANDS* := +{ **show** | **list** | **help** } PERF COMMANDS ============= -| **bpftool** **perf** { **show** | **list** } -| **bpftool** **perf help** +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** DESCRIPTION =========== - **bpftool perf { show | list }** - List all raw_tracepoint, tracepoint, kprobe attachment in the system. +bpftool perf { show | list } + List all raw_tracepoint, tracepoint, kprobe attachment in the system. - Output will start with process id and file descriptor in that process, - followed by bpf program id, attachment information, and attachment point. - The attachment point for raw_tracepoint/tracepoint is the trace probe name. - The attachment point for k[ret]probe is either symbol name and offset, - or a kernel virtual address. - The attachment point for u[ret]probe is the file name and the file offset. + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, or a + kernel virtual address. The attachment point for u[ret]probe is the file + name and the file offset. - **bpftool perf help** - Print short help message. +bpftool perf help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58e6a5b10ef7..d6304e01afe0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **prog** *COMMAND* +**bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | - { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | +{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | +{ **-L** | **--use-loader** } } - *COMMANDS* := - { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | - **loadall** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | +**loadall** | **help** } PROG COMMANDS ============= -| **bpftool** **prog** { **show** | **list** } [*PROG*] -| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] -| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] -| **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] -| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog tracelog** -| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] -| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* -| **bpftool** **prog help** +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *TYPE* := { -| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | -| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | -| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | -| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | -| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | -| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | -| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | -| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | -| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** -| } -| *ATTACH_TYPE* := { -| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | -| **sk_skb_stream_parser** | **flow_dissector** -| } -| *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | -| **itlb_misses** | **dtlb_misses** -| } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | +| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | +| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } DESCRIPTION =========== - **bpftool prog { show | list }** [*PROG*] - Show information about loaded programs. If *PROG* is - specified show information only about given programs, - otherwise list all programs currently loaded on the system. - In case of **tag** or **name**, *PROG* may match several - programs which will all be shown. - - Output will start with program ID followed by program type and - zero or more named attributes (depending on kernel version). - - Since Linux 5.1 the kernel can collect statistics on BPF - programs (such as the total time spent running the program, - and the number of times it was run). If available, bpftool - shows such statistics. However, the kernel does not collect - them by defaults, as it slightly impacts performance on each - program run. Activation or deactivation of the feature is - performed via the **kernel.bpf_stats_enabled** sysctl knob. - - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - programs. On such kernels bpftool will automatically emit this - information as well. - - **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] - Dump eBPF instructions of the programs from the kernel. By - default, eBPF will be disassembled and printed to standard - output in human-readable format. In this case, **opcodes** - controls if raw opcodes should be printed as well. - - In case of **tag** or **name**, *PROG* may match several - programs which will all be dumped. However, if **file** or - **visual** is specified, *PROG* must match a single program. - - If **file** is specified, the binary image will instead be - written to *FILE*. - - If **visual** is specified, control flow graph (CFG) will be - built instead, and eBPF instructions will be presented with - CFG in DOT format, on standard output. - - If the programs have line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] - Dump jited image (host machine code) of the program. - - If *FILE* is specified image will be written to a file, - otherwise it will be disassembled and printed to stdout. - *PROG* must match a single program when **file** is specified. - - **opcodes** controls if raw opcodes will be printed. - - If the prog has line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog pin** *PROG* *FILE* - Pin program *PROG* as *FILE*. - - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] - Load bpf program(s) from binary *OBJ* and pin as *PATH*. - **bpftool prog load** pins only the first program from the - *OBJ* as *PATH*. **bpftool prog loadall** pins all programs - from the *OBJ* under *PATH* directory. - **type** is optional, if not specified program type will be - inferred from section names. - By default bpftool will create new maps as declared in the ELF - object being loaded. **map** parameter allows for the reuse - of existing maps. It can be specified multiple times, each - time for a different map. *IDX* refers to index of the map - to be replaced in the ELF file counting from 0, while *NAME* - allows to replace a map by name. *MAP* specifies the map to - use, referring to it by **id** or through a **pinned** file. - If **offload_dev** *NAME* is specified program will be loaded - onto given networking device (offload). - If **xdpmeta_dev** *NAME* is specified program will become - device-bound without offloading, this facilitates access - to XDP metadata. - Optional **pinmaps** argument can be provided to pin all - maps under *MAP_DIR* directory. - - If **autoattach** is specified program will be attached - before pin. In that case, only the link (representing the - program attached to its hook) is pinned, not the program as - such, so the path won't show in **bpftool prog show -f**, - only show in **bpftool link show -f**. Also, this only works - when bpftool (libbpf) is able to infer all necessary - information from the object file, in particular, it's not - supported for all program types. If a program does not - support autoattach, bpftool falls back to regular pinning - for that program instead. - - Note: *PATH* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] - Attach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - attached to current networking name space. - - **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] - Detach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - detached from the current networking name space. - - **bpftool prog tracelog** - Dump the trace pipe of the system to the console (stdout). - Hit <Ctrl+C> to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk**\ () helper. - This should be used only for debugging purposes. For - streaming data from BPF programs to user space, one can use - perf events (see also **bpftool-map**\ (8)). - - **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] - Run BPF program *PROG* in the kernel testing infrastructure - for BPF, meaning that the program works on the data and - context provided by the user, and not on actual packets or - monitored functions etc. Return value and duration for the - test run are printed out to the console. - - Input data is read from the *FILE* passed with **data_in**. - If this *FILE* is "**-**", input data is read from standard - input. Input context, if any, is read from *FILE* passed with - **ctx_in**. Again, "**-**" can be used to read from standard - input, but only if standard input is not already in use for - input data. If a *FILE* is passed with **data_out**, output - data is written to that file. Similarly, output context is - written to the *FILE* passed with **ctx_out**. For both - output flows, "**-**" can be used to print to the standard - output (as plain text, or JSON if relevant option was - passed). If output keywords are omitted, output data and - context are discarded. Keywords **data_size_out** and - **ctx_size_out** are used to pass the size (in bytes) for the - output buffers to the kernel, although the default of 32 kB - should be more than enough for most cases. - - Keyword **repeat** is used to indicate the number of - consecutive runs to perform. Note that output data and - context printed to files correspond to the last of those - runs. The duration printed out at the end of the runs is an - average over all runs performed by the command. - - Not all program types support test run. Among those which do, - not all of them can take the **ctx_in**/**ctx_out** - arguments. bpftool does not perform checks on program types. - - **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* - Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits <Ctrl+C>. *DURATION* is optional. - If *DURATION* is not specified, the profiling will run up to - **UINT_MAX** seconds. - - **bpftool prog help** - Print short help message. +bpftool prog { show | list } [*PROG*] + Show information about loaded programs. If *PROG* is specified show + information only about given programs, otherwise list all programs + currently loaded on the system. In case of **tag** or **name**, *PROG* may + match several programs which will all be shown. + + Output will start with program ID followed by program type and zero or more + named attributes (depending on kernel version). + + Since Linux 5.1 the kernel can collect statistics on BPF programs (such as + the total time spent running the program, and the number of times it was + run). If available, bpftool shows such statistics. However, the kernel does + not collect them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is performed via the + **kernel.bpf_stats_enabled** sysctl knob. + + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF programs. On such kernels + bpftool will automatically emit this information as well. + +bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }] + Dump eBPF instructions of the programs from the kernel. By default, eBPF + will be disassembled and printed to standard output in human-readable + format. In this case, **opcodes** controls if raw opcodes should be printed + as well. + + In case of **tag** or **name**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is specified, + *PROG* must match a single program. + + If **file** is specified, the binary image will instead be written to + *FILE*. + + If **visual** is specified, control flow graph (CFG) will be built instead, + and eBPF instructions will be presented with CFG in DOT format, on standard + output. + + If the programs have line_info available, the source line will be + displayed. If **linum** is specified, the filename, line number and line + column will also be displayed. + +bpftool prog dump jited *PROG* [{ file *FILE* | [opcodes] [linum] }] + Dump jited image (host machine code) of the program. + + If *FILE* is specified image will be written to a file, otherwise it will + be disassembled and printed to stdout. *PROG* must match a single program + when **file** is specified. + + **opcodes** controls if raw opcodes will be printed. + + If the prog has line_info available, the source line will be displayed. If + **linum** is specified, the filename, line number and line column will also + be displayed. + +bpftool prog pin *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog + load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog + loadall** pins all programs from the *OBJ* under *PATH* directory. **type** + is optional, if not specified program type will be inferred from section + names. By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse of existing + maps. It can be specified multiple times, each time for a different map. + *IDX* refers to index of the map to be replaced in the ELF file counting + from 0, while *NAME* allows to replace a map by name. *MAP* specifies the + map to use, referring to it by **id** or through a **pinned** file. If + **offload_dev** *NAME* is specified program will be loaded onto given + networking device (offload). If **xdpmeta_dev** *NAME* is specified program + will become device-bound without offloading, this facilitates access to XDP + metadata. Optional **pinmaps** argument can be provided to pin all maps + under *MAP_DIR* directory. + + If **autoattach** is specified program will be attached before pin. In that + case, only the link (representing the program attached to its hook) is + pinned, not the program as such, so the path won't show in **bpftool prog + show -f**, only show in **bpftool link show -f**. Also, this only works + when bpftool (libbpf) is able to infer all necessary information from the + object file, in particular, it's not supported for all program types. If a + program does not support autoattach, bpftool falls back to regular pinning + for that program instead. + + Note: *PATH* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is attached to current networking name space. + +bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is detached from the current networking name space. + +bpftool prog tracelog + Dump the trace pipe of the system to the console (stdout). Hit <Ctrl+C> to + stop printing. BPF programs can write to this trace pipe at runtime with + the **bpf_trace_printk**\ () helper. This should be used only for debugging + purposes. For streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). + +bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*] + Run BPF program *PROG* in the kernel testing infrastructure for BPF, + meaning that the program works on the data and context provided by the + user, and not on actual packets or monitored functions etc. Return value + and duration for the test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. If this *FILE* + is "**-**", input data is read from standard input. Input context, if any, + is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to + read from standard input, but only if standard input is not already in use + for input data. If a *FILE* is passed with **data_out**, output data is + written to that file. Similarly, output context is written to the *FILE* + passed with **ctx_out**. For both output flows, "**-**" can be used to + print to the standard output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and context are + discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass + the size (in bytes) for the output buffers to the kernel, although the + default of 32 kB should be more than enough for most cases. + + Keyword **repeat** is used to indicate the number of consecutive runs to + perform. Note that output data and context printed to files correspond to + the last of those runs. The duration printed out at the end of the runs is + an average over all runs performed by the command. + + Not all program types support test run. Among those which do, not all of + them can take the **ctx_in**/**ctx_out** arguments. bpftool does not + perform checks on program types. + +bpftool prog profile *PROG* [duration *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until + user hits <Ctrl+C>. *DURATION* is optional. If *DURATION* is not specified, + the profiling will run up to **UINT_MAX** seconds. + +bpftool prog help + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -f, --bpffs - When showing BPF programs, show file names of pinned - programs. - - -m, --mapcompat - Allow loading maps with unknown map definitions. - - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. - - -L, --use-loader - Load program as a "loader" program. This is useful to debug - the generation of such programs. When this option is in - use, bpftool attempts to load the programs from the object - file into the kernel, but does not pin them (therefore, the - *PATH* must not be provided). - - When combined with the **-d**\ \|\ **--debug** option, - additional debug messages are generated, and the execution - of the loader program will use the **bpf_trace_printk**\ () - helper to log each step of loading BTF, creating the maps, - and loading the programs (see **bpftool prog tracelog** as - a way to dump those messages). +.. include:: common_options.rst + +-f, --bpffs + When showing BPF programs, show file names of pinned programs. + +-m, --mapcompat + Allow loading maps with unknown map definitions. + +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. + +-L, --use-loader + Load program as a "loader" program. This is useful to debug the generation + of such programs. When this option is in use, bpftool attempts to load the + programs from the object file into the kernel, but does not pin them + (therefore, the *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, additional debug + messages are generated, and the execution of the loader program will use + the **bpf_trace_printk**\ () helper to log each step of loading BTF, + creating the maps, and loading the programs (see **bpftool prog tracelog** + as a way to dump those messages). EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 8022b5321dbe..e871b9539ac7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops SYNOPSIS ======== - **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* +**bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump** | **register** | **unregister** | **help** } STRUCT_OPS COMMANDS =================== -| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] -| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* -| **bpftool** **struct_ops help** +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** | -| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } -| *OBJ* := /a/file/of/bpf_struct_ops.o +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o DESCRIPTION =========== - **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] - Show brief information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it shows information only - for the given struct_ops. Otherwise, it lists all struct_ops - currently existing in the system. - - Output will start with struct_ops map ID, followed by its map - name and its struct_ops's kernel type. - - **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] - Dump details information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it dumps information only - for the given struct_ops. Otherwise, it dumps all struct_ops - currently existing in the system. - - **bpftool struct_ops register** *OBJ* [*LINK_DIR*] - Register bpf struct_ops from *OBJ*. All struct_ops under - the ELF section ".struct_ops" and ".struct_ops.link" will - be registered to its kernel subsystem. For each - struct_ops in the ".struct_ops.link" section, a link - will be created. You can give *LINK_DIR* to provide a - directory path where these links will be pinned with the - same name as their corresponding map name. - - **bpftool struct_ops unregister** *STRUCT_OPS_MAP* - Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. - - **bpftool struct_ops help** - Print short help message. +bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it shows information only for the given + struct_ops. Otherwise, it lists all struct_ops currently existing in the + system. + + Output will start with struct_ops map ID, followed by its map name and its + struct_ops's kernel type. + +bpftool struct_ops dump [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it dumps information only for the given + struct_ops. Otherwise, it dumps all struct_ops currently existing in the + system. + +bpftool struct_ops register *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section + ".struct_ops" and ".struct_ops.link" will be registered to its kernel + subsystem. For each struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a directory path where + these links will be pinned with the same name as their corresponding map + name. + +bpftool struct_ops unregister *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + +bpftool struct_ops help + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 09e4f2ff5658..f38ae5c40439 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } +**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } - **bpftool** **batch file** *FILE* +**bpftool** **batch file** *FILE* - **bpftool** **version** +**bpftool** **version** - *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | - **btf** | **gen** | **struct_ops** | **iter** } +*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | +**btf** | **gen** | **struct_ops** | **iter** } - *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } +*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } - *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **event_pipe** | **help** } +*MAP-COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | - **load** | **attach** | **detach** | **help** } +*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | +**load** | **attach** | **detach** | **help** } - *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } +*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } - *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } +*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } - *PERF-COMMANDS* := { **show** | **list** | **help** } +*PERF-COMMANDS* := { **show** | **list** | **help** } - *NET-COMMANDS* := { **show** | **list** | **help** } +*NET-COMMANDS* := { **show** | **list** | **help** } - *FEATURE-COMMANDS* := { **probe** | **help** } +*FEATURE-COMMANDS* := { **probe** | **help** } - *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } +*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } - *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } +*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } - *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } - *ITER-COMMANDS* := { **pin** | **help** } +*ITER-COMMANDS* := { **pin** | **help** } DESCRIPTION =========== - *bpftool* allows for inspection and simple modification of BPF objects - on the system. +*bpftool* allows for inspection and simple modification of BPF objects on the +system. - Note that format of the output of all tools is not guaranteed to be - stable and should not be depended upon. +Note that format of the output of all tools is not guaranteed to be stable and +should not be depended upon. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 30df7a707f02..9234b9dab768 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,25 +1,23 @@ .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -h, --help - Print short help message (similar to **bpftool help**). + Print short help message (similar to **bpftool help**). -V, --version - Print bpftool's version number (similar to **bpftool version**), the - number of the libbpf version in use, and optional features that were - included when bpftool was compiled. Optional features include linking - against LLVM or libbfd to provide the disassembler for JIT-ted - programs (**bpftool prog dump jited**) and usage of BPF skeletons - (some features like **bpftool prog profile** or showing pids - associated to BPF objects may rely on it). + Print bpftool's version number (similar to **bpftool version**), the number + of the libbpf version in use, and optional features that were included when + bpftool was compiled. Optional features include linking against LLVM or + libbfd to provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like **bpftool prog + profile** or showing pids associated to BPF objects may rely on it). -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. + Generate JSON output. For commands that cannot produce JSON, this option + has no effect. -p, --pretty - Generate human-readable JSON output. Implies **-j**. + Generate human-readable JSON output. Implies **-j**. -d, --debug - Print all logs available, even debug-level information. This includes - logs from libbpf as well as from the verifier, when attempting to - load programs. + Print all logs available, even debug-level information. This includes logs + from libbpf as well as from the verifier, when attempting to load programs. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index e9154ace80ff..dfa4f1bebbb3 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -89,6 +89,10 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_LDFLAGS := $(LDFLAGS) + INSTALL ?= install RM ?= rm -f @@ -143,7 +147,7 @@ ifeq ($(feature-llvm),1) # If LLVM is available, use it for JIT disassembly CFLAGS += -DHAVE_LLVM_SUPPORT LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets - CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) @@ -178,12 +182,9 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) SRCS := $(filter-out jit_disasm.c,$(SRCS)) endif -HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) - BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o @@ -231,14 +232,11 @@ endif CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) -$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) - $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 6e4f7ce6bc01..04afe2ac2228 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -106,19 +106,19 @@ _bpftool_get_link_ids() _bpftool_get_obj_map_names() { - local obj + local obj maps obj=$1 - maps=$(objdump -j maps -t $obj 2>/dev/null | \ - command awk '/g . maps/ {print $NF}') + maps=$(objdump -j .maps -t $obj 2>/dev/null | \ + command awk '/g . .maps/ {print $NF}') COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) } _bpftool_get_obj_map_idxs() { - local obj + local obj nmaps obj=$1 @@ -136,7 +136,7 @@ _sysfs_get_netdevs() # Retrieve type of the map that we are operating on. _bpftool_map_guess_map_type() { - local keyword ref + local keyword idx ref="" for (( idx=3; idx < ${#words[@]}-1; idx++ )); do case "${words[$((idx-2))]}" in lookup|update) @@ -255,8 +255,9 @@ _bpftool_map_update_get_name() _bpftool() { - local cur prev words objword json=0 - _init_completion || return + local cur prev words cword comp_args + local json=0 + _init_completion -- "$@" || return # Deal with options if [[ ${words[cword]} == -* ]]; then @@ -293,7 +294,7 @@ _bpftool() esac # Remove all options so completions don't have to deal with them. - local i + local i pprev for (( i=1; i < ${#words[@]}; )); do if [[ ${words[i]::1} == - ]] && [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then @@ -307,7 +308,7 @@ _bpftool() prev=${words[cword - 1]} pprev=${words[cword - 2]} - local object=${words[1]} command=${words[2]} + local object=${words[1]} if [[ -z $object || $cword -eq 1 ]]; then case $cur in @@ -324,8 +325,12 @@ _bpftool() esac fi + local command=${words[2]} [[ $command == help ]] && return 0 + local MAP_TYPE='id pinned name' + local PROG_TYPE='id pinned tag name' + # Completion depends on object and command in use case $object in prog) @@ -346,8 +351,6 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned name' local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ itlb_misses dtlb_misses' case $command in @@ -457,7 +460,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -541,20 +544,9 @@ _bpftool() COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) return 0 ;; - 6) - case $prev in - duration) - return 0 - ;; - *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) - return 0 - ;; - esac - return 0 - ;; *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + [[ $prev == duration ]] && return 0 + _bpftool_once_attr "$METRIC_TYPE" return 0 ;; esac @@ -612,7 +604,7 @@ _bpftool() return 0 ;; register) - _filedir + [[ $prev == $command ]] && _filedir return 0 ;; *) @@ -638,9 +630,12 @@ _bpftool() pinned) _filedir ;; - *) + map) _bpftool_one_of_list $MAP_TYPE ;; + *) + _bpftool_once_attr 'map' + ;; esac return 0 ;; @@ -652,7 +647,6 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -793,13 +787,11 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -821,7 +813,7 @@ _bpftool() esac _bpftool_once_attr 'key' - local UPDATE_FLAGS='any exist noexist' + local UPDATE_FLAGS='any exist noexist' idx for (( idx=3; idx < ${#words[@]}-1; idx++ )); do if [[ ${words[idx]} == 'value' ]]; then # 'value' is present, but is not the last @@ -893,7 +885,6 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' case $command in dump) @@ -1033,7 +1024,6 @@ _bpftool() local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag name' # Check for $prev = $command first if [ $prev = $command ]; then _filedir @@ -1086,7 +1076,6 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) @@ -1193,14 +1182,14 @@ _bpftool() pin|detach) if [[ $prev == "$command" ]]; then COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) - else + elif [[ $pprev == "$command" ]]; then _filedir fi return 0 ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index cc6e6aae2447..958e92acca8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -244,29 +244,101 @@ int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type) return fd; } -int mount_bpffs_for_pin(const char *name, bool is_dir) +int create_and_mount_bpffs_dir(const char *dir_name) { char err_str[ERR_MAX_LEN]; - char *file; - char *dir; + bool dir_exists; int err = 0; - if (is_dir && is_bpffs(name)) + if (is_bpffs(dir_name)) return err; - file = malloc(strlen(name) + 1); - if (!file) { + dir_exists = access(dir_name, F_OK) == 0; + + if (!dir_exists) { + char *temp_name; + char *parent_name; + + temp_name = strdup(dir_name); + if (!temp_name) { + p_err("mem alloc failed"); + return -1; + } + + parent_name = dirname(temp_name); + + if (is_bpffs(parent_name)) { + /* nothing to do if already mounted */ + free(temp_name); + return err; + } + + if (access(parent_name, F_OK) == -1) { + p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist", + dir_name, parent_name); + free(temp_name); + return -1; + } + + free(temp_name); + } + + if (block_mount) { + p_err("no BPF file system found, not mounting it due to --nomount option"); + return -1; + } + + if (!dir_exists) { + err = mkdir(dir_name, S_IRWXU); + if (err) { + p_err("failed to create dir '%s': %s", dir_name, strerror(errno)); + return err; + } + } + + err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount BPF file system on given dir '%s': %s", + dir_name, err_str); + + if (!dir_exists) + rmdir(dir_name); + } + + return err; +} + +int mount_bpffs_for_file(const char *file_name) +{ + char err_str[ERR_MAX_LEN]; + char *temp_name; + char *dir; + int err = 0; + + if (access(file_name, F_OK) != -1) { + p_err("can't pin BPF object: path '%s' already exists", file_name); + return -1; + } + + temp_name = strdup(file_name); + if (!temp_name) { p_err("mem alloc failed"); return -1; } - strcpy(file, name); - dir = dirname(file); + dir = dirname(temp_name); if (is_bpffs(dir)) /* nothing to do if already mounted */ goto out_free; + if (access(dir, F_OK) == -1) { + p_err("can't pin BPF object: dir '%s' doesn't exist", dir); + err = -1; + goto out_free; + } + if (block_mount) { p_err("no BPF file system found, not mounting it due to --nomount option"); err = -1; @@ -276,12 +348,12 @@ int mount_bpffs_for_pin(const char *name, bool is_dir) err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN); if (err) { err_str[ERR_MAX_LEN - 1] = '\0'; - p_err("can't mount BPF file system to pin the object (%s): %s", - name, err_str); + p_err("can't mount BPF file system to pin the object '%s': %s", + file_name, err_str); } out_free: - free(file); + free(temp_name); return err; } @@ -289,7 +361,7 @@ int do_pin_fd(int fd, const char *name) { int err; - err = mount_bpffs_for_pin(name, false); + err = mount_bpffs_for_file(name); if (err) return err; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea06..c754a428c8c6 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 540c0f2c4fda..b3979ddc0189 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name */ needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); if (needs_typeof) - printf("typeof("); + printf("__typeof__("); err = btf_dump__emit_type_decl(d, var_type_id, &opts); if (err) @@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\ + bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); } diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 6b0e5202ca7a..5c39c2ed36a2 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -76,7 +76,7 @@ static int do_pin(int argc, char **argv) goto close_obj; } - err = mount_bpffs_for_pin(path, false); + err = mount_bpffs_for_file(path); if (err) goto close_link; diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index afde9d0c2ea1..5cd503b763d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) show_link_ifindex_json(info->netkit.ifindex, json_wtr); show_link_attach_type_json(info->netkit.attach_type, json_wtr); break; + case BPF_LINK_TYPE_SOCKMAP: + jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id); + show_link_attach_type_json(info->sockmap.attach_type, json_wtr); + break; case BPF_LINK_TYPE_XDP: show_link_ifindex_json(info->xdp.ifindex, json_wtr); break; @@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) show_link_ifindex_plain(info->netkit.ifindex); show_link_attach_type_plain(info->netkit.attach_type); break; + case BPF_LINK_TYPE_SOCKMAP: + printf("\n\t"); + printf("map_id %u ", info->sockmap.map_id); + show_link_attach_type_plain(info->sockmap.attach_type); + break; case BPF_LINK_TYPE_XDP: printf("\n\t"); show_link_ifindex_plain(info->xdp.ifindex); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8bb08d10dec..9eb764fe4cc8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -142,7 +142,8 @@ const char *get_fd_type_name(enum bpf_obj_type type); char *get_fdinfo(int fd, const char *key); int open_obj_pinned(const char *path, bool quiet); int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type); -int mount_bpffs_for_pin(const char *name, bool is_dir); +int mount_bpffs_for_file(const char *file_name); +int create_and_mount_bpffs_dir(const char *dir_name); int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); int do_pin_fd(int fd, const char *name); diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index 00c77edb6331..9b898571b49e 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) char buf[4096 / sizeof(*e) * sizeof(*e)]; struct pid_iter_bpf *skel; int err, ret, fd = -1, i; - libbpf_print_fn_t default_print; *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); if (IS_ERR(*map)) { @@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) skel->rodata->obj_type = type; - /* we don't want output polluted with libbpf errors if bpf_iter is not - * supported - */ - default_print = libbpf_set_print(libbpf_print_none); - err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + if (!verifier_logs) { + libbpf_print_fn_t default_print; + + /* Unless debug information is on, we don't want the output to + * be polluted with libbpf errors if bpf_iter is not supported. + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + } else { + err = pid_iter_bpf__load(skel); + } if (err) { /* too bad, kernel doesn't support BPF iterators yet */ err = 0; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9cb42a3366c0..1a501cf09e78 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1778,7 +1778,10 @@ offload_dev: goto err_close_obj; } - err = mount_bpffs_for_pin(pinfile, !first_prog_only); + if (first_prog_only) + err = mount_bpffs_for_file(pinfile); + else + err = create_and_mount_bpffs_dir(pinfile); if (err) goto err_close_obj; @@ -2078,7 +2081,7 @@ static int profile_parse_metrics(int argc, char **argv) NEXT_ARG(); } if (selected_cnt > MAX_NUM_PROFILE_METRICS) { - p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + p_err("too many (%d) metrics, please specify no more than %d metrics at a time", selected_cnt, MAX_NUM_PROFILE_METRICS); return -1; } diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c index 26004f0c5a6a..7bdbcac3cf62 100644 --- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c +++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c @@ -102,8 +102,8 @@ int iter(struct bpf_iter__task_file *ctx) BPF_LINK_TYPE_PERF_EVENT___local)) { struct bpf_link *link = (struct bpf_link *) file->private_data; - if (link->type == bpf_core_enum_value(enum bpf_link_type___local, - BPF_LINK_TYPE_PERF_EVENT___local)) { + if (BPF_CORE_READ(link, type) == bpf_core_enum_value(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { e.has_bpf_cookie = true; e.bpf_cookie = get_bpf_cookie(link); } diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index d573f2640d8e..aa43dead249c 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -515,7 +515,7 @@ static int do_register(int argc, char **argv) if (argc == 1) linkdir = GET_ARG(); - if (linkdir && mount_bpffs_for_pin(linkdir, true)) { + if (linkdir && create_and_mount_bpffs_dir(linkdir)) { p_err("can't mount bpffs for pinning"); return -1; } diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1d3a90d93fe2..270c28a0d098 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -146,12 +146,11 @@ def detect_kernel_config(): def for_each_slab(prog): - PGSlab = 1 << prog.constant('PG_slab') - PGHead = 1 << prog.constant('PG_head') + PGSlab = ~prog.constant('PG_slab') for page in for_each_page(prog): try: - if page.flags.value_() & PGSlab: + if page.page_type.value_() == PGSlab: yield cast('struct slab *', page) except FaultError: pass diff --git a/tools/include/linux/align.h b/tools/include/linux/align.h new file mode 100644 index 000000000000..14e34ace80dd --- /dev/null +++ b/tools/include/linux/align.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TOOLS_LINUX_ALIGN_H +#define _TOOLS_LINUX_ALIGN_H + +#include <uapi/linux/const.h> + +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _TOOLS_LINUX_ALIGN_H */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index f3566ea0f932..210c13b1b857 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -3,6 +3,7 @@ #define _TOOLS_LINUX_BITMAP_H #include <string.h> +#include <linux/align.h> #include <linux/bitops.h> #include <linux/find.h> #include <stdlib.h> @@ -25,13 +26,14 @@ bool __bitmap_intersects(const unsigned long *bitmap1, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - memset(dst, 0, len); + memset(dst, 0, bitmap_size(nbits)); } } @@ -83,7 +85,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, */ static inline unsigned long *bitmap_zalloc(int nbits) { - return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long)); + return calloc(1, bitmap_size(nbits)); } /* @@ -126,7 +128,6 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) static inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 8e073a111d2a..b4e4cd071f8c 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -20,6 +20,8 @@ #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 7b65566f3e42..8a63a9913495 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -58,6 +58,10 @@ #define noinline #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 736bdeccdfe4..65aa8ce142e5 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -111,6 +111,24 @@ .off = 0, \ .imm = IMM }) +/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index 7d73da098047..dc0fc7125bc3 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -2,8 +2,8 @@ #ifndef _TOOLS_LINUX_MM_H #define _TOOLS_LINUX_MM_H +#include <linux/align.h> #include <linux/mmzone.h> -#include <uapi/linux/const.h> #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) @@ -11,9 +11,6 @@ #define PHYS_ADDR_MAX (~(phys_addr_t)0) -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) - #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) #define __va(x) ((void *)((unsigned long)(x))) diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb9794421..95483c7d81df 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index bacfd35c5156..5be9d3c7435a 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -185,7 +185,7 @@ void *realloc(void *old_ptr, size_t new_size) if (__builtin_expect(!ret, 0)) return NULL; - memcpy(ret, heap->user_p, heap->len); + memcpy(ret, heap->user_p, user_p_len); munmap(heap, heap->len); return ret; } diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index a01c69dd495f..f9ab28421e6d 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -123,7 +123,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ -static __attribute__((unused)) +__attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { size_t len; @@ -187,22 +187,26 @@ char *strndup(const char *str, size_t maxlen) static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { - size_t len; - char c; - - for (len = 0; dst[len]; len++) - ; - - for (;;) { - c = *src; - if (len < size) - dst[len] = c; - if (!c) + size_t len = strnlen(dst, size); + + /* + * We want len < size-1. But as size is unsigned and can wrap + * around, we use len + 1 instead. + */ + while (len + 1 < size) { + dst[len] = *src; + if (*src == '\0') break; len++; src++; } + if (len < size) + dst[len] = '\0'; + + while (*src++) + len++; + return len; } @@ -210,16 +214,18 @@ static __attribute__((unused)) size_t strlcpy(char *dst, const char *src, size_t size) { size_t len; - char c; - for (len = 0;;) { - c = src[len]; - if (len < size) - dst[len] = c; - if (!c) - break; - len++; + for (len = 0; len < size; len++) { + dst[len] = src[len]; + if (!dst[len]) + return len; } + if (size) + dst[size-1] = '\0'; + + while (src[len]) + len++; + return len; } diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index dda9dffd1d74..7b82bc3cf107 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -22,6 +22,7 @@ #include <linux/stat.h> /* for statx() */ #include <linux/prctl.h> #include <linux/resource.h> +#include <linux/utsname.h> #include "arch.h" #include "errno.h" @@ -1140,6 +1141,32 @@ int umount2(const char *path, int flags) /* + * int uname(struct utsname *buf); + */ + +struct utsname { + char sysname[65]; + char nodename[65]; + char release[65]; + char version[65]; + char machine[65]; + char domainname[65]; +}; + +static __attribute__((unused)) +int sys_uname(struct utsname *buf) +{ + return my_syscall1(__NR_uname, buf); +} + +static __attribute__((unused)) +int uname(struct utsname *buf) +{ + return __sysret(sys_uname(buf)); +} + + +/* * int unlink(const char *path); */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c42b9f1bada..90706a47f6ff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; @@ -1135,6 +1136,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -1662,8 +1664,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ @@ -3392,6 +3396,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -5020,7 +5028,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5506,7 +5514,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) @@ -6718,6 +6726,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); @@ -6936,6 +6948,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle @@ -7118,6 +7132,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7150,7 +7165,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -7195,8 +7210,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { @@ -7283,6 +7309,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h deleted file mode 100644 index 47afae3895ec..000000000000 --- a/tools/include/uapi/linux/ethtool.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * ethtool.h: Defines for Linux ethtool. - * - * Copyright (C) 1998 David S. Miller (davem@redhat.com) - * Copyright 2001 Jeff Garzik <jgarzik@pobox.com> - * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) - * Portions Copyright 2002 Intel (eli.kupermann@intel.com, - * christopher.leech@intel.com, - * scott.feldman@intel.com) - * Portions Copyright (C) Sun Microsystems 2008 - */ - -#ifndef _UAPI_LINUX_ETHTOOL_H -#define _UAPI_LINUX_ETHTOOL_H - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/if_ether.h> - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - -/** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. - * - * This can be used to configure RX, TX and other channels. - */ - -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_FWVERS_LEN 32 -#define ETHTOOL_BUSINFO_LEN 32 -#define ETHTOOL_EROMVERS_LEN 32 - -/** - * struct ethtool_drvinfo - general driver and device information - * @cmd: Command number = %ETHTOOL_GDRVINFO - * @driver: Driver short name. This should normally match the name - * in its bus driver structure (e.g. pci_driver::name). Must - * not be an empty string. - * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string - * @bus_info: Device bus address. This should match the dev_name() - * string for the underlying bus device, if there is one. May be - * an empty string. - * @reserved2: Reserved for future use; see the note on reserved space. - * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and - * %ETHTOOL_SPFLAGS commands; also the number of strings in the - * %ETH_SS_PRIV_FLAGS set - * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS - * command; also the number of strings in the %ETH_SS_STATS set - * @testinfo_len: Number of results returned by the %ETHTOOL_TEST - * command; also the number of strings in the %ETH_SS_TEST set - * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM - * and %ETHTOOL_SEEPROM commands, in bytes - * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS - * command, in bytes - * - * Users can use the %ETHTOOL_GSSET_INFO command to get the number of - * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. - */ -struct ethtool_drvinfo { - __u32 cmd; - char driver[32]; - char version[32]; - char fw_version[ETHTOOL_FWVERS_LEN]; - char bus_info[ETHTOOL_BUSINFO_LEN]; - char erom_version[ETHTOOL_EROMVERS_LEN]; - char reserved2[12]; - __u32 n_priv_flags; - __u32 n_stats; - __u32 testinfo_len; - __u32 eedump_len; - __u32 regdump_len; -}; - -#define ETHTOOL_GDRVINFO 0x00000003 - -#endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 2190adbe3002..ea32b101b999 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1221,7 +1221,7 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) diff --git a/tools/include/uapi/linux/memfd.h b/tools/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..01c0324e7733 --- /dev/null +++ b/tools/include/uapi/linux/memfd.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_MEMFD_H +#define _LINUX_MEMFD_H + +#include <asm-generic/hugetlb_encode.h> + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + +#endif /* _LINUX_MEMFD_H */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index bb65ee840cda..a8188202413e 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -146,6 +146,27 @@ enum { NETDEV_A_QSTATS_TX_PACKETS, NETDEV_A_QSTATS_TX_BYTES, NETDEV_A_QSTATS_RX_ALLOC_FAIL, + NETDEV_A_QSTATS_RX_HW_DROPS, + NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, + NETDEV_A_QSTATS_RX_CSUM_NONE, + NETDEV_A_QSTATS_RX_CSUM_BAD, + NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_BYTES, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, + NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_HW_DROPS, + NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, + NETDEV_A_QSTATS_TX_CSUM_NONE, + NETDEV_A_QSTATS_TX_NEEDS_CSUM, + NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_BYTES, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, + NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_STOP, + NETDEV_A_QSTATS_TX_WAKE, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) diff --git a/tools/include/uapi/linux/userfaultfd.h b/tools/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..4283de22d5b6 --- /dev/null +++ b/tools/include/uapi/linux/userfaultfd.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include <linux/types.h> + +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; + } pagefault; + + struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } remove; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __attribute__((packed)); + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. + */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. + */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 97ec005c3c47..2a4c71501a17 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) */ int probe_memcg_account(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), @@ -766,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); break; case BPF_TRACE_KPROBE_MULTI: + case BPF_TRACE_KPROBE_SESSION: attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); @@ -785,6 +786,7 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: @@ -1173,20 +1175,31 @@ int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info return bpf_obj_get_info_by_fd(link_fd, info, info_len); } -int bpf_raw_tracepoint_open(const char *name, int prog_fd) +int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; + if (!OPTS_VALID(opts, bpf_raw_tp_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); - attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; + attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL)); + attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0); fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name); + + return bpf_raw_tracepoint_open_opts(prog_fd, &opts); +} + int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index df0db2f0cdb7..972e17ec0c09 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -617,6 +617,15 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); +struct bpf_raw_tp_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const char *tp_name; + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tp_opts__last_field cookie + +LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 1ce738d91685..c0e13cdf9660 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,7 +2,7 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ -#include <bpf/bpf_helpers.h> +#include "bpf_helpers.h" /* * enum bpf_field_info_kind is passed as a second argument into @@ -104,6 +104,7 @@ enum bpf_enum_value_kind { case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ + default: val = 0; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cd17f6d0791f..305c62817dd3 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -137,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, @@ -184,10 +186,21 @@ enum libbpf_tristate { #define __kptr __attribute__((btf_type_tag("kptr"))) #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) -#define bpf_ksym_exists(sym) ({ \ - _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ - !!sym; \ +#if defined (__clang__) +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) +#elif __GNUC__ > 8 +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(__builtin_has_attribute (*sym, __weak__), \ + #sym " should be marked as __weak"); \ + !!sym; \ }) +#else +#define bpf_ksym_exists(sym) !!sym +#endif #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1c13f8e88833..9314fa95f04e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -633,18 +633,18 @@ struct pt_regs; #endif #define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11] #define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* @@ -786,14 +786,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) struct pt_regs; #define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) -#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) -#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -821,7 +821,7 @@ static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx) #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) /* @@ -845,24 +845,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) /* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) -#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) -#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ #define ___bpf_syswrap_args0() ctx -#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs) #define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4d9f30bf7f01..5dbca76b953f 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1929,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } @@ -2031,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -2066,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 4e783cc7fc4b..a336786a22a3 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -22,7 +22,7 @@ int probe_fd(int fd) static int probe_kern_prog_name(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a2061fcd612d..5401f2df463d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -132,6 +132,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", + [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", }; static const char * const link_type_name[] = { @@ -149,6 +150,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TCX] = "tcx", [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", }; static const char * const map_type_name[] = { @@ -1126,17 +1128,46 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf_type *mtype, *kern_mtype; __u32 mtype_id, kern_mtype_id; void *mdata, *kern_mdata; + struct bpf_program *prog; __s64 msize, kern_msize; __u32 moff, kern_moff; __u32 kern_member_idx; const char *mname; mname = btf__name_by_offset(btf, member->name_off); + moff = member->offset / 8; + mdata = data + moff; + msize = btf__resolve_size(btf, member->type); + if (msize < 0) { + pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n", + map->name, mname); + return msize; + } + kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { - pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + if (!libbpf_is_mem_zeroed(mdata, msize)) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + if (st_ops->progs[i]) { + /* If we had declaratively set struct_ops callback, we need to + * force its autoload to false, because it doesn't have + * a chance of succeeding from POV of the current struct_ops map. + * If this program is still referenced somewhere else, though, + * then bpf_object_adjust_struct_ops_autoload() will update its + * autoload accordingly. + */ + st_ops->progs[i]->autoload = false; + st_ops->progs[i] = NULL; + } + + /* Skip all-zero/NULL fields if they are not present in the kernel BTF */ + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); - return -ENOTSUP; + continue; } kern_member_idx = kern_member - btf_members(kern_type); @@ -1147,10 +1178,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - moff = member->offset / 8; kern_moff = kern_member->offset / 8; - - mdata = data + moff; kern_mdata = kern_data + kern_moff; mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); @@ -1165,13 +1193,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - struct bpf_program *prog; + prog = *(void **)mdata; + /* just like for !kern_member case above, reset declaratively + * set (at compile time) program's autload to false, + * if user replaced it with another program or NULL + */ + if (st_ops->progs[i] && st_ops->progs[i] != prog) + st_ops->progs[i]->autoload = false; /* Update the value from the shadow type */ - prog = *(void **)mdata; st_ops->progs[i] = prog; if (!prog) continue; + if (!is_valid_st_ops_program(obj, prog)) { pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n", map->name, mname); @@ -1230,9 +1264,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) continue; } - msize = btf__resolve_size(btf, mtype_id); kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); - if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + if (kern_msize < 0 || msize != kern_msize) { pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", map->name, mname, (ssize_t)msize, (ssize_t)kern_msize); @@ -1956,6 +1989,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -7321,11 +7368,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL, *tmp; - int btf_fd, ret, err; bool own_log_buf = true; __u32 log_level = prog->log_level; + int ret, err; - if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* Be more helpful by rejecting programs that can't be validated early + * with more meaningful and actionable error message. + */ + switch (prog->type) { + case BPF_PROG_TYPE_UNSPEC: /* * The program type must be set. Most likely we couldn't find a proper * section definition at load time, and thus we didn't infer the type. @@ -7333,6 +7384,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->attach_btf_id == 0) { + pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n", + prog->name); + return -EINVAL; + } + break; + default: + break; } if (!insns || !insns_cnt) @@ -7347,9 +7407,8 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog load_attr.prog_ifindex = prog->prog_ifindex; /* specify func_info/line_info only if kernel supports them */ - btf_fd = btf__fd(obj->btf); - if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { - load_attr.prog_btf_fd = btf_fd; + if (obj->btf && btf__fd(obj->btf) >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf__fd(obj->btf); load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; load_attr.func_info_cnt = prog->func_info_cnt; @@ -7973,7 +8032,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; @@ -8013,8 +8075,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; @@ -8563,6 +8630,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path) return libbpf_err(-EINVAL); } + if (map->fd < 0) { + pr_warn("map '%s': can't pin BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + if (map->pin_path) { if (path && strcmp(path, map->pin_path)) { pr_warn("map '%s' already has pin path '%s' different from '%s'\n", @@ -9231,6 +9303,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -9247,6 +9320,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kprobe.session+", KPROBE, BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session), SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), @@ -9298,6 +9372,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/verdict", SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), @@ -9816,16 +9891,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, enum bpf_attach_type attach_type, int *btf_obj_fd, int *btf_type_id) { - int ret, i; + int ret, i, mod_len; + const char *fn_name, *mod_name = NULL; - ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); - if (ret > 0) { - *btf_obj_fd = 0; /* vmlinux BTF */ - *btf_type_id = ret; - return 0; + fn_name = strchr(attach_name, ':'); + if (fn_name) { + mod_name = attach_name; + mod_len = fn_name - mod_name; + fn_name++; + } + + if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) { + ret = find_attach_btf_id(obj->btf_vmlinux, + mod_name ? fn_name : attach_name, + attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; } - if (ret != -ENOENT) - return ret; ret = load_module_btfs(obj); if (ret) @@ -9834,7 +9921,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, for (i = 0; i < obj->btf_module_cnt; i++) { const struct module_btf *mod = &obj->btf_modules[i]; - ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0) + continue; + + ret = find_attach_btf_id(mod->btf, + mod_name ? fn_name : attach_name, + attach_type); if (ret > 0) { *btf_obj_fd = mod->fd; *btf_type_id = ret; @@ -10307,6 +10399,11 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, return -EINVAL; } + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + if (!check_value_sz) return 0; @@ -10419,8 +10516,15 @@ long libbpf_get_error(const void *ptr) int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; + int prog_fd = bpf_program__fd(prog); + + if (prog_fd < 0) { + pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } - ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL); return libbpf_err_errno(ret); } @@ -10614,7 +10718,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -11326,18 +11430,26 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, struct kprobe_multi_resolve res = { .pattern = pattern, }; + enum bpf_attach_type attach_type; struct bpf_link *link = NULL; char errmsg[STRERR_BUFSIZE]; const unsigned long *addrs; int err, link_fd, prog_fd; + bool retprobe, session; const __u64 *cookies; const char **syms; - bool retprobe; size_t cnt; if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, false); addrs = OPTS_GET(opts, addrs, false); cnt = OPTS_GET(opts, cnt, false); @@ -11364,6 +11476,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } retprobe = OPTS_GET(opts, retprobe, false); + session = OPTS_GET(opts, session, false); + + if (retprobe && session) + return libbpf_err_ptr(-EINVAL); + + attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI; lopts.kprobe_multi.syms = syms; lopts.kprobe_multi.addrs = addrs; @@ -11378,8 +11496,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to attach: %s\n", @@ -11476,7 +11593,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + pr_warn("kprobe multi pattern is invalid: %s\n", spec); return -EINVAL; } @@ -11485,6 +11602,32 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, + struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.session") */ + if (strcmp(prog->sec_name, "kprobe.session") == 0) + return 0; + + spec = prog->sec_name + sizeof("kprobe.session/") - 1; + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe session pattern is invalid: %s\n", spec); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return *link ? 0 : -errno; +} + static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; @@ -11761,6 +11904,13 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, NULL); offsets = OPTS_GET(opts, offsets, NULL); ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); @@ -11836,7 +11986,6 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -12080,7 +12229,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (bpf_program__fd(prog) < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -12271,13 +12420,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin return libbpf_get_error(*link); } -struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, - const char *tp_name) +struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts) { + LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach before loaded\n", prog->name); @@ -12289,7 +12444,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return libbpf_err_ptr(-ENOMEM); link->detach = &bpf_link__detach_fd; - pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + raw_opts.tp_name = tp_name; + raw_opts.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -12301,6 +12458,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return link; } +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL); +} + static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { static const char *const prefixes[] = { @@ -12454,6 +12617,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ @@ -12662,6 +12831,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) if (!prog->sec_def || !prog->sec_def->prog_attach_fn) return libbpf_err_ptr(-EOPNOTSUPP); + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); if (err) return libbpf_err_ptr(err); @@ -12702,8 +12877,13 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) __u32 zero = 0; int err, fd; - if (!bpf_map__is_struct_ops(map) || map->fd == -1) + if (!bpf_map__is_struct_ops(map)) + return libbpf_err_ptr(-EINVAL); + + if (map->fd < 0) { + pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name); return libbpf_err_ptr(-EINVAL); + } link = calloc(1, sizeof(*link)); if (!link) @@ -12751,9 +12931,14 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) __u32 zero = 0; int err; - if (!bpf_map__is_struct_ops(map) || !map_is_created(map)) + if (!bpf_map__is_struct_ops(map)) return -EINVAL; + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); /* Ensure the type of a link is correct */ if (st_ops_link->map_fd < 0) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7b510761f545..c3f77d9260fe 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -539,10 +539,12 @@ struct bpf_kprobe_multi_opts { size_t cnt; /* create return kprobes */ bool retprobe; + /* create session kprobes */ + bool session; size_t :0; }; -#define bpf_kprobe_multi_opts__last_field retprobe +#define bpf_kprobe_multi_opts__last_field session LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, @@ -760,9 +762,20 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, const char *tp_name, const struct bpf_tracepoint_opts *opts); +struct bpf_raw_tracepoint_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tracepoint_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts); struct bpf_trace_opts { /* size of this struct, for forward/backward compatibility */ @@ -784,6 +797,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, @@ -1282,6 +1297,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /** @@ -1356,6 +1372,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r); */ LIBBPF_API int ring__consume(struct ring *r); +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. + */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 86804fd90dd1..c1ce8aa3520b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -410,7 +410,16 @@ LIBBPF_1.3.0 { LIBBPF_1.4.0 { global: + bpf_program__attach_raw_tracepoint_opts; + bpf_raw_tracepoint_open_opts; bpf_token_create; btf__new_split; btf_ext__raw_data; } LIBBPF_1.3.0; + +LIBBPF_1.5.0 { + global: + bpf_program__attach_sockmap; + ring__consume_n; + ring_buffer__consume_n; +} LIBBPF_1.4.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b36177424..a0dcfb82e455 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122439..9dfbe7750f56 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. */ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e783a47da815..d6e5eff967cb 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 4 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index aacb64278a01..bfd8dac4c0cc 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring *r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -268,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring *r) } smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res > INT_MAX ? INT_MAX : res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -287,13 +317,15 @@ int ring_buffer__consume(struct ring_buffer *rb) for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -314,13 +346,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) __u32 ring_id = rb->events[i].data.fd; struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -371,17 +403,22 @@ int ring__map_fd(const struct ring *r) return r->map_fd; } -int ring__consume(struct ring *r) +int ring__consume_n(struct ring *r, size_t n) { int64_t res; - res = ringbuf_process_ring(r); + res = ringbuf_process_ring(r, n); if (res < 0) return libbpf_err(res); return res > INT_MAX ? INT_MAX : res; } +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index 146da01979c7..5e6a1e27ddf9 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -2,6 +2,7 @@ #undef _GNU_SOURCE #include <string.h> #include <stdio.h> +#include <errno.h> #include "str_error.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -15,7 +16,18 @@ char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err < 0 ? -err : err, dst, len); - if (ret) - snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + /* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't + * handle the error, on glibc >=2.13 *positive* (errno-like) error + * code is returned directly + */ + if (ret == -1) + ret = errno; + if (ret) { + if (ret == EINVAL) + /* strerror_r() doesn't recognize this specific error */ + snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err); + else + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + } return dst; } diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index f6763300b26a..76359bcdc94a 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -214,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx) /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ #define ___bpf_usdt_args0() ctx -#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) -#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) -#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) -#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) -#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) -#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) -#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) -#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) -#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) -#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) -#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) -#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; }) #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6be5..9e7307186b7f 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py index f131e33ac3ee..b8481f401376 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/cli.py @@ -19,13 +19,30 @@ class YnlEncoder(json.JSONEncoder): def main(): - parser = argparse.ArgumentParser(description='YNL CLI sample') + description = """ + YNL CLI utility - a general purpose netlink utility that uses YAML + specs to drive protocol encoding and decoding. + """ + epilog = """ + The --multi option can be repeated to include several do operations + in the same netlink payload. + """ + + parser = argparse.ArgumentParser(description=description, + epilog=epilog) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--schema', dest='schema', type=str) parser.add_argument('--no-schema', action='store_true') parser.add_argument('--json', dest='json_text', type=str) - parser.add_argument('--do', dest='do', type=str) - parser.add_argument('--dump', dest='dump', type=str) + + group = parser.add_mutually_exclusive_group() + group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + group.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) + group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + group.add_argument('--list-ops', action='store_true') + group.add_argument('--list-msgs', action='store_true') + parser.add_argument('--sleep', dest='sleep', type=int) parser.add_argument('--subscribe', dest='ntf', type=str) parser.add_argument('--replace', dest='flags', action='append_const', @@ -66,6 +83,13 @@ def main(): if args.sleep: time.sleep(args.sleep) + if args.list_ops: + for op_name, op in ynl.ops.items(): + print(op_name, " [", ", ".join(op.modes), "]") + if args.list_msgs: + for op_name, op in ynl.msgs.items(): + print(op_name, " [", ", ".join(op.modes), "]") + try: if args.do: reply = ynl.do(args.do, attrs, args.flags) @@ -73,6 +97,10 @@ def main(): if args.dump: reply = ynl.dump(args.dump, attrs) output(reply) + if args.multi: + ops = [ (item[0], json.loads(item[1]), args.flags or []) for item in args.multi ] + reply = ynl.do_multi(ops) + output(reply) except NlError as e: print(e) exit(1) diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py index 6c9f7e31250c..63c471f075ab 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/ethtool.py @@ -6,6 +6,7 @@ import json import pprint import sys import re +import os from lib import YnlFamily @@ -152,8 +153,11 @@ def main(): global args args = parser.parse_args() - spec = '../../../Documentation/netlink/specs/ethtool.yaml' - schema = '../../../Documentation/netlink/genetlink-legacy.yaml' + script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + spec = os.path.join(script_abs_dir, + '../../../Documentation/netlink/specs/ethtool.yaml') + schema = os.path.join(script_abs_dir, + '../../../Documentation/netlink/genetlink-legacy.yaml') ynl = YnlFamily(spec, schema) @@ -320,7 +324,13 @@ def main(): return if args.show_time_stamping: - tsinfo = dumpit(ynl, args, 'tsinfo-get') + req = { + 'header': { + 'flags': 'stats', + }, + } + + tsinfo = dumpit(ynl, args, 'tsinfo-get', req) print(f'Time stamping parameters for {args.device}:') @@ -334,6 +344,9 @@ def main(): print('Hardware Receive Filter Modes:') [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + + print('Statistics:') + [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return print(f'Settings for {args.device}:') diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index 6d08ab9e213f..b6d6f8aef423 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -335,6 +335,7 @@ class SpecOperation(SpecElement): req_value numerical ID when serialized, user -> kernel rsp_value numerical ID when serialized, user <- kernel + modes supported operation modes (do, dump, event etc.) is_call bool, whether the operation is a call is_async bool, whether the operation is a notification is_resv bool, whether the operation does not exist (it's just a reserved ID) @@ -350,6 +351,7 @@ class SpecOperation(SpecElement): self.req_value = req_value self.rsp_value = rsp_value + self.modes = yaml.keys() & {'do', 'dump', 'event', 'notify'} self.is_call = 'do' in yaml or 'dump' in yaml self.is_async = 'notify' in yaml or 'event' in yaml self.is_resv = not self.is_async and not self.is_call diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 9842e85a8c57..eef7c6324ed4 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -91,6 +91,18 @@ void ynl_sock_destroy(struct ynl_sock *ys); !ynl_dump_obj_is_last(iter); \ iter = ynl_dump_obj_next(iter)) +/** + * ynl_dump_empty() - does the dump have no entries + * @dump: pointer to the dump list, as returned by a dump call + * + * Check if the dump is empty, i.e. contains no objects. + * Dump calls return NULL on error, and terminator element if empty. + */ +static inline bool ynl_dump_empty(void *dump) +{ + return dump == (void *)YNL_LIST_END; +} + int ynl_subscribe(struct ynl_sock *ys, const char *grp_name); int ynl_socket_get_fd(struct ynl_sock *ys); int ynl_ntf_check(struct ynl_sock *ys); diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 25810e18b0a7..35e666928119 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause from collections import namedtuple +from enum import Enum import functools import os import random @@ -76,13 +77,33 @@ class Netlink: NLMSGERR_ATTR_MISS_TYPE = 5 NLMSGERR_ATTR_MISS_NEST = 6 + # Policy types + NL_POLICY_TYPE_ATTR_TYPE = 1 + NL_POLICY_TYPE_ATTR_MIN_VALUE_S = 2 + NL_POLICY_TYPE_ATTR_MAX_VALUE_S = 3 + NL_POLICY_TYPE_ATTR_MIN_VALUE_U = 4 + NL_POLICY_TYPE_ATTR_MAX_VALUE_U = 5 + NL_POLICY_TYPE_ATTR_MIN_LENGTH = 6 + NL_POLICY_TYPE_ATTR_MAX_LENGTH = 7 + NL_POLICY_TYPE_ATTR_POLICY_IDX = 8 + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE = 9 + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK = 10 + NL_POLICY_TYPE_ATTR_PAD = 11 + NL_POLICY_TYPE_ATTR_MASK = 12 + + AttrType = Enum('AttrType', ['flag', 'u8', 'u16', 'u32', 'u64', + 's8', 's16', 's32', 's64', + 'binary', 'string', 'nul-string', + 'nested', 'nested-array', + 'bitfield32', 'sint', 'uint']) class NlError(Exception): def __init__(self, nl_msg): self.nl_msg = nl_msg + self.error = -nl_msg.error def __str__(self): - return f"Netlink error: {os.strerror(-self.nl_msg.error)}\n{self.nl_msg}" + return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}" class ConfigError(Exception): @@ -199,6 +220,8 @@ class NlMsg: self.extack['miss-nest'] = extack.as_scalar('u32') elif extack.type == Netlink.NLMSGERR_ATTR_OFFS: self.extack['bad-attr-offs'] = extack.as_scalar('u32') + elif extack.type == Netlink.NLMSGERR_ATTR_POLICY: + self.extack['policy'] = self._decode_policy(extack.raw) else: if 'unknown' not in self.extack: self.extack['unknown'] = [] @@ -210,10 +233,33 @@ class NlMsg: miss_type = self.extack['miss-type'] if miss_type in attr_space.attrs_by_val: spec = attr_space.attrs_by_val[miss_type] - desc = spec['name'] + self.extack['miss-type'] = spec['name'] if 'doc' in spec: - desc += f" ({spec['doc']})" - self.extack['miss-type'] = desc + self.extack['miss-type-doc'] = spec['doc'] + + def _decode_policy(self, raw): + policy = {} + for attr in NlAttrs(raw): + if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: + type = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type).name + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: + policy['min-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: + policy['max-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_U: + policy['min-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_U: + policy['max-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_LENGTH: + policy['min-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_LENGTH: + policy['max-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: + policy['bitfield32-mask'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MASK: + policy['mask'] = attr.as_scalar('u64') + return policy def cmd(self): return self.nl_type @@ -340,12 +386,9 @@ class NetlinkProtocol: def _decode(self, nl_msg): return nl_msg - def decode(self, ynl, nl_msg): + def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) - fixed_header_size = 0 - if ynl: - op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -585,15 +628,28 @@ class YnlFamily(SpecFamily): decoded = self._formatted_string(decoded, attr_spec.display_hint) return decoded - def _decode_array_nest(self, attr, attr_spec): + def _decode_array_attr(self, attr, attr_spec): decoded = [] offset = 0 while offset < len(attr.raw): item = NlAttr(attr.raw, offset) offset += item.full_len - subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) - decoded.append({ item.type: subattrs }) + if attr_spec["sub-type"] == 'nest': + subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) + decoded.append({ item.type: subattrs }) + elif attr_spec["sub-type"] == 'binary': + subattrs = item.as_bin() + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + elif attr_spec["sub-type"] in NlAttr.type_formats: + subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + else: + raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -687,8 +743,8 @@ class YnlFamily(SpecFamily): decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) if 'enum' in attr_spec: decoded = self._decode_enum(decoded, attr_spec) - elif attr_spec["type"] == 'array-nest': - decoded = self._decode_array_nest(attr, attr_spec) + elif attr_spec["type"] == 'indexed-array': + decoded = self._decode_array_attr(attr, attr_spec) elif attr_spec["type"] == 'bitfield32': value, selector = struct.unpack("II", attr.raw) if 'enum' in attr_spec: @@ -738,7 +794,7 @@ class YnlFamily(SpecFamily): if 'bad-attr-offs' not in extack: return - msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set)) + msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs']) @@ -820,7 +876,10 @@ class YnlFamily(SpecFamily): if display_hint == 'mac': formatted = ':'.join('%02x' % b for b in raw) elif display_hint == 'hex': - formatted = bytes.hex(raw, ' ') + if isinstance(raw, int): + formatted = hex(raw) + else: + formatted = bytes.hex(raw, ' ') elif display_hint in [ 'ipv4', 'ipv6' ]: formatted = format(ipaddress.ip_address(raw)) elif display_hint == 'uuid': @@ -860,7 +919,8 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - decoded = self.nlproto.decode(self, nl_msg) + op = self.rsp_by_value[nl_msg.cmd()] + decoded = self.nlproto.decode(self, nl_msg, op) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -878,16 +938,11 @@ class YnlFamily(SpecFamily): return op['do']['request']['attributes'].copy() - def _op(self, method, vals, flags=None, dump=False): - op = self.ops[method] - + def _encode_message(self, op, vals, flags, req_seq): nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK for flag in flags or []: nl_flags |= flag - if dump: - nl_flags |= Netlink.NLM_F_DUMP - req_seq = random.randint(1024, 65535) msg = self.nlproto.message(nl_flags, op.req_value, 1, req_seq) if op.fixed_header: msg += self._encode_struct(op.fixed_header, vals) @@ -895,18 +950,36 @@ class YnlFamily(SpecFamily): for name, value in vals.items(): msg += self._add_attr(op.attr_set.name, name, value, search_attrs) msg = _genl_msg_finalize(msg) + return msg - self.sock.send(msg, 0) + def _ops(self, ops): + reqs_by_seq = {} + req_seq = random.randint(1024, 65535) + payload = b'' + for (method, vals, flags) in ops: + op = self.ops[method] + msg = self._encode_message(op, vals, flags, req_seq) + reqs_by_seq[req_seq] = (op, msg, flags) + payload += msg + req_seq += 1 + + self.sock.send(payload, 0) done = False rsp = [] + op_rsp = [] while not done: reply = self.sock.recv(self._recv_size) nms = NlMsgs(reply, attr_space=op.attr_set) self._recv_dbg_print(reply, nms) for nl_msg in nms: - if nl_msg.extack: - self._decode_extack(msg, op, nl_msg.extack) + if nl_msg.nl_seq in reqs_by_seq: + (op, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq] + if nl_msg.extack: + self._decode_extack(req_msg, op, nl_msg.extack) + else: + op = self.rsp_by_value[nl_msg.cmd()] + req_flags = [] if nl_msg.error: raise NlError(nl_msg) @@ -914,13 +987,25 @@ class YnlFamily(SpecFamily): if nl_msg.extack: print("Netlink warning:") print(nl_msg) - done = True + + if Netlink.NLM_F_DUMP in req_flags: + rsp.append(op_rsp) + elif not op_rsp: + rsp.append(None) + elif len(op_rsp) == 1: + rsp.append(op_rsp[0]) + else: + rsp.append(op_rsp) + op_rsp = [] + + del reqs_by_seq[nl_msg.nl_seq] + done = len(reqs_by_seq) == 0 break - decoded = self.nlproto.decode(self, nl_msg) + decoded = self.nlproto.decode(self, nl_msg, op) # Check if this is a reply to our request - if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value: + if nl_msg.nl_seq not in reqs_by_seq or decoded.cmd() != op.rsp_value: if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue @@ -931,16 +1016,23 @@ class YnlFamily(SpecFamily): rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header)) - rsp.append(rsp_msg) + op_rsp.append(rsp_msg) - if not rsp: - return None - if not dump and len(rsp) == 1: - return rsp[0] return rsp + def _op(self, method, vals, flags=None, dump=False): + req_flags = flags or [] + if dump: + req_flags.append(Netlink.NLM_F_DUMP) + + ops = [(method, vals, req_flags)] + return self._ops(ops)[0] + def do(self, method, vals, flags=None): return self._op(method, vals, flags) def dump(self, method, vals): - return self._op(method, vals, [], dump=True) + return self._op(method, vals, dump=True) + + def do_multi(self, ops): + return self._ops(ops) diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 591b90e21890..3e7b29bd55d5 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -100,6 +100,8 @@ int main(int argc, char **argv) if (!devs) goto err_close; + if (ynl_dump_empty(devs)) + fprintf(stderr, "Error: no devices reported\n"); ynl_dump_foreach(devs, d) netdev_print_device(d, 0); netdev_dev_get_list_free(devs); diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index a451cbfbd781..a42d62b23ee0 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -413,7 +413,7 @@ class TypeString(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ .type = ' + policy if 'max-len' in self.checks: @@ -465,7 +465,7 @@ class TypeBinary(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ ' if len(self.checks) == 1 and 'min-len' in self.checks: @@ -841,8 +841,11 @@ class AttrSet(SpecAttrSet): t = TypeBitfield32(self.family, self, elem, value) elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) - elif elem['type'] == 'array-nest': - t = TypeArrayNest(self.family, self, elem, value) + elif elem['type'] == 'indexed-array' and 'sub-type' in elem: + if elem["sub-type"] == 'nest': + t = TypeArrayNest(self.family, self, elem, value) + else: + raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') elif elem['type'] == 'nest-type-value': t = TypeNestTypeValue(self.family, self, elem, value) else: @@ -1055,7 +1058,7 @@ class Family(SpecFamily): if nested in self.root_sets: raise Exception("Inheriting members to a space used as root not supported") inherit.update(set(spec['type-value'])) - elif spec['type'] == 'array-nest': + elif spec['type'] == 'indexed-array': inherit.add('idx') self.pure_nested_structs[nested].set_inherited(inherit) @@ -1619,9 +1622,12 @@ def _multi_parse(ri, struct, init_lines, local_vars): multi_attrs = set() needs_parg = False for arg, aspec in struct.member_list(): - if aspec['type'] == 'array-nest': - local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') - array_nests.add(arg) + if aspec['type'] == 'indexed-array' and 'sub-type' in aspec: + if aspec["sub-type"] == 'nest': + local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') + array_nests.add(arg) + else: + raise Exception(f'Not supported sub-type {aspec["sub-type"]}') if 'multi-attr' in aspec: multi_attrs.add(arg) needs_parg |= 'nested-attributes' in aspec diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 927407b3efb3..657e881d2ea4 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -82,9 +82,9 @@ def rst_subsubsection(title: str) -> str: return f"{title}\n" + "~" * len(title) -def rst_section(title: str) -> str: +def rst_section(namespace: str, prefix: str, title: str) -> str: """Add a section to the document""" - return f"\n{title}\n" + "=" * len(title) + return f".. _{namespace}-{prefix}-{title}:\n\n{title}\n" + "=" * len(title) def rst_subtitle(title: str) -> str: @@ -102,6 +102,17 @@ def rst_list_inline(list_: List[str], level: int = 0) -> str: return headroom(level) + "[" + ", ".join(inline(i) for i in list_) + "]" +def rst_ref(namespace: str, prefix: str, name: str) -> str: + """Add a hyperlink to the document""" + mappings = {'enum': 'definition', + 'fixed-header': 'definition', + 'nested-attributes': 'attribute-set', + 'struct': 'definition'} + if prefix in mappings: + prefix = mappings[prefix] + return f":ref:`{namespace}-{prefix}-{name}`" + + def rst_header() -> str: """The headers for all the auto generated RST files""" lines = [] @@ -159,20 +170,24 @@ def parse_do_attributes(attrs: Dict[str, Any], level: int = 0) -> str: return "\n".join(lines) -def parse_operations(operations: List[Dict[str, Any]]) -> str: +def parse_operations(operations: List[Dict[str, Any]], namespace: str) -> str: """Parse operations block""" preprocessed = ["name", "doc", "title", "do", "dump"] + linkable = ["fixed-header", "attribute-set"] lines = [] for operation in operations: - lines.append(rst_section(operation["name"])) + lines.append(rst_section(namespace, 'operation', operation["name"])) lines.append(rst_paragraph(sanitize(operation["doc"])) + "\n") for key in operation.keys(): if key in preprocessed: # Skip the special fields continue - lines.append(rst_fields(key, operation[key], 0)) + value = operation[key] + if key in linkable: + value = rst_ref(namespace, key, value) + lines.append(rst_fields(key, value, 0)) if "do" in operation: lines.append(rst_paragraph(":do:", 0)) @@ -212,14 +227,14 @@ def parse_entries(entries: List[Dict[str, Any]], level: int) -> str: return "\n".join(lines) -def parse_definitions(defs: Dict[str, Any]) -> str: +def parse_definitions(defs: Dict[str, Any], namespace: str) -> str: """Parse definitions section""" preprocessed = ["name", "entries", "members"] ignored = ["render-max"] # This is not printed lines = [] for definition in defs: - lines.append(rst_section(definition["name"])) + lines.append(rst_section(namespace, 'definition', definition["name"])) for k in definition.keys(): if k in preprocessed + ignored: continue @@ -237,14 +252,15 @@ def parse_definitions(defs: Dict[str, Any]) -> str: return "\n".join(lines) -def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: +def parse_attr_sets(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse attribute from attribute-set""" preprocessed = ["name", "type"] + linkable = ["enum", "nested-attributes", "struct", "sub-message"] ignored = ["checks"] lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'attribute-set', entry["name"])) for attr in entry["attributes"]: type_ = attr.get("type") attr_line = attr["name"] @@ -257,25 +273,31 @@ def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: for k in attr.keys(): if k in preprocessed + ignored: continue - lines.append(rst_fields(k, sanitize(attr[k]), 0)) + if k in linkable: + value = rst_ref(namespace, k, attr[k]) + else: + value = sanitize(attr[k]) + lines.append(rst_fields(k, value, 0)) lines.append("\n") return "\n".join(lines) -def parse_sub_messages(entries: List[Dict[str, Any]]) -> str: +def parse_sub_messages(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse sub-message definitions""" lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'sub-message', entry["name"])) for fmt in entry["formats"]: value = fmt["value"] lines.append(rst_bullet(bold(value))) for attr in ['fixed-header', 'attribute-set']: if attr in fmt: - lines.append(rst_fields(attr, fmt[attr], 1)) + lines.append(rst_fields(attr, + rst_ref(namespace, attr, fmt[attr]), + 1)) lines.append("\n") return "\n".join(lines) @@ -289,9 +311,11 @@ def parse_yaml(obj: Dict[str, Any]) -> str: lines.append(rst_header()) - title = f"Family ``{obj['name']}`` netlink specification" + family = obj['name'] + + title = f"Family ``{family}`` netlink specification" lines.append(rst_title(title)) - lines.append(rst_paragraph(".. contents::\n")) + lines.append(rst_paragraph(".. contents:: :depth: 3\n")) if "doc" in obj: lines.append(rst_subtitle("Summary")) @@ -300,7 +324,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Operations if "operations" in obj: lines.append(rst_subtitle("Operations")) - lines.append(parse_operations(obj["operations"]["list"])) + lines.append(parse_operations(obj["operations"]["list"], family)) # Multicast groups if "mcast-groups" in obj: @@ -310,17 +334,17 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Definitions if "definitions" in obj: lines.append(rst_subtitle("Definitions")) - lines.append(parse_definitions(obj["definitions"])) + lines.append(parse_definitions(obj["definitions"], family)) # Attributes set if "attribute-sets" in obj: lines.append(rst_subtitle("Attribute sets")) - lines.append(parse_attr_sets(obj["attribute-sets"])) + lines.append(parse_attr_sets(obj["attribute-sets"], family)) # Sub-messages if "sub-messages" in obj: lines.append(rst_subtitle("Sub-messages")) - lines.append(parse_sub_messages(obj["sub-messages"])) + lines.append(parse_sub_messages(obj["sub-messages"], family)) return "\n".join(lines) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c8923375e30d..630e16c54ed5 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -186,8 +186,6 @@ static_var: return ret2; } -#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long)) - static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, const char *cast, bool user_access) @@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, total = dwarf_bytesize(vr_die); if (boffs < 0 || total < 0) return -ENOENT; - ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs, + ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs, BYTES_TO_BITS(total)); goto formatted; } diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 388c9e3ad040..44a9ecbd91e8 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -174,6 +174,8 @@ void print_cap(struct pfru_update_cap_info *cap) exit(1); } + printf("update capability:%d\n", cap->update_cap); + uuid_unparse(cap->code_type, uuid); printf("code injection image type:%s\n", uuid); printf("fw_version:%d\n", cap->fw_version); diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index d865dc1f89ee..5899c27c2e2e 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,9 +16,9 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.18"; +static const char *version_str = "v1.19"; -static const int supported_api_ver = 2; +static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; static char *progname; static int debug_flag; @@ -46,6 +46,8 @@ static int force_online_offline; static int auto_mode; static int fact_enable_fail; static int cgroupv2; +static int max_die_id; +static int max_punit_id; /* clos related */ static int current_clos = -1; @@ -562,6 +564,18 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void } for (i = 0; i < MAX_PACKAGE_COUNT; i++) { + if (max_die_id == max_punit_id) { + for (k = 0; k < MAX_PUNIT_PER_DIE && k < MAX_DIE_PER_PACKAGE; k++) { + id.cpu = cpus[i][k][k]; + id.pkg = i; + id.die = k; + id.punit = k; + if (isst_is_punit_valid(&id)) + callback(&id, arg1, arg2, arg3, arg4); + } + continue; + } + for (j = 0; j < MAX_DIE_PER_PACKAGE; j++) { /* * Fix me: @@ -795,6 +809,12 @@ static void create_cpu_map(void) cpu_cnt[pkg_id][die_id][punit_id]++; + if (max_die_id < die_id) + max_die_id = die_id; + + if (max_punit_id < cpu_map[i].punit_id) + max_punit_id = cpu_map[i].punit_id; + debug_printf( "map logical_cpu:%d core: %d die:%d pkg:%d punit:%d punit_cpu:%d punit_core:%d\n", i, cpu_map[i].core_id, cpu_map[i].die_id, @@ -2054,6 +2074,7 @@ static void dump_fact_config_for_cpu(struct isst_id *id, void *arg1, void *arg2, struct isst_fact_info fact_info; int ret; + memset(&fact_info, 0, sizeof(fact_info)); ret = isst_get_fact_info(id, tdp_level, fact_bucket, &fact_info); if (ret) { isst_display_error_info_message(1, "Failed to get turbo-freq info at this level", 1, tdp_level); diff --git a/tools/power/x86/intel-speed-select/isst-core-mbox.c b/tools/power/x86/intel-speed-select/isst-core-mbox.c index 24bea57f4ff5..c81ecd602bcf 100644 --- a/tools/power/x86/intel-speed-select/isst-core-mbox.c +++ b/tools/power/x86/intel-speed-select/isst-core-mbox.c @@ -746,6 +746,7 @@ static int mbox_set_pbf_fact_status(struct isst_id *id, int pbf, int enable) static int _get_fact_bucket_info(struct isst_id *id, int level, struct isst_fact_bucket_info *bucket_info) { + int trl_max_levels = isst_get_trl_max_levels(); unsigned int resp; int i, k, ret; @@ -769,7 +770,7 @@ static int _get_fact_bucket_info(struct isst_id *id, int level, } } - for (k = 0; k < 3; ++k) { + for (k = 0; k < trl_max_levels; ++k) { for (i = 0; i < 2; ++i) { int j; diff --git a/tools/power/x86/intel-speed-select/isst-core-tpmi.c b/tools/power/x86/intel-speed-select/isst-core-tpmi.c index 3458768562e5..32ea70c7dbd8 100644 --- a/tools/power/x86/intel-speed-select/isst-core-tpmi.c +++ b/tools/power/x86/intel-speed-select/isst-core-tpmi.c @@ -194,8 +194,14 @@ static int tpmi_get_ctdp_control(struct isst_id *id, int config_index, if (!(info.level_mask & level_mask)) return -1; - ctdp_level->fact_support = info.sst_tf_support; - ctdp_level->pbf_support = info.sst_bf_support; + if (api_version() > 2) { + ctdp_level->fact_support = info.sst_tf_support & BIT(config_index); + ctdp_level->pbf_support = info.sst_bf_support & BIT(config_index); + } else { + ctdp_level->fact_support = info.sst_tf_support; + ctdp_level->pbf_support = info.sst_bf_support; + } + ctdp_level->fact_enabled = !!(info.feature_state & BIT(1)); ctdp_level->pbf_enabled = !!(info.feature_state & BIT(0)); diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index f55fef4c13a7..05efffbca3b7 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -23,6 +23,7 @@ int isst_set_platform_ops(int api_version) isst_ops = mbox_get_platform_ops(); break; case 2: + case 3: isst_ops = tpmi_get_platform_ops(); break; default: diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 14c9b037859a..07ebd08f3202 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -172,12 +172,19 @@ static int print_package_info(struct isst_id *id, FILE *outf) int level = 1; if (out_format_is_json()) { - if (api_version() > 1) - snprintf(header, sizeof(header), "package-%d:die-%d:powerdomain-%d:cpu-%d", - id->pkg, id->die, id->punit, id->cpu); - else + if (api_version() > 1) { + if (id->cpu < 0) + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-None", + id->pkg, id->die, id->punit); + else + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-%d", + id->pkg, id->die, id->punit, id->cpu); + } else { snprintf(header, sizeof(header), "package-%d:die-%d:cpu-%d", id->pkg, id->die, id->cpu); + } format_and_print(outf, level, header, NULL); return 1; } @@ -189,7 +196,12 @@ static int print_package_info(struct isst_id *id, FILE *outf) snprintf(header, sizeof(header), "powerdomain-%d", id->punit); format_and_print(outf, level++, header, NULL); } - snprintf(header, sizeof(header), "cpu-%d", id->cpu); + + if (id->cpu < 0) + snprintf(header, sizeof(header), "cpu-None"); + else + snprintf(header, sizeof(header), "cpu-%d", id->cpu); + format_and_print(outf, level, header, NULL); return level; @@ -199,8 +211,8 @@ static void _isst_pbf_display_information(struct isst_id *id, FILE *outf, int le struct isst_pbf_info *pbf_info, int disp_level) { - char header[256]; - char value[512]; + static char header[256]; + static char value[1024]; snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); @@ -338,8 +350,8 @@ void isst_ctdp_display_core_info(struct isst_id *id, FILE *outf, char *prefix, void isst_ctdp_display_information(struct isst_id *id, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { - char header[256]; - char value[512]; + static char header[256]; + static char value[1024]; static int level; int trl_max_levels = isst_get_trl_max_levels(); int i; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 4bddd3c66bf7..39ee75677c2c 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -80,7 +80,7 @@ #define DISP_FREQ_MULTIPLIER 100 #define MAX_PACKAGE_COUNT 32 -#define MAX_DIE_PER_PACKAGE 2 +#define MAX_DIE_PER_PACKAGE 16 #define MAX_PUNIT_PER_DIE 8 /* Unified structure to specific a CPU or a Power Domain */ diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c792..2d6dce2c8f77 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -22,9 +24,30 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp turbostat Makefile turbostat.c turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include <linux\/bits.h>/#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9ed..8d37acd39201 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98256468e248..8cdf41906e98 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,6 +38,7 @@ #include <stdbool.h> #include <assert.h> #include <linux/kernel.h> +#include <linux/build_bug.h> #define UNUSED(x) (void)(x) @@ -58,15 +59,22 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; + +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -78,64 +86,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -216,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. + */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -299,6 +329,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -663,6 +696,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +955,10 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, @@ -979,8 +1031,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1022,6 +1075,7 @@ struct rapl_counter_info_t { /* struct rapl_counter_info_t for each RAPL domain */ struct rapl_counter_info_t *rapl_counter_info_perdomain; +unsigned int rapl_counter_info_perdomain_size; #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) @@ -1152,6 +1206,161 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, +}; + +struct cstate_counter_info_t { + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; +}; + +struct cstate_counter_info_t *ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; + int pkg_cstate_limit; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + .msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = "c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1181,7 +1390,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1214,7 +1423,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1357,36 +1566,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + while (*pp) { + p = *pp; - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + if (p->msr_num != 0) { + *pp = p->next; + + free(p); + ++num_freed; - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; + continue; + } + + pp = &p->next; } - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + return num_freed; +} + +/* + * Free all added counters accessed via msr. + */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); + + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1415,6 +1630,9 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_core_id; + int max_package_id; + int max_die_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -1529,23 +1747,12 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - BIC_CPU_c7 | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -1928,13 +2135,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1966,7 +2175,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -1987,7 +2196,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2023,7 +2232,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2388,7 +2597,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2498,6 +2708,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2970,7 +3182,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -2980,25 +3192,40 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; + + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); - return (snapshot_sysfs_counter(path) / 1000); + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3361,6 +3588,17 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3368,15 +3606,18 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci rc->scale = rci->scale[idx]; } -int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; - struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + struct rapl_counter_info_t *rci; if (debug) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); + assert(domain < rapl_counter_info_perdomain_size); + + rci = &rapl_counter_info_perdomain[domain]; /* * If we have any perf counters to read, read them all now, in bulk @@ -3432,7 +3673,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -3444,6 +3685,154 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. + */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; + + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); + } + + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); + } + + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. + */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return -13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. + */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3499,13 +3888,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3519,31 +3906,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { - if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { - if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } - - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { - if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) - return -8; - else if (t->is_atom) { - /* - * For Atom CPUs that has core cstate deeper than c6, - * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. - * Minus CC7 (and deeper cstates) residency to get - * accturate cc6 residency. - */ - c->c6 -= c->c7; - } + if (DO_BIC(BIC_CPU_c7) && t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; } if (DO_BIC(BIC_Mod_c6)) @@ -3560,7 +3930,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) get_core_throt_cnt(cpu, &c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &c->counter[i])) + if (get_mp(cpu, mp, &c->counter[i], mp->sp->path)) return -10; } @@ -3584,34 +3954,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0)) return -13; } - if (DO_BIC(BIC_Pkgpc3)) - if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) - return -9; - if (DO_BIC(BIC_Pkgpc6)) { - if (platform->has_msr_atom_pkg_c6_residency) { - if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } else { - if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } - } - - if (DO_BIC(BIC_Pkgpc2)) - if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) - return -11; - if (DO_BIC(BIC_Pkgpc7)) - if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) - return -12; - if (DO_BIC(BIC_Pkgpc8)) - if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8)) - return -13; - if (DO_BIC(BIC_Pkgpc9)) - if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9)) - return -13; - if (DO_BIC(BIC_Pkgpc10)) - if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10)) - return -13; if (DO_BIC(BIC_CPU_LPI)) p->cpu_lpi = cpuidle_cur_cpu_lpi_us; @@ -3630,9 +3972,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -3653,7 +3994,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &p->counter[i])) + char *path = NULL; + + if (mp->msr_num == 0) { + path = find_sysfs_path_by_id(mp->sp, p->package_id); + if (path == NULL) { + warnx("%s: package_id %d not found", __func__, p->package_id); + return -10; + } + } + if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } done: @@ -3662,31 +4012,8 @@ done: return 0; } -/* - * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: - * If you change the values, note they are used both in comparisons - * (>= PCL__7) and to index pkg_cstate_limit_strings[]. - */ - -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ - int pkg_cstate_limit = PCLUKN; -char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", +char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; @@ -4175,12 +4502,32 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_cstate(void) +{ + if (!ccstate_counter_info) + return; + + const int counter_info_num = ccstate_counter_info_size; + + for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { + if (ccstate_counter_info[counter_id].fd_perf_core != -1) + close(ccstate_counter_info[counter_id].fd_perf_core); + + if (ccstate_counter_info[counter_id].fd_perf_pkg != -1) + close(ccstate_counter_info[counter_id].fd_perf_pkg); + } + + free(ccstate_counter_info); + ccstate_counter_info = NULL; + ccstate_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) return; - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const int num_domains = rapl_counter_info_perdomain_size; for (int domain_id = 0; domain_id < num_domains; ++domain_id) { if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) @@ -4188,6 +4535,8 @@ void free_fd_rapl_percpu(void) } free(rapl_counter_info_perdomain); + rapl_counter_info_perdomain = NULL; + rapl_counter_info_perdomain_size = 0; } void free_all_buffers(void) @@ -4234,6 +4583,7 @@ void free_all_buffers(void) free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free_fd_rapl_percpu(); + free_fd_cstate(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4569,6 +4919,7 @@ static void update_effective_set(bool startup) void linux_perf_init(void); void rapl_perf_init(void); +void cstate_perf_init(void); void re_initialize(void) { @@ -4576,6 +4927,7 @@ void re_initialize(void) setup_all_buffers(false); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5294,30 +5646,27 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void probe_intel_uncore_frequency(void) +static void probe_intel_uncore_frequency_legacy(void) { int i, j; char path[256]; - if (!genuine_intel) - return; - - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - goto probe_cluster; - - BIC_PRESENT(BIC_UNCORE_MHZ); - - if (quiet) - return; - for (i = 0; i < topo.num_packages; ++i) { - for (j = 0; j < topo.num_die; ++j) { + for (j = 0; j <= topo.max_die_id; ++j) { int k, l; char path_base[128]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); + if (access(path_base, R_OK)) + continue; + + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5335,24 +5684,39 @@ static void probe_intel_uncore_frequency(void) fprintf(outf, " %d MHz\n", k / 1000); } } - return; +} + +static void probe_intel_uncore_frequency_cluster(void) +{ + int i, uncore_max_id; + char path[256]; + char path_base[128]; -probe_cluster: if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; if (quiet) return; - for (i = 0;; ++i) { + for (uncore_max_id = 0;; ++uncore_max_id) { + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); + + /* uncore## start at 00 and skips no numbers, so stop upon first missing */ + if (access(path_base, R_OK)) { + uncore_max_id -= 1; + break; + } + } + for (i = uncore_max_id; i >= 0; --i) { int k, l; - char path_base[128]; int package_id, domain_id, cluster_id; + char name_buf[16]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); if (access(path_base, R_OK)) - break; + err(1, "%s: %s\n", __func__, path_base); sprintf(path, "%s/package_id", path_base); package_id = read_sysfs_int(path); @@ -5379,9 +5743,25 @@ probe_cluster: sprintf(path, "%s/current_freq_khz", path_base); k = read_sysfs_int(path); fprintf(outf, " %d MHz\n", k / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); } } +static void probe_intel_uncore_frequency(void) +{ + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0) + probe_intel_uncore_frequency_cluster(); + else + probe_intel_uncore_frequency_legacy(); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ @@ -5466,7 +5846,6 @@ next: else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) @@ -6405,7 +6784,8 @@ bool is_aperf_access_required(void) return BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC); + || BIC_IS_ENABLED(BIC_IPC) + || BIC_IS_ENABLED(BIC_CPU_c1); } int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, @@ -6478,17 +6858,18 @@ void linux_perf_init(void) void rapl_perf_init(void) { - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1; bool *domain_visited = calloc(num_domains, sizeof(bool)); rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); if (rapl_counter_info_perdomain == NULL) err(-1, "calloc rapl_counter_info_percpu"); + rapl_counter_info_perdomain_size = num_domains; /* * Initialize rapl_counter_info_percpu */ - for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; rci->fd_perf = -1; @@ -6508,7 +6889,7 @@ void rapl_perf_init(void) bool has_counter = 0; double scale; enum rapl_unit unit; - int next_domain; + unsigned int next_domain; memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); @@ -6521,6 +6902,8 @@ void rapl_perf_init(void) next_domain = platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + assert(next_domain < num_domains); + if (domain_visited[next_domain]) continue; @@ -6634,42 +7017,160 @@ static int has_amperf_access(void) return 0; } -void probe_cstates(void) +int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) { - probe_cst_limit(); + if (strcmp(group_name, "cstate_core") == 0) + return &cci->fd_perf_core; - if (platform->supported_cstates & CC1) - BIC_PRESENT(BIC_CPU_c1); + if (strcmp(group_name, "cstate_pkg") == 0) + return &cci->fd_perf_pkg; - if (platform->supported_cstates & CC3) - BIC_PRESENT(BIC_CPU_c3); + return NULL; +} - if (platform->supported_cstates & CC6) - BIC_PRESENT(BIC_CPU_c6); +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + if (no_perf) + return -1; - if (platform->supported_cstates & CC7) - BIC_PRESENT(BIC_CPU_c7); + int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); + if (pfd_group == NULL) + return -1; - if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) - BIC_PRESENT(BIC_Pkgpc3); + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); - if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) - BIC_PRESENT(BIC_Pkgpc6); + const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); - if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); + if (fd_counter == -1) + return -1; - if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) - BIC_PRESENT(BIC_Pkgpc8); + /* If it's the first counter opened, make it a group descriptor */ + if (*pfd_group == -1) + *pfd_group = fd_counter; - if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) - BIC_PRESENT(BIC_Pkgpc9); + return fd_counter; +} - if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) - BIC_PRESENT(BIC_Pkgpc10); +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + int ret = add_cstate_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + +void cstate_perf_init_(bool soft_c1) +{ + bool has_counter; + bool *cores_visited = NULL, *pkg_visited = NULL; + const int cores_visited_elems = topo.max_core_id + 1; + const int pkg_visited_elems = topo.max_package_id + 1; + const int cci_num = topo.max_cpu_num + 1; + + ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); + if (!ccstate_counter_info) + err(1, "calloc ccstate_counter_arch_info"); + ccstate_counter_info_size = cci_num; + + cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited)); + if (!cores_visited) + err(1, "calloc cores_visited"); + + pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited)); + if (!pkg_visited) + err(1, "calloc pkg_visited"); + + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) { + ccstate_counter_info[cpu].fd_perf_core = -1; + ccstate_counter_info[cpu].fd_perf_pkg = -1; + } + + for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) { + has_counter = false; + memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited)); + + const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; + + for (int cpu = 0; cpu < cci_num; ++cpu) { + + struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + const int core_id = cpus[cpu].physical_core_id; + const int pkg_id = cpus[cpu].physical_package_id; + + assert(core_id < cores_visited_elems); + assert(pkg_id < pkg_visited_elems); + + const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE; + + if (!per_thread && cores_visited[core_id]) + continue; + + if (!per_core && pkg_visited[pkg_id]) + continue; + + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_supported = (platform->supported_cstates & cai->feature_mask); + + if (counter_needed && counter_supported) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { + + cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit + && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + } + } + + if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + has_counter = true; + cores_visited[core_id] = true; + pkg_visited[pkg_id] = true; + } + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(cores_visited); + free(pkg_visited); +} + +void cstate_perf_init(void) +{ + /* + * If we don't have a C1 residency MSR, we calculate it "in software", + * but we need APERF, MPERF too. + */ + const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() + && platform->supported_cstates & CC1; + + if (soft_c1) + BIC_PRESENT(BIC_CPU_c1); + + cstate_perf_init_(soft_c1); +} + +void probe_cstates(void) +{ + probe_cst_limit(); if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); @@ -6927,6 +7428,22 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); } +static void counter_info_init(void) +{ + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) { + struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; + + if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) + cai->msr = MSR_KNL_CORE_C6_RESIDENCY; + + if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) + cai->msr = 0; + + if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) + cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; + } +} + void probe_pm_features(void) { probe_pstates(); @@ -6967,7 +7484,6 @@ void topology_probe(bool startup) int i; int max_core_id = 0; int max_package_id = 0; - int max_die_id = 0; int max_siblings = 0; /* Initialize num_cpus, max_cpu_num */ @@ -7084,8 +7600,8 @@ void topology_probe(bool startup) /* get die information */ cpus[i].die_id = get_die_id(i); - if (cpus[i].die_id > max_die_id) - max_die_id = cpus[i].die_id; + if (cpus[i].die_id > topo.max_die_id) + topo.max_die_id = cpus[i].die_id; /* get numa node information */ cpus[i].physical_node_id = get_physical_node_id(&cpus[i]); @@ -7104,6 +7620,8 @@ void topology_probe(bool startup) if (cpus[i].thread_id == 0) topo.num_cores++; } + topo.max_core_id = max_core_id; + topo.max_package_id = max_package_id; topo.cores_per_node = max_core_id + 1; if (debug > 1) @@ -7111,9 +7629,9 @@ void topology_probe(bool startup) if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); - topo.num_die = max_die_id + 1; + topo.num_die = topo.max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); @@ -7403,10 +7921,12 @@ void turbostat_init() check_msr_access(); check_perf_access(); process_cpuid(); + counter_info_init(); probe_pm_features(); set_amperf_source(); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -7497,7 +8017,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -7523,61 +8043,114 @@ void print_bootcmd(void) fclose(fp); } +struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) +{ + struct msr_counter *mp; + + for (mp = head; mp; mp = mp->next) { + if (debug) + printf("%s: %s %s\n", __func__, name, mp->name); + if (!strncmp(name, mp->name, strlen(mp->name))) + return mp; + } + return NULL; +} + int add_counter(unsigned int msr_num, char *path, char *name, unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; if (no_msr && msr_num) errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); - msrp = calloc(1, sizeof(struct msr_counter)); - if (msrp == NULL) { - perror("calloc"); - exit(1); - } - - msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES - 1); - if (path) - strncpy(msrp->path, path, PATH_BYTES - 1); - msrp->width = width; - msrp->type = type; - msrp->format = format; - msrp->flags = flags; + if (debug) + printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, + path, name, width, scope, type, format, flags, id); switch (scope) { case SCOPE_CPU: - msrp->next = sys.tp; - sys.tp = msrp; - sys.added_thread_counters++; - if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.tp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter %s", name); + return -1; } break; - case SCOPE_CORE: - msrp->next = sys.cp; - sys.cp = msrp; - sys.added_core_counters++; - if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.cp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter %s", name); + return -1; } break; - case SCOPE_PACKAGE: - msrp->next = sys.pp; - sys.pp = msrp; - sys.added_package_counters++; - if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.pp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter %s", name); + return -1; } break; + default: + warnx("ignoring counter %s with unknown scope", name); + return -1; + } + + if (msrp == NULL) { + msrp = calloc(1, sizeof(struct msr_counter)); + if (msrp == NULL) + err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; + strncpy(msrp->name, name, NAME_BYTES - 1); + msrp->width = width; + msrp->type = type; + msrp->format = format; + msrp->flags = flags; + + switch (scope) { + case SCOPE_CPU: + msrp->next = sys.tp; + sys.tp = msrp; + break; + case SCOPE_CORE: + msrp->next = sys.cp; + sys.cp = msrp; + break; + case SCOPE_PACKAGE: + msrp->next = sys.pp; + sys.pp = msrp; + break; + } + } + + if (path) { + struct sysfs_path *sp; + + sp = calloc(1, sizeof(struct sysfs_path)); + if (sp == NULL) { + perror("calloc"); + exit(1); + } + strncpy(sp->path, path, PATH_BYTES - 1); + sp->id = id; + sp->next = msrp->sp; + msrp->sp = sp; } return 0; @@ -7679,7 +8252,7 @@ next: sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0)) + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) fail++; if (fail) { @@ -7744,7 +8317,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); } for (state = 10; state >= 0; --state) { @@ -7772,7 +8345,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0); } } diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph new file mode 100755 index 000000000000..57d78f6df041 --- /dev/null +++ b/tools/sound/dapm-graph @@ -0,0 +1,303 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate a graph of the current DAPM state for an audio card +# +# Copyright 2024 Bootlin +# Author: Luca Ceresoli <luca.ceresol@bootlin.com> + +set -eu + +STYLE_NODE_ON="shape=box,style=bold,color=green4" +STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" + +# Print usage and exit +# +# $1 = exit return value +# $2 = error string (required if $1 != 0) +usage() +{ + if [ "${1}" -ne 0 ]; then + echo "${2}" >&2 + fi + + echo " +Generate a graph of the current DAPM state for an audio card. + +The DAPM state can be obtained via debugfs for a card on the local host or +a remote target, or from a local copy of the debugfs tree for the card. + +Usage: + $(basename $0) [options] -c CARD - Local sound card + $(basename $0) [options] -c CARD -r REMOTE_TARGET - Card on remote system + $(basename $0) [options] -d STATE_DIR - Local directory + +Options: + -c CARD Sound card to get DAPM state of + -r REMOTE_TARGET Get DAPM state from REMOTE_TARGET via SSH and SCP + instead of using a local sound card + -d STATE_DIR Get DAPM state from a local copy of a debugfs tree + -o OUT_FILE Output file (default: dapm.dot) + -D Show verbose debugging info + -h Print this help and exit + +The output format is implied by the extension of OUT_FILE: + + * Use the .dot extension to generate a text graph representation in + graphviz dot syntax. + * Any other extension is assumed to be a format supported by graphviz for + rendering, e.g. 'png', 'svg', and will produce both the .dot file and a + picture from it. This requires the 'dot' program from the graphviz + package. +" + + exit ${1} +} + +# Connect to a remote target via SSH, collect all DAPM files from debufs +# into a tarball and get the tarball via SCP into $3/dapm.tar +# +# $1 = target as used by ssh and scp, e.g. "root@192.168.1.1" +# $2 = sound card name +# $3 = temp dir path (present on the host, created on the target) +# $4 = local directory to extract the tarball into +# +# Requires an ssh+scp server, find and tar+gz on the target +# +# Note: the tarball is needed because plain 'scp -r' from debugfs would +# copy only empty files +grab_remote_files() +{ + echo "Collecting DAPM state from ${1}" + dbg_echo "Collected DAPM state in ${3}" + + ssh "${1}" " +set -eu && +cd \"/sys/kernel/debug/asoc/${2}\" && +find * -type d -exec mkdir -p ${3}/dapm-tree/{} \; && +find * -type f -exec cp \"{}\" \"${3}/dapm-tree/{}\" \; && +cd ${3}/dapm-tree && +tar cf ${3}/dapm.tar ." + scp -q "${1}:${3}/dapm.tar" "${3}" + + mkdir -p "${4}" + tar xf "${tmp_dir}/dapm.tar" -C "${4}" +} + +# Parse a widget file and generate graph description in graphviz dot format +# +# Skips any file named "bias_level". +# +# $1 = temporary work dir +# $2 = component name +# $3 = widget filename +process_dapm_widget() +{ + local tmp_dir="${1}" + local c_name="${2}" + local w_file="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + local w_name="$(basename "${w_file}")" + local w_tag="${c_name}_${w_name}" + + if [ "${w_name}" = "bias_level" ]; then + return 0 + fi + + dbg_echo " + Widget: ${w_name}" + + cat "${w_file}" | ( + read line + + if echo "${line}" | grep -q ': On ' + then local node_style="${STYLE_NODE_ON}" + else local node_style="${STYLE_NODE_OFF}" + fi + + local w_type="" + while read line; do + # Collect widget type if present + if echo "${line}" | grep -q '^widget-type '; then + local w_type_raw="$(echo "$line" | cut -d ' ' -f 2)" + dbg_echo " - Widget type: ${w_type_raw}" + + # Note: escaping '\n' is tricky to get working with both + # bash and busybox ash, so use a '%' here and replace it + # later + local w_type="%n[${w_type_raw}]" + fi + + # Collect any links. We could use "in" links or "out" links, + # let's use "in" links + if echo "${line}" | grep -q '^in '; then + local w_src=$(echo "$line" | + awk -F\" '{print $6 "_" $4}' | + sed 's/^(null)_/ROOT_/') + dbg_echo " - Input route from: ${w_src}" + echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + fi + done + + echo " \"${w_tag}\" [label=\"${w_name}${w_type}\",${node_style}]" | + tr '%' '\\' >> "${dot_file}" + ) +} + +# Parse the DAPM tree for a sound card component and generate graph +# description in graphviz dot format +# +# $1 = temporary work dir +# $2 = component directory +# $3 = forced component name (extracted for path if empty) +process_dapm_component() +{ + local tmp_dir="${1}" + local c_dir="${2}" + local c_name="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + if [ -z "${c_name}" ]; then + # Extract directory name into component name: + # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" + c_name="$(basename $(dirname "${c_dir}"))" + fi + + dbg_echo " * Component: ${c_name}" + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " color=dodgerblue" >> "${dot_file}" + + # Create empty file to ensure it will exist in all cases + >"${links_file}" + + # Iterate over widgets in the component dir + for w_file in ${c_dir}/*; do + process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" + done + + echo " }" >> "${dot_file}" + + cat "${links_file}" >> "${dot_file}" +} + +# Parse the DAPM tree for a sound card and generate graph description in +# graphviz dot format +# +# $1 = temporary work dir +# $2 = directory tree with DAPM state (either in debugfs or a mirror) +process_dapm_tree() +{ + local tmp_dir="${1}" + local dapm_dir="${2}" + local dot_file="${tmp_dir}/main.dot" + + echo "digraph G {" > "${dot_file}" + echo " fontname=\"sans-serif\"" >> "${dot_file}" + echo " node [fontname=\"sans-serif\"]" >> "${dot_file}" + + + # Process root directory (no component) + process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" + + # Iterate over components + for c_dir in "${dapm_dir}"/*/dapm + do + process_dapm_component "${tmp_dir}" "${c_dir}" "" + done + + echo "}" >> "${dot_file}" +} + +main() +{ + # Parse command line + local out_file="dapm.dot" + local card_name="" + local remote_target="" + local dapm_tree="" + local dbg_on="" + while getopts "c:r:d:o:Dh" arg; do + case $arg in + c) card_name="${OPTARG}" ;; + r) remote_target="${OPTARG}" ;; + d) dapm_tree="${OPTARG}" ;; + o) out_file="${OPTARG}" ;; + D) dbg_on="1" ;; + h) usage 0 ;; + *) usage 1 ;; + esac + done + shift $(($OPTIND - 1)) + + if [ -n "${dapm_tree}" ]; then + if [ -n "${card_name}${remote_target}" ]; then + usage 1 "Cannot use -c and -r with -d" + fi + echo "Using local tree: ${dapm_tree}" + elif [ -n "${remote_target}" ]; then + if [ -z "${card_name}" ]; then + usage 1 "-r requires -c" + fi + echo "Using card ${card_name} from remote target ${remote_target}" + elif [ -n "${card_name}" ]; then + echo "Using local card: ${card_name}" + else + usage 1 "Please choose mode using -c, -r or -d" + fi + + # Define logging function + if [ "${dbg_on}" ]; then + dbg_echo() { + echo "$*" >&2 + } + else + dbg_echo() { + : + } + fi + + # Filename must have a dot in order the infer the format from the + # extension + if ! echo "${out_file}" | grep -qE '\.'; then + echo "Missing extension in output filename ${out_file}" >&2 + usage + exit 1 + fi + + local out_fmt="${out_file##*.}" + local dot_file="${out_file%.*}.dot" + + dbg_echo "dot file: $dot_file" + dbg_echo "Output file: $out_file" + dbg_echo "Output format: $out_fmt" + + tmp_dir="$(mktemp -d /tmp/$(basename $0).XXXXXX)" + trap "{ rm -fr ${tmp_dir}; }" INT TERM EXIT + + if [ -z "${dapm_tree}" ] + then + dapm_tree="/sys/kernel/debug/asoc/${card_name}" + fi + if [ -n "${remote_target}" ]; then + dapm_tree="${tmp_dir}/dapm-tree" + grab_remote_files "${remote_target}" "${card_name}" "${tmp_dir}" "${dapm_tree}" + fi + # In all cases now ${dapm_tree} contains the DAPM state + + process_dapm_tree "${tmp_dir}" "${dapm_tree}" + cp "${tmp_dir}/main.dot" "${dot_file}" + + if [ "${out_file}" != "${dot_file}" ]; then + dot -T"${out_fmt}" "${dot_file}" -o "${out_file}" + fi + + echo "Generated file ${out_file}" +} + +main "${@}" diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 61c69297e797..3482248aa344 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1001,6 +1001,7 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct range pmem_range = { .start = cxlds->pmem_res.start, .end = cxlds->pmem_res.end, @@ -1020,6 +1021,12 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) dpa_perf_setup(port, &pmem_range, &mds->pmem_perf); cxl_memdev_update_perf(cxlmd); + + /* + * This function is here to only test the topology iterator. It serves + * no other purpose. + */ + cxl_endpoint_get_perf_coordinates(port, ep_c); } static struct cxl_mock_ops cxl_mock_ops = { diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 35ee41e435ab..6584443144de 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -127,7 +127,7 @@ static struct { #define CXL_TEST_EVENT_CNT_MAX 15 /* Set a number of events to return at a time for simulation. */ -#define CXL_TEST_EVENT_CNT 3 +#define CXL_TEST_EVENT_RET_MAX 4 struct mock_event_log { u16 clear_idx; @@ -222,6 +222,12 @@ static void mes_add_event(struct mock_event_store *mes, log->nr_events++; } +/* + * Vary the number of events returned to simulate events occuring while the + * logs are being read. + */ +static int ret_limit = 0; + static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; @@ -233,14 +239,18 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) if (cmd->size_in != sizeof(log_type)) return -EINVAL; - if (cmd->size_out < struct_size(pl, records, CXL_TEST_EVENT_CNT)) + ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; + if (!ret_limit) + ret_limit = 1; + + if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; log_type = *((u8 *)cmd->payload_in); if (log_type >= CXL_EVENT_TYPE_MAX) return -EINVAL; - memset(cmd->payload_out, 0, cmd->size_out); + memset(cmd->payload_out, 0, struct_size(pl, records, 0)); log = event_find_log(dev, log_type); if (!log || event_log_empty(log)) @@ -248,7 +258,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) pl = cmd->payload_out; - for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) { + for (i = 0; i < ret_limit && !event_log_empty(log); i++) { memcpy(&pl->records[i], event_get_current(log), sizeof(pl->records[i])); pl->records[i].event.generic.hdr.handle = @@ -256,6 +266,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) log->cur_idx++; } + cmd->size_out = struct_size(pl, records, i); pl->record_count = cpu_to_le16(i); if (!event_log_empty(log)) pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS; diff --git a/tools/testing/kunit/qemu_configs/riscv.py b/tools/testing/kunit/qemu_configs/riscv.py index 12a1d525978a..c87758030ff7 100644 --- a/tools/testing/kunit/qemu_configs/riscv.py +++ b/tools/testing/kunit/qemu_configs/riscv.py @@ -13,7 +13,7 @@ if not os.path.isfile(OPENSBI_PATH): QEMU_ARCH = QemuArchParams(linux_arch='riscv', kconfig=''' -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index b8419f460368..b438f3d053ee 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -13,6 +13,8 @@ #include <nd-core.h> #include <linux/printk.h> #include <linux/seq_buf.h> +#include <linux/papr_scm.h> +#include <uapi/linux/papr_pdsm.h> #include "../watermark.h" #include "nfit_test.h" @@ -830,12 +832,11 @@ static int ndtest_bus_register(struct ndtest_priv *p) return 0; } -static int ndtest_remove(struct platform_device *pdev) +static void ndtest_remove(struct platform_device *pdev) { struct ndtest_priv *p = to_ndtest_priv(&pdev->dev); nvdimm_bus_unregister(p->bus); - return 0; } static int ndtest_probe(struct platform_device *pdev) @@ -882,7 +883,7 @@ static const struct platform_device_id ndtest_id[] = { static struct platform_driver ndtest_driver = { .probe = ndtest_probe, - .remove = ndtest_remove, + .remove_new = ndtest_remove, .driver = { .name = KBUILD_MODNAME, }, diff --git a/tools/testing/nvdimm/test/ndtest.h b/tools/testing/nvdimm/test/ndtest.h index 2c54c9cbb90c..8f27ad6f7319 100644 --- a/tools/testing/nvdimm/test/ndtest.h +++ b/tools/testing/nvdimm/test/ndtest.h @@ -5,37 +5,6 @@ #include <linux/platform_device.h> #include <linux/libnvdimm.h> -/* SCM device is unable to persist memory contents */ -#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) -/* SCM device failed to persist memory contents */ -#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) -/* SCM device contents are not persisted from previous IPL */ -#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) -#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) -/* SCM device will be garded off next IPL due to failure */ -#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) -/* SCM contents cannot persist due to current platform health status */ -#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) - -/* Bits status indicators for health bitmap indicating unarmed dimm */ -#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_FAILED (1ULL << (63 - 10)) - -/* Bits status indicators for health bitmap indicating unflushed dimm */ -#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) - -/* Bits status indicators for health bitmap indicating unrestored dimm */ -#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) - -/* Bit status indicators for smart event notification */ -#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ - PAPR_PMEM_HEALTH_FATAL | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_MASK (PAPR_PMEM_SAVE_FAILED) - struct ndtest_config; struct ndtest_priv { diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index c5c9d05f29da..c0a2bb785b92 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -18,6 +18,8 @@ #define pr_info printk #define pr_debug printk #define pr_cont printk +#define schedule() +#define PAGE_SHIFT 12 #define __acquires(x) #define __releases(x) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e1504833654d..9039f3709aff 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,8 +17,10 @@ TARGETS += devices TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice +TARGETS += drivers/net TARGETS += drivers/net/bonding TARGETS += drivers/net/team +TARGETS += drivers/net/virtio_net TARGETS += dt TARGETS += efivarfs TARGETS += exec @@ -63,7 +65,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao -TARGETS += netfilter +TARGETS += net/netfilter TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd @@ -116,6 +118,13 @@ TARGETS += zram TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug +# Networking tests want the net/lib target, include it automatically +ifneq ($(filter net drivers/net drivers/net/hw,$(TARGETS)),) +ifeq ($(filter net/lib,$(TARGETS)),) + INSTALL_DEP_TARGETS := net/lib +endif +endif + # User can optionally provide a TARGETS skiplist. By default we skip # BPF since it has cutting edge build time dependencies which require # more effort to install. @@ -245,7 +254,7 @@ ifdef INSTALL_PATH install -m 744 run_kselftest.sh $(INSTALL_PATH)/ rm -f $(TEST_LIST) @ret=1; \ - for TARGET in $(TARGETS); do \ + for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \ INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \ diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c index 89e3656a042d..e2b3a5810f47 100644 --- a/tools/testing/selftests/alsa/conf.c +++ b/tools/testing/selftests/alsa/conf.c @@ -105,7 +105,7 @@ static struct card_cfg_data *conf_data_by_card(int card, bool msg) return NULL; } -static int dump_config_tree(snd_config_t *top) +static void dump_config_tree(snd_config_t *top) { snd_output_t *out; int err; diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 02ee3a91b780..285c47dd42f6 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -262,7 +262,7 @@ static int write_clone_read(void) int main(int argc, char **argv) { - int ret, i; + int ret; putstr("TAP version 13\n"); putstr("1.."); diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c index 5701163460ef..955f87c1170d 100644 --- a/tools/testing/selftests/arm64/tags/tags_test.c +++ b/tools/testing/selftests/arm64/tags/tags_test.c @@ -6,6 +6,7 @@ #include <stdint.h> #include <sys/prctl.h> #include <sys/utsname.h> +#include "../../kselftest.h" #define SHIFT_TAG(tag) ((uint64_t)(tag) << 56) #define SET_TAG(ptr, tag) (((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \ @@ -21,6 +22,9 @@ int main(void) if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0) == 0) tbi_enabled = 1; ptr = (struct utsname *)malloc(sizeof(*ptr)); + if (!ptr) + ksft_exit_fail_msg("Failed to allocate utsname buffer\n"); + if (tbi_enabled) tag = 0x42; ptr = (struct utsname *)SET_TAG(ptr, tag); diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1aebabfb017..5025401323af 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -17,7 +17,6 @@ test_dev_cgroup test_verifier_log feature test_sock -test_sock_addr urandom_read test_sockmap test_lirc_mode2_user diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index d8ade15e2789..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,5 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -verifier_arena # JIT does not support arena -arena_htab # JIT does not support arena diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index f4a2f66a683d..c34adf39eeb2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -6,3 +6,4 @@ stacktrace_build_id # compare_map_keys stackid_hmap vs. sta verifier_iterating_callbacks verifier_arena # JIT does not support arena arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d6343..e0b3887b3d2d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -53,6 +53,7 @@ progs/syscall.c-CFLAGS := -fno-strict-aliasing progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang @@ -81,11 +82,24 @@ TEST_INST_SUBDIRS += bpf_gcc # The following tests contain C code that, although technically legal, # triggers GCC warnings that cannot be disabled: declaration of # anonymous struct types in function parameter lists. -progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_bitfields.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +# The following tests do type-punning, via the __imm_insn macro, from +# `struct bpf_insn' to long and then uses the value. This triggers an +# "is used uninitialized" warning in GCC due to strict-aliasing +# rules. +progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing endif ifneq ($(CLANG_CPUV4),) @@ -102,8 +116,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_offload.py \ - test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ @@ -128,7 +140,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -136,18 +148,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi -# Emit succinct information message describing current building step -# $1 - generic step name (e.g., CC, LINK, etc); -# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; -# $3 - target (assumed to be file); only file name will be emitted; -# $4 - optional extra arg, emitted as-is, if provided. -ifeq ($(V),1) -Q = -msg = -else -Q = @ -msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; -MAKEFLAGS += --no-print-directory +ifneq ($(V),1) submake_extras := feature_display=0 endif @@ -274,7 +275,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf \ + BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)' \ EXTRA_LDFLAGS='$(SAN_LDFLAGS)' && \ @@ -290,11 +291,11 @@ UNPRIV_HELPERS := $(OUTPUT)/unpriv_helpers.o TRACE_HELPERS := $(OUTPUT)/trace_helpers.o JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o +NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) @@ -308,6 +309,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS) BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ @@ -442,7 +444,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -455,7 +457,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ @@ -481,7 +483,7 @@ LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(ske # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 @@ -509,7 +511,7 @@ endef # Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and # set up by DEFINE_TEST_RUNNER itself, create test runner build rules with: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES ifeq ($($(TRUNNER_OUTPUT)-dir),) @@ -532,7 +534,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ - $$($$<-CFLAGS)) + $$($$<-CFLAGS) \ + $$($$<-$2-CFLAGS)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) @@ -658,7 +661,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs-cpuv4 test runner. ifneq ($(CLANG_CPUV4),) TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4)) endif @@ -695,7 +698,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Include find_bit.c to compile xskxceiver. EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c -$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ @@ -729,6 +732,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h +$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -748,6 +752,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_hashmap_lookup.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ + $(OUTPUT)/bench_bpf_crypto.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ @@ -759,7 +764,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index b2b4c391eb0a..627b74ae041b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -280,6 +280,8 @@ extern struct argp bench_strncmp_argp; extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; +extern struct argp bench_trigger_batch_argp; +extern struct argp bench_crypto_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -292,6 +294,8 @@ static const struct argp_child bench_parsers[] = { { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, + { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, + { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, {}, }; @@ -491,24 +495,31 @@ extern const struct bench bench_rename_kretprobe; extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -extern const struct bench bench_trig_base; -extern const struct bench bench_trig_tp; -extern const struct bench bench_trig_rawtp; + +/* pure counting benchmarks to establish theoretical lmits */ +extern const struct bench bench_trig_usermode_count; +extern const struct bench bench_trig_syscall_count; +extern const struct bench bench_trig_kernel_count; + +/* batched, staying mostly in-kernel benchmarks */ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; -extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; -extern const struct bench bench_trig_uprobe_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; + +/* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; + extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -529,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_crypto_encrypt; +extern const struct bench bench_crypto_decrypt; static const struct bench *benchs[] = { &bench_count_global, @@ -539,24 +552,28 @@ static const struct bench *benchs[] = { &bench_rename_rawtp, &bench_rename_fentry, &bench_rename_fexit, - &bench_trig_base, - &bench_trig_tp, - &bench_trig_rawtp, + /* pure counting benchmarks for establishing theoretical limits */ + &bench_trig_usermode_count, + &bench_trig_kernel_count, + &bench_trig_syscall_count, + /* batched, staying mostly in-kernel triggers */ &bench_trig_kprobe, &bench_trig_kretprobe, &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, - &bench_trig_fentry_sleep, &bench_trig_fmodret, - &bench_trig_uprobe_base, + &bench_trig_tp, + &bench_trig_rawtp, + /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, &bench_trig_uprobe_push, &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, @@ -577,6 +594,8 @@ static const struct bench *benchs[] = { &bench_bpf_hashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_crypto_encrypt, + &bench_crypto_decrypt, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c new file mode 100644 index 000000000000..2845edaba8db --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include <argp.h> +#include "bench.h" +#include "crypto_bench.skel.h" + +#define MAX_CIPHER_LEN 32 +static char *input; +static struct crypto_ctx { + struct crypto_bench *skel; + int pfd; +} ctx; + +static struct crypto_args { + u32 crypto_len; + char *crypto_cipher; +} args = { + .crypto_len = 16, + .crypto_cipher = "ecb(aes)", +}; + +enum { + ARG_CRYPTO_LEN = 5000, + ARG_CRYPTO_CIPHER = 5001, +}; + +static const struct argp_option opts[] = { + { "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0, + "Set the length of crypto buffer" }, + { "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0, + "Set the cipher to use (default:ecb(aes))" }, + {}, +}; + +static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_CRYPTO_LEN: + args.crypto_len = strtoul(arg, NULL, 10); + if (!args.crypto_len || + args.crypto_len > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n", + sizeof(ctx.skel->bss->dst)); + argp_usage(state); + } + break; + case ARG_CRYPTO_CIPHER: + args.crypto_cipher = strdup(arg); + if (!strlen(args.crypto_cipher) || + strlen(args.crypto_cipher) > MAX_CIPHER_LEN) { + fprintf(stderr, "Invalid crypto cipher len (limit %d)\n", + MAX_CIPHER_LEN); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_crypto_argp = { + .options = opts, + .parser = crypto_parse_arg, +}; + +static void crypto_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n"); + exit(1); + } +} + +static void crypto_setup(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + + int err, pfd; + size_t i, sz; + + sz = args.crypto_len; + if (!sz || sz > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n", + sz, sizeof(ctx.skel->bss->dst)); + exit(1); + } + + setup_libbpf(); + + ctx.skel = crypto_bench__open(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher); + memcpy(ctx.skel->bss->key, "12345678testtest", 16); + ctx.skel->bss->key_len = 16; + ctx.skel->bss->authsize = 0; + + srandom(time(NULL)); + input = malloc(sz); + for (i = 0; i < sz - 1; i++) + input[i] = '1' + random() % 9; + input[sz - 1] = '\0'; + + ctx.skel->rodata->len = args.crypto_len; + + err = crypto_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + pfd = bpf_program__fd(ctx.skel->progs.crypto_setup); + if (pfd < 0) { + fprintf(stderr, "failed to get fd for setup prog\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + err = bpf_prog_test_run_opts(pfd, &opts); + if (err || ctx.skel->bss->status) { + fprintf(stderr, "failed to run setup prog: err %d, status %d\n", + err, ctx.skel->bss->status); + crypto_bench__destroy(ctx.skel); + exit(1); + } +} + +static void crypto_encrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt); +} + +static void crypto_decrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt); +} + +static void crypto_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void *crypto_producer(void *unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .repeat = 64, + .data_in = input, + .data_size_in = args.crypto_len, + ); + + while (true) + (void)bpf_prog_test_run_opts(ctx.pfd, &opts); + return NULL; +} + +const struct bench bench_crypto_encrypt = { + .name = "crypto-encrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_encrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_crypto_decrypt = { + .name = "crypto-decrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_decrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c index b36de42ee4d9..e2ff8ea1cb79 100644 --- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c @@ -186,7 +186,7 @@ static void *task_producer(void *input) for (i = 0; i < batch_sz; i++) { if (!pthd_results[i]) - pthread_join(pthds[i], NULL);; + pthread_join(pthds[i], NULL); } } diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index ace0d1011a8e..4b05539f167d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,15 +1,95 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include <argp.h> +#include <unistd.h> +#include <stdint.h> #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +#define MAX_TRIG_BATCH_ITERS 1000 + +static struct { + __u32 batch_iters; +} args = { + .batch_iters = 100, +}; + +enum { + ARG_TRIG_BATCH_ITERS = 7000, +}; + +static const struct argp_option opts[] = { + { "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0, + "Number of in-kernel iterations per one driver test run"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_TRIG_BATCH_ITERS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) { + fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n", + 1, MAX_TRIG_BATCH_ITERS); + argp_usage(state); + } + args.batch_iters = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_trigger_batch_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* adjust slot shift in inc_hits() if changing */ +#define MAX_BUCKETS 256 + +#pragma GCC diagnostic ignored "-Wattributes" + /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; + bool usermode_counters; + int driver_prog_fd; } ctx; -static struct counter base_hits; +static struct counter base_hits[MAX_BUCKETS]; + +static __always_inline void inc_counter(struct counter *counters) +{ + static __thread int tid = 0; + unsigned slot; + + if (unlikely(tid == 0)) + tid = syscall(SYS_gettid); + + /* multiplicative hashing, it's fast */ + slot = 2654435769U * tid; + slot >>= 24; + + atomic_inc(&base_hits[slot].value); /* use highest byte as an index */ +} + +static long sum_and_reset_counters(struct counter *counters) +{ + int i; + long sum = 0; + + for (i = 0; i < MAX_BUCKETS; i++) + sum += atomic_swap(&counters[i].value, 0); + return sum; +} static void trigger_validate(void) { @@ -19,41 +99,63 @@ static void trigger_validate(void) } } -static void *trigger_base_producer(void *input) +static void *trigger_producer(void *input) { - while (true) { - (void)syscall(__NR_getpgid); - atomic_inc(&base_hits.value); + if (ctx.usermode_counters) { + while (true) { + (void)syscall(__NR_getpgid); + inc_counter(base_hits); + } + } else { + while (true) + (void)syscall(__NR_getpgid); } return NULL; } -static void trigger_base_measure(struct bench_res *res) +static void *trigger_producer_batch(void *input) { - res->hits = atomic_swap(&base_hits.value, 0); -} + int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver); -static void *trigger_producer(void *input) -{ while (true) - (void)syscall(__NR_getpgid); + bpf_prog_test_run_opts(fd, NULL); + return NULL; } static void trigger_measure(struct bench_res *res) { - res->hits = atomic_swap(&ctx.skel->bss->hits, 0); + if (ctx.usermode_counters) + res->hits = sum_and_reset_counters(base_hits); + else + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) { setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + + /* default "driver" BPF program */ + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); + + ctx.skel->rodata->batch_iters = args.batch_iters; +} + +static void load_ctx(void) +{ + int err; + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } } static void attach_bpf(struct bpf_program *prog) @@ -67,64 +169,104 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) +static void trigger_fmodret_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } -static void trigger_fmodret_setup(void) +static void trigger_tp_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); } /* make sure call is not inlined and not avoided by compiler, so __weak and @@ -137,7 +279,7 @@ static void trigger_fmodret_setup(void) * GCC doesn't generate stack setup preample for these functions due to them * having no input arguments and doing nothing in the body. */ -__weak void uprobe_target_nop(void) +__nocf_check __weak void uprobe_target_nop(void) { asm volatile ("nop"); } @@ -146,7 +288,7 @@ __weak void opaque_noop_func(void) { } -__weak int uprobe_target_push(void) +__nocf_check __weak int uprobe_target_push(void) { /* overhead of function call is negligible compared to uprobe * triggering, so this shouldn't affect benchmark results much @@ -155,16 +297,16 @@ __weak int uprobe_target_push(void) return 1; } -__weak void uprobe_target_ret(void) +__nocf_check __weak void uprobe_target_ret(void) { asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); - atomic_inc(&base_hits.value); + inc_counter(base_hits); } return NULL; } @@ -194,15 +336,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, use_retprobe, @@ -216,204 +367,90 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = trigger_validate, - .setup = trigger_kprobe_setup, + .setup = trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", - .validate = trigger_validate, - .setup = trigger_fmodret_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); + +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} + +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f243294..a690f5a68b6b 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,22 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 9bdcc74e03a4..af169f831f2f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index b99b9f408eff..85dbc3ea4da5 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -29,6 +29,7 @@ static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { re static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } #define cond_break ({}) +#define can_loop true #endif /* Safely walk link list elements. Deletion of elements is allowed. */ @@ -36,8 +37,7 @@ static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ - pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ - cond_break, \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop; \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a5b9df38c162..3d9e4b8c6b81 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,9 +326,48 @@ l_true: \ }) #endif +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ + asm volatile goto("1:.byte 0xe5; \ .byte 0; \ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ .short 0" \ @@ -337,6 +376,7 @@ l_true: \ l_break: break; \ l_continue:; \ }) +#endif #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ @@ -386,6 +426,28 @@ l_true: \ , [as]"i"((dst_as << 16) | src_as)); #endif +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + /* Description * Assert that a conditional expression is true. * Returns @@ -459,4 +521,11 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it, extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 14ebe7d9e1a3..be91a6919315 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -75,4 +75,7 @@ extern void bpf_key_put(struct bpf_key *key) __ksym; extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; + +extern bool bpf_session_is_return(void) __ksym __weak; +extern long *bpf_session_cookie(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h deleted file mode 100644 index 82a7c9de95f9..000000000000 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_TCP_HELPERS_H -#define __BPF_TCP_HELPERS_H - -#include <stdbool.h> -#include <linux/types.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_core_read.h> -#include <bpf/bpf_tracing.h> - -#define BPF_STRUCT_OPS(name, args...) \ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - -#ifndef SOL_TCP -#define SOL_TCP 6 -#endif - -#ifndef TCP_CA_NAME_MAX -#define TCP_CA_NAME_MAX 16 -#endif - -#define tcp_jiffies32 ((__u32)bpf_jiffies64()) - -struct sock_common { - unsigned char skc_state; - __u16 skc_num; -} __attribute__((preserve_access_index)); - -enum sk_pacing { - SK_PACING_NONE = 0, - SK_PACING_NEEDED = 1, - SK_PACING_FQ = 2, -}; - -struct sock { - struct sock_common __sk_common; -#define sk_state __sk_common.skc_state - unsigned long sk_pacing_rate; - __u32 sk_pacing_status; /* see enum sk_pacing */ -} __attribute__((preserve_access_index)); - -struct inet_sock { - struct sock sk; -} __attribute__((preserve_access_index)); - -struct inet_connection_sock { - struct inet_sock icsk_inet; - __u8 icsk_ca_state:6, - icsk_ca_setsockopt:1, - icsk_ca_dst_locked:1; - struct { - __u8 pending; - } icsk_ack; - __u64 icsk_ca_priv[104 / sizeof(__u64)]; -} __attribute__((preserve_access_index)); - -struct request_sock { - struct sock_common __req_common; -} __attribute__((preserve_access_index)); - -struct tcp_sock { - struct inet_connection_sock inet_conn; - - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 window_clamp; - __u8 ecn_flags; - __u32 delivered; - __u32 delivered_ce; - __u32 snd_cwnd; - __u32 snd_cwnd_cnt; - __u32 snd_cwnd_clamp; - __u32 snd_ssthresh; - __u8 syn_data:1, /* SYN includes data */ - syn_fastopen:1, /* SYN includes Fast Open option */ - syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ - syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ - __u32 max_packets_out; - __u32 lsndtime; - __u32 prior_cwnd; - __u64 tcp_mstamp; /* most recent packet received/sent */ - bool is_mptcp; -} __attribute__((preserve_access_index)); - -static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static __always_inline void *inet_csk_ca(const struct sock *sk) -{ - return (void *)inet_csk(sk)->icsk_ca_priv; -} - -static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static __always_inline bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) - -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 - -enum inet_csk_ack_state_t { - ICSK_ACK_SCHED = 1, - ICSK_ACK_TIMER = 2, - ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ -}; - -enum tcp_ca_event { - CA_EVENT_TX_START = 0, - CA_EVENT_CWND_RESTART = 1, - CA_EVENT_COMPLETE_CWR = 2, - CA_EVENT_LOSS = 3, - CA_EVENT_ECN_NO_CE = 4, - CA_EVENT_ECN_IS_CE = 5, -}; - -struct ack_sample { - __u32 pkts_acked; - __s32 rtt_us; - __u32 in_flight; -} __attribute__((preserve_access_index)); - -struct rate_sample { - __u64 prior_mstamp; /* starting timestamp for interval */ - __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - __s32 delivered; /* number of packets delivered over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - __u32 snd_interval_us; /* snd interval for delivered packets */ - __u32 rcv_interval_us; /* rcv interval for delivered packets */ - long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ - int losses; /* number of packets marked lost upon ACK */ - __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ - __u32 prior_in_flight; /* in flight before this ACK */ - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -} __attribute__((preserve_access_index)); - -#define TCP_CA_NAME_MAX 16 -#define TCP_CONG_NEEDS_ECN 0x2 - -struct tcp_congestion_ops { - char name[TCP_CA_NAME_MAX]; - __u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); - - /* return slow start threshold (required) */ - __u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked); - /* call before changing ca_state (optional) */ - void (*set_state)(struct sock *sk, __u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); - /* call when ack arrives (optional) */ - void (*in_ack_event)(struct sock *sk, __u32 flags); - /* new value of cwnd after loss (required) */ - __u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - __u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - __u32 (*sndbuf_expand)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); - void *owner; -}; - -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) -{ - return tp->snd_cwnd < tp->snd_ssthresh; -} - -static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; - - return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); -} - -static __always_inline bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} - -extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; -extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; - -struct mptcp_sock { - struct inet_connection_sock sk; - - __u32 token; - struct sock *first; - char ca_name[TCP_CA_NAME_MAX]; -} __attribute__((preserve_access_index)); - -#endif diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123..2a18bd320e92 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -10,18 +10,30 @@ #include <linux/percpu-defs.h> #include <linux/sysfs.h> #include <linux/tracepoint.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <linux/nsproxy.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/un.h> +#include <net/sock.h> #include "bpf_testmod.h" #include "bpf_testmod_kfunc.h" #define CREATE_TRACE_POINTS #include "bpf_testmod-events.h" +#define CONNECT_TIMEOUT_SEC 1 + typedef int (*func_proto_typedef)(long); typedef int (*func_proto_typedef_nested1)(func_proto_typedef); typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; long bpf_testmod_test_struct_arg_result; +static DEFINE_MUTEX(sock_lock); +static struct socket *sock; struct bpf_testmod_struct_arg_1 { int a; @@ -205,6 +217,9 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) case 5: return (void *)~(1ull << 30); /* trigger extable */ case 6: return &f; /* valid addr */ case 7: return (void *)((long)&f | 1); /* kernel tricks */ +#ifdef CONFIG_X86_64 + case 8: return (void *)VSYSCALL_ADDR; /* vsyscall page address */ +#endif default: return NULL; } } @@ -494,6 +509,241 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + +__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) +{ + int proto; + int err; + + mutex_lock(&sock_lock); + + if (sock) { + pr_err("%s called without releasing old sock", __func__); + err = -EPERM; + goto out; + } + + switch (args->af) { + case AF_INET: + case AF_INET6: + proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + break; + case AF_UNIX: + proto = PF_UNIX; + break; + default: + pr_err("invalid address family %d\n", args->af); + err = -EINVAL; + goto out; + } + + err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, + proto, &sock); + + if (!err) + /* Set timeout for call to kernel_connect() to prevent it from hanging, + * and consider the connection attempt failed if it returns + * -EINPROGRESS. + */ + sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc void bpf_kfunc_close_sock(void) +{ + mutex_lock(&sock_lock); + + if (sock) { + sock_release(sock); + sock = NULL; + } + + mutex_unlock(&sock_lock); +} + +__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_connect(sock, (struct sockaddr *)&args->addr, + args->addrlen, 0); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_listen(sock, 128); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = sock_sendmsg(sock, &msg); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -520,6 +770,16 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -650,6 +910,8 @@ static int bpf_testmod_init(void) return ret; if (bpf_fentry_test1(0) < 0) return -EINVAL; + sock = NULL; + mutex_init(&sock_lock); return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } @@ -663,6 +925,7 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7c664dd61059..b0d586a6751f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -64,6 +64,22 @@ struct prog_test_fail3 { char arr2[]; }; +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct __kernel_sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + struct prog_test_ref_kfunc * bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; @@ -96,6 +112,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); @@ -106,4 +123,15 @@ void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); void bpf_kfunc_common_test(void) __ksym; + +int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; +void bpf_kfunc_close_sock(void) __ksym; +int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_listen(void) __ksym; +int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 19be9c63d5e8..23bb9a9e6a7d 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path) * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) { int dirfd, err, flags, mount_id, fhsize; union { @@ -508,6 +508,9 @@ int cgroup_setup_and_join(const char *path) { /** * setup_classid_environment() - Setup the cgroupv1 net_cls environment * + * This function should only be called in a custom mount namespace, e.g. + * created by running setup_cgroup_environment. + * * After calling this function, cleanup_classid_environment should be called * once testing is complete. * diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 01f241ea2c67..eeabd798bc3a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_AES=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y @@ -88,3 +93,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 6db27a9088e9..35250e6cde7f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; +static const struct network_helper_opts default_opts; + int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; @@ -78,24 +80,22 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, int protocol, const struct sockaddr *addr, - socklen_t addrlen, int timeout_ms, bool reuseport) +static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { - int on = 1; int fd; - fd = socket(addr->sa_family, type, protocol); + fd = socket(addr->sa_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) goto error_close; - if (reuseport && - setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { - log_err("Failed to set SO_REUSEPORT"); + if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { + log_err("Failed to call post_socket_cb"); goto error_close; } @@ -118,35 +118,35 @@ error_close: return -1; } -static int start_server_proto(int family, int type, int protocol, - const char *addr_str, __u16 port, int timeout_ms) +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; struct sockaddr_storage addr; socklen_t addrlen; if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, protocol, (struct sockaddr *)&addr, - addrlen, timeout_ms, false); + return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); } -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) +static int reuseport_cb(int fd, const struct post_socket_opts *opts) { - return start_server_proto(family, type, 0, addr_str, port, timeout_ms); -} + int on = 1; -int start_mptcp_server(int family, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, SOCK_STREAM, IPPROTO_MPTCP, addr_str, - port, timeout_ms); + return setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)); } int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .post_socket_cb = reuseport_cb, + }; struct sockaddr_storage addr; unsigned int nr_fds = 0; socklen_t addrlen; @@ -162,8 +162,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, 0, (struct sockaddr *)&addr, addrlen, - timeout_ms, true); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -172,8 +171,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, 0, (struct sockaddr *)&addr, - addrlen, timeout_ms, true); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -185,6 +183,15 @@ close_fds: return NULL; } +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts) +{ + if (!opts) + opts = &default_opts; + + return __start_server(type, (struct sockaddr *)addr, len, opts); +} + void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { @@ -258,17 +265,24 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type) +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->ss_family, type, 0); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create client socket"); return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, false)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) goto error_close; return fd; @@ -278,8 +292,6 @@ error_close: return -1; } -static const struct network_helper_opts default_opts; - int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; @@ -442,25 +454,35 @@ struct nstoken *open_netns(const char *name) struct nstoken *token; token = calloc(1, sizeof(struct nstoken)); - if (!ASSERT_OK_PTR(token, "malloc token")) + if (!token) { + log_err("Failed to malloc token"); return NULL; + } token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + if (token->orig_netns_fd == -1) { + log_err("Failed to open(/proc/self/ns/net)"); goto fail; + } snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (!ASSERT_GE(nsfd, 0, "open netns fd")) + if (nsfd == -1) { + log_err("Failed to open(%s)", nspath); goto fail; + } err = setns(nsfd, CLONE_NEWNET); close(nsfd); - if (!ASSERT_OK(err, "setns")) + if (err) { + log_err("Failed to setns(nsfd)"); goto fail; + } return token; fail: + if (token->orig_netns_fd != -1) + close(token->orig_netns_fd); free(token); return NULL; } @@ -470,7 +492,8 @@ void close_netns(struct nstoken *token) if (!token) return; - ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + if (setns(token->orig_netns_fd, CLONE_NEWNET)) + log_err("Failed to setns(orig_netns_fd)"); close(token->orig_netns_fd); free(token); } @@ -497,3 +520,153 @@ int get_socket_local_port(int sock_fd) return -1; } + +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_GRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_SRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} + +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + +static void *send_recv_server(void *arg) +{ + struct send_recv_arg *a = (struct send_recv_arg *)arg; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + int err = 0, fd; + + fd = accept(a->fd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd, 0)) { + err = -errno; + goto done; + } + + while (bytes < a->bytes && !READ_ONCE(a->stop)) { + nr_sent = send(fd, &batch, + MIN(a->bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + if (bytes != a->bytes) { + log_err("send %zd expected %u", bytes, a->bytes); + if (!err) + err = bytes > a->bytes ? -E2BIG : -EINTR; + } + +done: + if (fd >= 0) + close(fd); + if (err) { + WRITE_ONCE(a->stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +int send_recv_data(int lfd, int fd, uint32_t total_bytes) +{ + ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .fd = lfd, + .bytes = total_bytes, + .stop = 0, + }; + pthread_t srv_thread; + void *thread_ret; + char batch[1500]; + int err = 0; + + err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg); + if (err) { + log_err("Failed to pthread_create"); + return err; + } + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { + nr_recv = recv(fd, &batch, + MIN(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) { + err = -errno; + break; + } + bytes += nr_recv; + } + + if (bytes != total_bytes) { + log_err("recv %zd expected %u", bytes, total_bytes); + if (!err) + err = bytes > total_bytes ? -E2BIG : -EINTR; + } + + WRITE_ONCE(arg.stop, 1); + pthread_join(srv_thread, &thread_ret); + if (IS_ERR(thread_ret)) { + log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret)); + err = err ? : PTR_ERR(thread_ret); + } + + return err; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 94b9be24e39b..883c7ea9d8d5 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -9,14 +9,20 @@ typedef __u16 __sum16; #include <linux/if_packet.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <linux/err.h> #include <netinet/tcp.h> #include <bpf/bpf_endian.h> +#include <net/if.h> #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct post_socket_opts {}; + struct network_helper_opts { const char *cc; int timeout_ms; @@ -24,6 +30,7 @@ struct network_helper_opts { bool noconnect; int type; int proto; + int (*post_socket_cb)(int fd, const struct post_socket_opts *opts); }; /* ipv4 test vector */ @@ -45,13 +52,14 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); -int start_mptcp_server(int family, const char *addr, __u16 port, - int timeout_ms); int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); @@ -61,6 +69,8 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); char *ping_command(int family); int get_socket_local_port(int sock_fd); +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** @@ -71,6 +81,7 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); +int send_recv_data(int lfd, int fd, uint32_t total_bytes); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c new file mode 100644 index 000000000000..0807a48a58ee --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include "arena_atomics.skel.h" + +static void test_add(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->arena->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->arena->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->arena->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value"); +} + +static void test_sub(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value"); +} + +static void test_and(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value"); +} + +static void test_or(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value"); +} + +static void test_xor(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->arena->xor32_value, 0x101, "xor32_value"); +} + +static void test_cmpxchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); +} + +static void test_xchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result"); +} + +void test_arena_atomics(void) +{ + struct arena_atomics *skel; + int err; + + skel = arena_atomics__open(); + if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", + __func__); + test__skip(); + goto cleanup; + } + err = arena_atomics__load(skel); + if (!ASSERT_OK(err, "arena atomics skeleton load")) + return; + skel->bss->pid = getpid(); + + if (test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + arena_atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 1454cebc262b..4407ea428e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -573,6 +573,115 @@ cleanup: close(lsm_fd); } +static void tp_btf_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_trace_opts, trace_opts); + + /* There are three different ways to attach tp_btf (BTF-aware raw + * tracepoint) programs. Let's test all of them. + */ + prog_fd = bpf_program__fd(skel->progs.handle_tp_btf); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->tp_btf_res = 0; + + raw_tp_opts.cookie = cookie = 0x11000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "raw_tp_open_res"); + + /* low-level generic bpf_link_create() API */ + skel->bss->tp_btf_res = 0; + + link_opts.tracing.cookie = cookie = 0x22000000000000L; + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_RAW_TP, &link_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "link_create_res"); + + /* high-level bpf_link-based bpf_program__attach_trace_opts() API */ + skel->bss->tp_btf_res = 0; + + trace_opts.cookie = cookie = 0x33000000000000L; + link = bpf_program__attach_trace_opts(skel->progs.handle_tp_btf, &trace_opts); + if (!ASSERT_OK_PTR(link, "attach_trace_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "attach_trace_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + +static void raw_tp_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_raw_tracepoint_opts, opts); + + /* There are two different ways to attach raw_tp programs */ + prog_fd = bpf_program__fd(skel->progs.handle_raw_tp); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->raw_tp_res = 0; + + raw_tp_opts.tp_name = "sys_enter"; + raw_tp_opts.cookie = cookie = 0x55000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "raw_tp_open_res"); + + /* high-level bpf_link-based bpf_program__attach_raw_tracepoint_opts() API */ + skel->bss->raw_tp_res = 0; + + opts.cookie = cookie = 0x66000000000000L; + link = bpf_program__attach_raw_tracepoint_opts(skel->progs.handle_raw_tp, + "sys_enter", &opts); + if (!ASSERT_OK_PTR(link, "attach_raw_tp_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "attach_raw_tp_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + void test_bpf_cookie(void) { struct test_bpf_cookie *skel; @@ -601,6 +710,9 @@ void test_bpf_cookie(void) tracing_subtest(skel); if (test__start_subtest("lsm")) lsm_subtest(skel); - + if (test__start_subtest("tp_btf")) + tp_btf_subtest(skel); + if (test__start_subtest("raw_tp")) + raw_tp_subtest(skel); test_bpf_cookie__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index a88e6e07e4f5..0aca02532794 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,8 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" +#include "bpf_cc_cubic.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -20,7 +22,6 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static int expected_stg = 0xeB9F; -static int stop; static int settcpca(int fd, const char *tcp_ca) { @@ -33,63 +34,12 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } -static void *server(void *arg) -{ - int lfd = (int)(long)arg, err = 0, fd; - ssize_t nr_sent = 0, bytes = 0; - char batch[1500]; - - fd = accept(lfd, NULL, NULL); - while (fd == -1) { - if (errno == EINTR) - continue; - err = -errno; - goto done; - } - - if (settimeo(fd, 0)) { - err = -errno; - goto done; - } - - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_sent = send(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_sent == -1 && errno == EINTR) - continue; - if (nr_sent == -1) { - err = -errno; - break; - } - bytes += nr_sent; - } - - ASSERT_EQ(bytes, total_bytes, "send"); - -done: - if (fd >= 0) - close(fd); - if (err) { - WRITE_ONCE(stop, 1); - return ERR_PTR(err); - } - return NULL; -} - static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - struct sockaddr_in6 sa6 = {}; - ssize_t nr_recv = 0, bytes = 0; int lfd = -1, fd = -1; - pthread_t srv_thread; - socklen_t addrlen = sizeof(sa6); - void *thread_ret; - char batch[1500]; int err; - WRITE_ONCE(stop, 0); - - lfd = socket(AF_INET6, SOCK_STREAM, 0); + lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; @@ -99,23 +49,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) return; } - if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd, 0) || settimeo(fd, 0)) - goto done; - - /* bind, listen and start server thread to accept */ - sa6.sin6_family = AF_INET6; - sa6.sin6_addr = in6addr_loopback; - err = bind(lfd, (struct sockaddr *)&sa6, addrlen); - if (!ASSERT_NEQ(err, -1, "bind")) - goto done; - - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); - if (!ASSERT_NEQ(err, -1, "getsockname")) - goto done; - - err = listen(lfd, 1); - if (!ASSERT_NEQ(err, -1, "listen")) + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca)) goto done; if (sk_stg_map) { @@ -126,7 +60,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } /* connect to server */ - err = connect(fd, (struct sockaddr *)&sa6, addrlen); + err = connect_fd_to_fd(fd, lfd, 0); if (!ASSERT_NEQ(err, -1, "connect")) goto done; @@ -140,26 +74,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); - if (!ASSERT_OK(err, "pthread_create")) - goto done; - - /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(stop)) { - nr_recv = recv(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_recv == -1 && errno == EINTR) - continue; - if (nr_recv == -1) - break; - bytes += nr_recv; - } - - ASSERT_EQ(bytes, total_bytes, "recv"); - - WRITE_ONCE(stop, 1); - pthread_join(srv_thread, &thread_ret); - ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); + ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data"); done: close(lfd); @@ -315,7 +230,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); @@ -529,6 +444,36 @@ static void test_link_replace(void) tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + +static void test_cc_cubic(void) +{ + struct bpf_cc_cubic *cc_cubic_skel; + struct bpf_link *link; + + cc_cubic_skel = bpf_cc_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_cc_cubic__destroy(cc_cubic_skel); + return; + } + + do_test("bpf_cc_cubic", NULL); + + bpf_link__destroy(link); + bpf_cc_cubic__destroy(cc_cubic_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -557,4 +502,8 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); + if (test__start_subtest("cc_cubic")) + test_cc_cubic(); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index e9ea38aa8248..09a8e6f9b379 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -653,7 +653,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, cmpstr = "(struct file_operations){\n" " .owner = (struct module *)0xffffffffffffffff,\n" -" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; +" .fop_flags = (fop_flags_t)4294967295,"; ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index a8b53b8736f0..f66ceccd7029 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -25,7 +25,7 @@ static void test_lookup_update(void) int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int err, key = 0, val, i, fd; + int err, key = 0, val, i; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -102,30 +102,6 @@ static void test_lookup_update(void) CHECK(map1_id == 0, "map1_id", "failed to get ID 1\n"); CHECK(map2_id == 0, "map2_id", "failed to get ID 2\n"); - test_btf_map_in_map__destroy(skel); - skel = NULL; - - /* we need to either wait for or force synchronize_rcu(), before - * checking for "still exists" condition, otherwise map could still be - * resolvable by ID, causing false positives. - * - * Older kernels (5.8 and earlier) freed map only after two - * synchronize_rcu()s, so trigger two, to be entirely sure. - */ - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - - fd = bpf_map_get_fd_by_id(map1_id); - if (CHECK(fd >= 0, "map1_leak", "inner_map1 leaked!\n")) { - close(fd); - goto cleanup; - } - fd = bpf_map_get_fd_by_id(map2_id); - if (CHECK(fd >= 0, "map2_leak", "inner_map2 leaked!\n")) { - close(fd); - goto cleanup; - } - cleanup: test_btf_map_in_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c index 74d6d7546f40..25332e596750 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c @@ -87,9 +87,12 @@ void test_cgroup1_hierarchy(void) goto destroy; /* Setup cgroup1 hierarchy */ + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto destroy; err = setup_classid_environment(); if (!ASSERT_OK(err, "setup_classid_environment")) - goto destroy; + goto cleanup_cgroup; err = join_classid(); if (!ASSERT_OK(err, "join_cgroup1")) @@ -153,6 +156,8 @@ void test_cgroup1_hierarchy(void) cleanup: cleanup_classid_environment(); +cleanup_cgroup: + cleanup_cgroup_environment(); destroy: test_cgroup1_hierarchy__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 2a55f717fc07..34b59f6baca1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -10,6 +10,7 @@ #include <netinet/tcp.h> #include <test_progs.h> +#include "network_helpers.h" #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" @@ -35,39 +36,6 @@ struct tuple { struct addr_port dst; }; -static int start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto err; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - -static int connect_to_server(const struct sockaddr *addr, socklen_t len, - int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(connect(fd, addr, len))) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) { const struct sockaddr_in6 *in6; @@ -98,14 +66,14 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, socklen_t slen = sizeof(ss); struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server(addr, len, type); + *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (*server < 0) return false; if (CHECK_FAIL(getsockname(*server, sa, &slen))) goto close_server; - *conn = connect_to_server(sa, slen, type); + *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); if (*conn < 0) goto close_server; diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c new file mode 100644 index 000000000000..b1a3a49a822a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <net/if.h> +#include <linux/in6.h> +#include <linux/if_alg.h> + +#include "test_progs.h" +#include "network_helpers.h" +#include "crypto_sanity.skel.h" +#include "crypto_basic.skel.h" + +#define NS_TEST "crypto_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +static const unsigned char crypto_key[] = "testtest12345678"; +static const char plain_text[] = "stringtoencrypt0"; +static int opfd = -1, tfmfd = -1; +static const char algo[] = "ecb(aes)"; +static int init_afalg(void) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "skcipher", + .salg_name = "ecb(aes)" + }; + + tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (tfmfd == -1) + return errno; + if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) + return errno; + if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1) + return errno; + opfd = accept(tfmfd, NULL, 0); + if (opfd == -1) + return errno; + return 0; +} + +static void deinit_afalg(void) +{ + if (tfmfd != -1) + close(tfmfd); + if (opfd != -1) + close(opfd); +} + +static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char cbuf[CMSG_SPACE(4)] = {0}; + struct iovec iov; + + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(4); + *(__u32 *)CMSG_DATA(cmsg) = encrypt ? ALG_OP_ENCRYPT : ALG_OP_DECRYPT; + + iov.iov_base = (char *)src; + iov.iov_len = size; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + sendmsg(opfd, &msg, 0); + read(opfd, dst, size); +} + +void test_crypto_basic(void) +{ + RUN_TESTS(crypto_basic); +} + +void test_crypto_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec); + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct nstoken *nstoken = NULL; + struct crypto_sanity *skel; + char afalg_plain[16] = {0}; + char afalg_dst[16] = {0}; + struct sockaddr_in6 addr; + int sockfd, err, pfd; + socklen_t addrlen; + u16 udp_test_port; + + skel = crypto_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open")) + return; + + SYS(fail, "ip netns add %s", NS_TEST); + SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS(fail, "ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + err = init_afalg(); + if (!ASSERT_OK(err, "AF_ALG init fail")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + skel->bss->key_len = 16; + skel->bss->authsize = 0; + udp_test_port = skel->data->udp_test_port; + memcpy(skel->bss->key, crypto_key, sizeof(crypto_key)); + snprintf(skel->bss->algo, 128, "%s", algo); + pfd = bpf_program__fd(skel->progs.skb_crypto_setup); + if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd")) + goto fail; + + err = bpf_prog_test_run_opts(pfd, &opts); + if (!ASSERT_OK(err, "skb_crypto_setup") || + !ASSERT_OK(opts.retval, "skb_crypto_setup retval")) + goto fail; + + if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + + tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "attach encrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "encrypt socket")) + goto fail; + err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send")) + goto fail; + + do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true); + + if (!ASSERT_OK(skel->bss->status, "encrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG")) + goto fail; + + tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "bpf_tc_detach encrypt")) + goto fail; + + tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec); + if (!ASSERT_OK(err, "attach decrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "decrypt socket")) + goto fail; + err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send")) + goto fail; + + do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false); + + if (!ASSERT_OK(skel->bss->status, "decrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG")) + goto fail; + + tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec); + ASSERT_OK(err, "bpf_tc_detach decrypt"); + +fail: + close_netns(nstoken); + deinit_afalg(); + SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + crypto_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index f43fcb13d2c4..d3d94596ab79 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -98,7 +98,8 @@ done: static void test_dummy_multiple_args(void) { - __u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; + struct bpf_dummy_ops_state st = { 7 }; + __u64 args[5] = {(__u64)&st, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -115,6 +116,7 @@ static void test_dummy_multiple_args(void) fd = bpf_program__fd(skel->progs.test_2); err = bpf_prog_test_run_opts(fd, &attr); ASSERT_OK(err, "test_run"); + args[0] = 7; for (i = 0; i < ARRAY_SIZE(args); i++) { snprintf(name, sizeof(name), "arg %zu", i); ASSERT_EQ(skel->bss->test_2_args[i], args[i], name); @@ -125,7 +127,8 @@ static void test_dummy_multiple_args(void) static void test_dummy_sleepable(void) { - __u64 args[1] = {0}; + struct bpf_dummy_ops_state st; + __u64 args[1] = {(__u64)&st}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -144,6 +147,31 @@ static void test_dummy_sleepable(void) dummy_st_ops_success__destroy(skel); } +/* dummy_st_ops.test_sleepable() parameter is not marked as nullable, + * thus bpf_prog_test_run_opts() below should be rejected as it tries + * to pass NULL for this parameter. + */ +static void test_dummy_sleepable_reject_null(void) +{ + __u64 args[1] = {0}; + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + struct dummy_st_ops_success *skel; + int fd, err; + + skel = dummy_st_ops_success__open_and_load(); + if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load")) + return; + + fd = bpf_program__fd(skel->progs.test_sleepable); + err = bpf_prog_test_run_opts(fd, &attr); + ASSERT_EQ(err, -EINVAL, "test_run"); + + dummy_st_ops_success__destroy(skel); +} + void test_dummy_st_ops(void) { if (test__start_subtest("dummy_st_ops_attach")) @@ -156,6 +184,8 @@ void test_dummy_st_ops(void) test_dummy_multiple_args(); if (test__start_subtest("dummy_sleepable")) test_dummy_sleepable(); + if (test__start_subtest("dummy_sleepable_reject_null")) + test_dummy_sleepable_reject_null(); RUN_TESTS(dummy_st_ops_fail); } diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 261228eb68e8..438583e1f2d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -94,6 +94,8 @@ void test_empty_skb(void) SYS(out, "ip netns add empty_skb"); tok = open_netns("empty_skb"); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add veth0 type veth peer veth1"); SYS(out, "ip link set dev veth0 up"); SYS(out, "ip link set dev veth1 up"); diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4cf2..bd7658958004 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points to a policy, but no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - ret = inet_pton(AF_INET, expected_src, &src4); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src"); - + ret = inet_pton(AF_INET, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)"); + addr_len = 4; break; default: - PRINT_FAIL("invalid addr family: %d", fib_params->family); + PRINT_FAIL("invalid address family: %d", family); + break; + } + + if (memcmp(addr, expected_addr, addr_len)) { + inet_ntop(family, addr, str, sizeof(str)); + PRINT_FAIL("expected %s actual %s ", expected_str, str); } } +static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_src, expected); +} + +static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_dst, expected); +} + void test_fib_lookup(void) { struct bpf_fib_lookup *fib_params; @@ -256,15 +324,18 @@ void test_fib_lookup(void) if (setup_netns()) goto fail; - ifindex = if_nametoindex("veth1"); - skb.ifindex = ifindex; + skb.ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)")) + goto fail; + fib_params = &skel->bss->fib_params; for (i = 0; i < ARRAY_SIZE(tests); i++) { printf("Testing %s ", tests[i].desc); - if (set_lookup_params(fib_params, &tests[i])) + if (set_lookup_params(fib_params, &tests[i], skb.ifindex)) continue; + skel->bss->fib_lookup_ret = -1; skel->bss->lookup_flags = tests[i].lookup_flags; @@ -278,6 +349,9 @@ void test_fib_lookup(void) if (tests[i].expected_src) assert_src_ip(fib_params, tests[i].expected_src); + if (tests[i].expected_dst) + assert_dst_ip(fib_params, tests[i].expected_dst); + ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac)); if (!ASSERT_EQ(ret, 0, "dmac not match")) { char expected[18], actual[18]; diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index c4773173a4e4..9e5f38739104 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -2,7 +2,6 @@ #include <test_progs.h> #include <network_helpers.h> #include <error.h> -#include <linux/if.h> #include <linux/if_tun.h> #include <sys/uio.h> diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 8963f8a549f2..09f6487f58b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -5,6 +5,7 @@ #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" +#include "for_each_multi_maps.skel.h" static unsigned int duration; @@ -143,6 +144,65 @@ static void test_write_map_key(void) for_each_map_elem_write_key__destroy(skel); } +static void test_multi_maps(void) +{ + struct for_each_multi_maps *skel; + __u64 val, array_total, hash_total; + __u32 key, max_entries; + int i, err; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + skel = for_each_multi_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load")) + return; + + array_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + array_total += val; + err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "array_map_update")) + goto out; + } + + hash_total = 0; + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i + 100; + val = i + 1; + hash_total += val; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "hash_map_update")) + goto out; + } + + skel->bss->data_output = 0; + skel->bss->use_array = 1; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, array_total, "array output"); + + skel->bss->data_output = 0; + skel->bss->use_array = 0; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, hash_total, "hash output"); + +out: + for_each_multi_maps__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -151,4 +211,6 @@ void test_for_each(void) test_array_map(); if (test__start_subtest("write_map_key")) test_write_map_key(); + if (test__start_subtest("multi_maps")) + test_multi_maps(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 8dd2af9081f4..284764e7179f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6) int err = -1; nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto out; skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts); if (!ASSERT_OK_PTR(skel->links.defrag, "program attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 05000810e28e..960c9323d1e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -4,6 +4,8 @@ #include "trace_helpers.h" #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" +#include "kprobe_multi_session.skel.h" +#include "kprobe_multi_session_cookie.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -326,6 +328,74 @@ cleanup: kprobe_multi__destroy(skel); } +static void test_session_skel_api(void) +{ + struct kprobe_multi_session *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int i, err, prog_fd; + + skel = kprobe_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_session__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + /* bpf_fentry_test1-4 trigger return probe, result is 2 */ + for (i = 0; i < 4; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 2, "kprobe_session_result"); + + /* bpf_fentry_test5-8 trigger only entry probe, result is 1 */ + for (i = 4; i < 8; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 1, "kprobe_session_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session__destroy(skel); +} + +static void test_session_cookie_skel_api(void) +{ + struct kprobe_multi_session_cookie *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int err, prog_fd; + + skel = kprobe_multi_session_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session_cookie__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_wrapper__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + ASSERT_EQ(skel->bss->test_kprobe_1_result, 1, "test_kprobe_1_result"); + ASSERT_EQ(skel->bss->test_kprobe_2_result, 2, "test_kprobe_2_result"); + ASSERT_EQ(skel->bss->test_kprobe_3_result, 3, "test_kprobe_3_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session_cookie__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -336,15 +406,80 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) return strcmp((const char *) key1, (const char *) key2) == 0; } +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + +/* Do comparision by ignoring '.llvm.<hash>' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { - size_t cap = 0, cnt = 0, i; - char *name = NULL, **syms = NULL; + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; char buf[256]; FILE *f; int err = 0; + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) + return -EINVAL; + /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable in kprobe multi @@ -368,33 +503,23 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) } while (fgets(buf, sizeof(buf), f)) { - if (kernel && strchr(buf, '[')) - continue; - if (!kernel && !strchr(buf, '[')) + if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - continue; - if (!strcmp(name, "default_idle")) - continue; - if (!strncmp(name, "rcu_", 4)) - continue; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - continue; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) + if (skip_entry(name)) continue; - err = hashmap__add(map, name, 0); + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; @@ -407,8 +532,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (err) goto error; - syms[cnt++] = name; - name = NULL; + syms[cnt++] = ksym_name; } *symsp = syms; @@ -418,42 +542,88 @@ error: free(name); fclose(f); hashmap__free(map); - if (err) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (err) free(syms); + return err; +} + +static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. + */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); return err; } -static void test_kprobe_multi_bench_attach(bool kernel) +static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { - LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - struct kprobe_multi_empty *skel = NULL; long attach_start_ns, attach_end_ns; long detach_start_ns, detach_end_ns; double attach_delta, detach_delta; struct bpf_link *link = NULL; - char **syms = NULL; - size_t cnt = 0, i; - - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) - return; - - skel = kprobe_multi_empty__open_and_load(); - if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) - goto cleanup; - - opts.syms = (const char **) syms; - opts.cnt = cnt; attach_start_ns = get_time_ns(); link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, - NULL, &opts); + NULL, opts); attach_end_ns = get_time_ns(); if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) - goto cleanup; + return; detach_start_ns = get_time_ns(); bpf_link__destroy(link); @@ -462,17 +632,65 @@ static void test_kprobe_multi_bench_attach(bool kernel) attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; - printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: found %lu functions\n", __func__, opts->cnt); printf("%s: attached in %7.3lfs\n", __func__, attach_delta); printf("%s: detached in %7.3lfs\n", __func__, detach_delta); +} + +static void test_kprobe_multi_bench_attach(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + char **syms = NULL; + size_t cnt = 0; + + if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.syms = (const char **) syms; + opts.cnt = cnt; + + do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); - if (syms) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (syms) free(syms); +} + +static void test_kprobe_multi_bench_attach_addr(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + unsigned long *addrs = NULL; + size_t cnt = 0; + int err; + + err = get_addrs(&addrs, &cnt, kernel); + if (err == -ENOENT) { + test__skip(); + return; } + + if (!ASSERT_OK(err, "get_addrs")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.addrs = addrs; + opts.cnt = cnt; + + do_bench_test(skel, &opts); + +cleanup: + kprobe_multi_empty__destroy(skel); + free(addrs); } static void test_attach_override(void) @@ -515,6 +733,10 @@ void serial_test_kprobe_multi_bench_attach(void) test_kprobe_multi_bench_attach(true); if (test__start_subtest("modules")) test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) @@ -538,4 +760,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("session")) + test_session_skel_api(); + if (test__start_subtest("session_cookie")) + test_session_cookie_skel_api(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b263b..dc7aab532fb1 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include <sys/stat.h> -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index f53d658ed080..6d391d95f96e 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -51,6 +51,10 @@ void test_module_attach(void) 0, "bpf_testmod_test_read"); ASSERT_OK(err, "set_attach_target"); + err = bpf_program__set_attach_target(skel->progs.handle_fentry_explicit_manual, + 0, "bpf_testmod:bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target_explicit"); + err = test_module_attach__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton\n")) return; @@ -70,6 +74,8 @@ void test_module_attach(void) ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); + ASSERT_EQ(bss->fentry_explicit_read_sz, READ_SZ, "fentry_explicit"); + ASSERT_EQ(bss->fentry_explicit_manual_read_sz, READ_SZ, "fentry_explicit_manual"); ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 8f8d792307c1..274d2e033e39 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -82,6 +82,22 @@ static void cleanup_netns(struct nstoken *nstoken) SYS_NOFAIL("ip netns del %s", NS_TEST); } +static int start_mptcp_server(int family, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = IPPROTO_MPTCP, + }; + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return start_server_addr(SOCK_STREAM, &addr, addrlen, &opts); +} + static int verify_tsk(int map_fd, int client_fd) { int err, cfd = client_fd; @@ -273,6 +289,8 @@ static int run_mptcpify(int cgroup_fd) if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load")) return libbpf_get_error(mptcpify_skel); + mptcpify_skel->bss->pid = getpid(); + err = mptcpify__attach(mptcpify_skel); if (!ASSERT_OK(err, "skel_attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 24d493482ffc..e72d75d6baa7 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -12,77 +12,229 @@ #include <sys/wait.h> #include <sys/mount.h> #include <sys/fcntl.h> +#include "network_helpers.h" #define STACK_SIZE (1024 * 1024) static char child_stack[STACK_SIZE]; -static int test_current_pid_tgid(void *args) +static int get_pid_tgid(pid_t *pid, pid_t *tgid, + struct test_ns_current_pid_tgid__bss *bss) { - struct test_ns_current_pid_tgid__bss *bss; - struct test_ns_current_pid_tgid *skel; - int err = -1, duration = 0; - pid_t tgid, pid; struct stat st; + int err; - skel = test_ns_current_pid_tgid__open_and_load(); - if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n")) - goto cleanup; - - pid = syscall(SYS_gettid); - tgid = getpid(); + *pid = syscall(SYS_gettid); + *tgid = getpid(); err = stat("/proc/self/ns/pid", &st); - if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "stat /proc/self/ns/pid")) + return err; - bss = skel->bss; bss->dev = st.st_dev; bss->ino = st.st_ino; bss->user_pid = 0; bss->user_tgid = 0; + return 0; +} + +static int test_current_pid_tgid_tp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.tp_handler, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; err = test_ns_current_pid_tgid__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); - ASSERT_EQ(bss->user_pid, pid, "pid"); - ASSERT_EQ(bss->user_tgid, tgid, "tgid"); - err = 0; + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_cgrp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int server_fd = -1, ret = -1, err; + int cgroup_fd = *(int *)args; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.cgroup_bind4, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; + + skel->links.cgroup_bind4 = bpf_program__attach_cgroup( + skel->progs.cgroup_bind4, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.cgroup_bind4, "bpf_program__attach_cgroup")) + goto cleanup; + + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; cleanup: - test_ns_current_pid_tgid__destroy(skel); + if (server_fd >= 0) + close(server_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + +static int test_current_pid_tgid_sk_msg(void *args) +{ + int verdict, map, server_fd = -1, client_fd = -1; + struct test_ns_current_pid_tgid__bss *bss; + static const char send_msg[] = "message"; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err, key = 0; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.sk_msg, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, skel->bss)) + goto cleanup; + + verdict = bpf_program__fd(skel->progs.sk_msg); + map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto cleanup; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; - return err; + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto cleanup; + + err = bpf_map_update_elem(map, &key, &client_fd, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto cleanup; + + err = send(client_fd, send_msg, sizeof(send_msg), 0); + if (!ASSERT_EQ(err, sizeof(send_msg), "send(msg)")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + if (server_fd >= 0) + close(server_fd); + if (client_fd >= 0) + close(client_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; } -static void test_ns_current_pid_tgid_new_ns(void) +static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { - int wstatus, duration = 0; + int wstatus; pid_t cpid; /* Create a process in a new namespace, this process * will be the init process of this new namespace hence will be pid 1. */ - cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, - CLONE_NEWPID | SIGCHLD, NULL); + cpid = clone(fn, child_stack + STACK_SIZE, + CLONE_NEWPID | SIGCHLD, arg); - if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(cpid, -1, "clone")) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(waitpid(cpid, &wstatus, 0), -1, "waitpid")) return; - if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) + if (!ASSERT_OK(WEXITSTATUS(wstatus), "newns_pidtgid")) return; } +static void test_in_netns(int (*fn)(void *), void *arg) +{ + struct nstoken *nstoken = NULL; + + SYS(cleanup, "ip netns add ns_current_pid_tgid"); + SYS(cleanup, "ip -net ns_current_pid_tgid link set dev lo up"); + + nstoken = open_netns("ns_current_pid_tgid"); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto cleanup; + + test_ns_current_pid_tgid_new_ns(fn, arg); + +cleanup: + if (nstoken) + close_netns(nstoken); + SYS_NOFAIL("ip netns del ns_current_pid_tgid"); +} + /* TODO: use a different tracepoint */ void serial_test_ns_current_pid_tgid(void) { - if (test__start_subtest("ns_current_pid_tgid_root_ns")) - test_current_pid_tgid(NULL); - if (test__start_subtest("ns_current_pid_tgid_new_ns")) - test_ns_current_pid_tgid_new_ns(); + if (test__start_subtest("root_ns_tp")) + test_current_pid_tgid_tp(NULL); + if (test__start_subtest("new_ns_tp")) + test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL); + if (test__start_subtest("new_ns_cgrp")) { + int cgroup_fd = -1; + + cgroup_fd = test__join_cgroup("/sock_addr"); + if (ASSERT_GE(cgroup_fd, 0, "join_cgroup")) { + test_in_netns(test_current_pid_tgid_cgrp, &cgroup_fd); + close(cgroup_fd); + } + } + if (test__start_subtest("new_ns_sk_msg")) + test_in_netns(test_current_pid_tgid_sk_msg, NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c new file mode 100644 index 000000000000..37d8618800e4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <test_progs.h> +#include "test_perf_skip.skel.h" +#include <linux/compiler.h> +#include <linux/hw_breakpoint.h> +#include <sys/mman.h> + +#ifndef TRAP_PERF +#define TRAP_PERF 6 +#endif + +int sigio_count, sigtrap_count; + +static void handle_sigio(int sig __always_unused) +{ + ++sigio_count; +} + +static void handle_sigtrap(int signum __always_unused, + siginfo_t *info, + void *ucontext __always_unused) +{ + ASSERT_EQ(info->si_code, TRAP_PERF, "si_code"); + ++sigtrap_count; +} + +static noinline int test_function(void) +{ + asm volatile (""); + return 0; +} + +void serial_test_perf_skip(void) +{ + struct sigaction action = {}; + struct sigaction previous_sigtrap; + sighandler_t previous_sigio = SIG_ERR; + struct test_perf_skip *skel = NULL; + struct perf_event_attr attr = {}; + int perf_fd = -1; + int err; + struct f_owner_ex owner; + struct bpf_link *prog_link = NULL; + + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = handle_sigtrap; + sigemptyset(&action.sa_mask); + if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction")) + return; + + previous_sigio = signal(SIGIO, handle_sigio); + if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal")) + goto cleanup; + + skel = test_perf_skip__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + attr.type = PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.bp_type = HW_BREAKPOINT_X; + attr.bp_addr = (uintptr_t)test_function; + attr.bp_len = sizeof(long); + attr.sample_period = 1; + attr.sample_type = PERF_SAMPLE_IP; + attr.pinned = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.precise_ip = 3; + attr.sigtrap = 1; + attr.remove_on_exec = 1; + + perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); + if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n"); + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(perf_fd < 0, "perf_event_open")) + goto cleanup; + + /* Configure the perf event to signal on sample. */ + err = fcntl(perf_fd, F_SETFL, O_ASYNC); + if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)")) + goto cleanup; + + owner.type = F_OWNER_TID; + owner.pid = syscall(__NR_gettid); + err = fcntl(perf_fd, F_SETOWN_EX, &owner); + if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)")) + goto cleanup; + + /* Allow at most one sample. A sample rejected by bpf should + * not count against this. + */ + err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1); + if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)")) + goto cleanup; + + prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd); + if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event")) + goto cleanup; + + /* Configure the bpf program to suppress the sample. */ + skel->bss->ip = (uintptr_t)test_function; + test_function(); + + ASSERT_EQ(sigio_count, 0, "sigio_count"); + ASSERT_EQ(sigtrap_count, 0, "sigtrap_count"); + + /* Configure the bpf program to allow the sample. */ + skel->bss->ip = 0; + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + + /* Test that the sample above is the only one allowed (by perf, not + * by bpf) + */ + test_function(); + + ASSERT_EQ(sigio_count, 1, "sigio_count"); + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); + +cleanup: + bpf_link__destroy(prog_link); + if (perf_fd >= 0) + close(perf_fd); + test_perf_skip__destroy(skel); + + if (previous_sigio != SIG_ERR) + signal(SIGIO, previous_sigio); + sigaction(SIGTRAP, &previous_sigtrap, NULL); +} diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c new file mode 100644 index 000000000000..02917c672441 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> +#include <preempt_lock.skel.h> + +void test_preempt_lock(void) +{ + RUN_TESTS(preempt_lock); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 48c5695b7abf..4c6f42dae409 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -13,6 +13,7 @@ #include <linux/perf_event.h> #include <linux/ring_buffer.h> #include "test_ringbuf.lskel.h" +#include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #define EDONE 7777 @@ -326,6 +327,68 @@ cleanup: test_ringbuf_lskel__destroy(skel); } +/* + * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring + * buffer, via getpid(), and consuming them in chunks of N_SAMPLES. + */ +#define N_TOT_SAMPLES 32 +#define N_SAMPLES 4 + +/* Sample value to verify the callback validity */ +#define SAMPLE_VALUE 42L + +static int process_n_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value"); + + return 0; +} + +static void ringbuf_n_subtest(void) +{ + struct test_ringbuf_n_lskel *skel_n; + int err, i; + + skel_n = test_ringbuf_n_lskel__open(); + if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open")) + return; + + skel_n->maps.ringbuf.max_entries = getpagesize(); + skel_n->bss->pid = getpid(); + + err = test_ringbuf_n_lskel__load(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load")) + goto cleanup; + + ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd, + process_n_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new")) + goto cleanup; + + err = test_ringbuf_n_lskel__attach(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach")) + goto cleanup_ringbuf; + + /* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */ + skel_n->bss->value = SAMPLE_VALUE; + for (i = 0; i < N_TOT_SAMPLES; i++) + syscall(__NR_getpgid); + + /* Consume all samples from the ring buffer in batches of N_SAMPLES */ + for (i = 0; i < N_TOT_SAMPLES; i += err) { + err = ring_buffer__consume_n(ringbuf, N_SAMPLES); + if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume")) + goto cleanup_ringbuf; + } + +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_n_lskel__destroy(skel_n); +} + static int process_map_key_sample(void *ctx, void *data, size_t len) { struct sample *s; @@ -384,6 +447,8 @@ void test_ringbuf(void) { if (test__start_subtest("ringbuf")) ringbuf_subtest(); + if (test__start_subtest("ringbuf_n")) + ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); } diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b15b343ebb6b..920aee41bd58 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread) pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); if (pmu_fd == -1) { - if (errno == ENOENT) { + if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1374b626a985..0b9bd1d6f7cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -15,6 +15,7 @@ #include <unistd.h> #include "test_progs.h" +#include "network_helpers.h" #define BIND_PORT 1234 #define CONNECT_PORT 4321 @@ -22,8 +23,6 @@ #define NS_SELF "/proc/self/ns/net" #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int stop, duration; static bool @@ -73,52 +72,6 @@ configure_stack(void) return true; } -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static in_port_t get_port(int fd) { @@ -161,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) in_port_t port; int ret = 1; - client = connect_to_server(addr, len, type); + client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (client == -1) { perror("Cannot connect to server"); goto out; @@ -310,7 +263,9 @@ void test_sk_assign(void) continue; prepare_addr(test->addr, test->family, BIND_PORT, false); addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); + server = start_server_addr(test->type, + (const struct sockaddr_storage *)addr, + test->len, NULL); if (server == -1) goto close; diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 5fd617718991..b880c564a204 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,16 +3,56 @@ #include "test_progs.h" +#include "sock_addr_kern.skel.h" +#include "bind4_prog.skel.h" +#include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" +#include "connect4_prog.skel.h" +#include "connect6_prog.skel.h" +#include "sendmsg4_prog.skel.h" +#include "sendmsg6_prog.skel.h" +#include "recvmsg4_prog.skel.h" +#include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" +#include "getsockname4_prog.skel.h" +#include "getsockname6_prog.skel.h" #include "getsockname_unix_prog.skel.h" +#include "getpeername4_prog.skel.h" +#include "getpeername6_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#ifndef ENOTSUPP +# define ENOTSUPP 524 +#endif + +#define TEST_NS "sock_addr" +#define TEST_IF_PREFIX "test_sock_addr" +#define TEST_IPV4 "127.0.0.4" +#define TEST_IPV6 "::6" + +#define SERV4_IP "192.168.1.254" +#define SERV4_REWRITE_IP "127.0.0.1" +#define SRC4_IP "172.16.0.1" +#define SRC4_REWRITE_IP TEST_IPV4 +#define SERV4_PORT 4040 +#define SERV4_REWRITE_PORT 4444 + +#define SERV6_IP "face:b00c:1234:5678::abcd" +#define SERV6_REWRITE_IP "::1" +#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" +#define SRC6_IP "::1" +#define SRC6_REWRITE_IP TEST_IPV6 +#define WILDCARD6_IP "::" +#define SERV6_PORT 6060 +#define SERV6_REWRITE_PORT 6666 + #define SERVUN_ADDRESS "bpf_cgroup_unix_test" #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" -#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" + +#define save_errno_do(op) ({ int __save = errno; op; errno = __save; }) enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, @@ -23,152 +63,955 @@ enum sock_addr_test_type { SOCK_ADDR_TEST_GETPEERNAME, }; -typedef void *(*load_fn)(int cgroup_fd); +typedef void *(*load_fn)(int cgroup_fd, + enum bpf_attach_type attach_type, + bool expect_reject); typedef void (*destroy_fn)(void *skel); -struct sock_addr_test { - enum sock_addr_test_type type; - const char *name; - /* BPF prog properties */ - load_fn loadfn; - destroy_fn destroyfn; - /* Socket properties */ - int socket_family; - int socket_type; - /* IP:port pairs for BPF prog to override */ - const char *requested_addr; - unsigned short requested_port; - const char *expected_addr; - unsigned short expected_port; - const char *expected_src_addr; +static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, + const struct sockaddr_storage *addr2, socklen_t addr2_len, + bool cmp_port); + +struct init_sock_args { + int af; + int type; }; -static void *connect_unix_prog_load(int cgroup_fd) -{ - struct connect_unix_prog *skel; +struct addr_args { + char addr[sizeof(struct sockaddr_storage)]; + int addrlen; +}; - skel = connect_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; - skel->links.connect_unix_prog = bpf_program__attach_cgroup( - skel->progs.connect_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.connect_unix_prog, "prog_attach")) - goto cleanup; +static struct sock_addr_kern *skel; - return skel; -cleanup: - connect_unix_prog__destroy(skel); - return NULL; +static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_program *prog; + int prog_fd, err; + + topts.ctx_in = ctx; + topts.ctx_size_in = ctx_size; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto err; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, prog_name)) + goto err; + + err = topts.retval; + errno = -topts.retval; + goto out; +err: + err = -1; +out: + return err; } -static void connect_unix_prog_destroy(void *skel) +static int kernel_init_sock(int af, int type, int protocol) { - connect_unix_prog__destroy(skel); + struct init_sock_args args = { + .af = af, + .type = type, + }; + + return run_bpf_prog("init_sock", &args, sizeof(args)); } -static void *sendmsg_unix_prog_load(int cgroup_fd) +static int kernel_close_sock(int fd) { - struct sendmsg_unix_prog *skel; + return run_bpf_prog("close_sock", NULL, 0); +} - skel = sendmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +static int sock_addr_op(const char *name, struct sockaddr *addr, + socklen_t *addrlen, bool expect_change) +{ + struct addr_args args; + int err; - skel->links.sendmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.sendmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.sendmsg_unix_prog, "prog_attach")) - goto cleanup; + if (addrlen) + args.addrlen = *addrlen; - return skel; -cleanup: - sendmsg_unix_prog__destroy(skel); - return NULL; + if (addr) + memcpy(&args.addr, addr, *addrlen); + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!expect_change && addr) + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + *addrlen, + (struct sockaddr_storage *)&args.addr, + args.addrlen, 1), + 0, "address_param_modified")) + return -1; + + if (addrlen) + *addrlen = args.addrlen; + + if (addr) + memcpy(addr, &args.addr, *addrlen); + + return err; } -static void sendmsg_unix_prog_destroy(void *skel) +static int send_msg_op(const char *name, struct sockaddr *addr, + socklen_t addrlen, const char *msg, int msglen) { - sendmsg_unix_prog__destroy(skel); + struct sendmsg_args args; + int err; + + memset(&args, 0, sizeof(args)); + memcpy(&args.addr.addr, addr, addrlen); + args.addr.addrlen = addrlen; + memcpy(args.msg, msg, msglen); + args.msglen = msglen; + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + addrlen, + (struct sockaddr_storage *)&args.addr.addr, + args.addr.addrlen, 1), + 0, "address_param_modified")) + return -1; + + return err; } -static void *recvmsg_unix_prog_load(int cgroup_fd) +static int kernel_connect(struct sockaddr *addr, socklen_t addrlen) { - struct recvmsg_unix_prog *skel; - - skel = recvmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; + return sock_addr_op("kernel_connect", addr, &addrlen, false); +} - skel->links.recvmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.recvmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.recvmsg_unix_prog, "prog_attach")) - goto cleanup; +static int kernel_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_bind", addr, &addrlen, false); +} - return skel; -cleanup: - recvmsg_unix_prog__destroy(skel); - return NULL; +static int kernel_listen(void) +{ + return sock_addr_op("kernel_listen", NULL, NULL, false); } -static void recvmsg_unix_prog_destroy(void *skel) +static int kernel_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - recvmsg_unix_prog__destroy(skel); + return send_msg_op("kernel_sendmsg", addr, addrlen, msg, msglen); } -static void *getsockname_unix_prog_load(int cgroup_fd) +static int sock_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - struct getsockname_unix_prog *skel; + return send_msg_op("sock_sendmsg", addr, addrlen, msg, msglen); +} - skel = getsockname_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; +static int kernel_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getsockname", addr, addrlen, true); +} - skel->links.getsockname_unix_prog = bpf_program__attach_cgroup( - skel->progs.getsockname_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getsockname_unix_prog, "prog_attach")) - goto cleanup; +static int kernel_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getpeername", addr, addrlen, true); +} - return skel; -cleanup: - getsockname_unix_prog__destroy(skel); - return NULL; +int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) +{ + int err; + + if (!ASSERT_OK(kernel_init_sock(addr->ss_family, type, 0), + "kernel_init_sock")) + goto err; + + if (kernel_connect((struct sockaddr *)addr, addrlen) < 0) + goto err; + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); +out: + return err; } -static void getsockname_unix_prog_destroy(void *skel) +int kernel_start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { - getsockname_unix_prog__destroy(skel); + struct sockaddr_storage addr; + socklen_t addrlen; + int err; + + if (!ASSERT_OK(kernel_init_sock(family, type, 0), "kernel_init_sock")) + goto err; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + goto err; + + if (kernel_bind(0, (struct sockaddr *)&addr, addrlen) < 0) + goto err; + + if (type == SOCK_STREAM) { + if (!ASSERT_OK(kernel_listen(), "kernel_listen")) + goto err; + } + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); +out: + return err; } -static void *getpeername_unix_prog_load(int cgroup_fd) +struct sock_ops { + int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, + socklen_t addrlen, + const struct network_helper_opts *opts); + int (*start_server)(int family, int type, const char *addr_str, + __u16 port, int timeout_ms); + int (*socket)(int famil, int type, int protocol); + int (*bind)(int fd, struct sockaddr *addr, socklen_t addrlen); + int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*sendmsg)(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen); + int (*close)(int fd); +}; + +static int user_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) { - struct getpeername_unix_prog *skel; + struct msghdr hdr; + struct iovec iov; - skel = getpeername_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; + memset(&iov, 0, sizeof(iov)); + iov.iov_base = msg; + iov.iov_len = msglen; - skel->links.getpeername_unix_prog = bpf_program__attach_cgroup( - skel->progs.getpeername_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getpeername_unix_prog, "prog_attach")) - goto cleanup; + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_name = (void *)addr; + hdr.msg_namelen = addrlen; + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; - return skel; -cleanup: - getpeername_unix_prog__destroy(skel); - return NULL; + return sendmsg(fd, &hdr, 0); } -static void getpeername_unix_prog_destroy(void *skel) +static int user_bind(int fd, struct sockaddr *addr, socklen_t addrlen) { - getpeername_unix_prog__destroy(skel); + return bind(fd, (const struct sockaddr *)addr, addrlen); +} + +struct sock_ops user_ops = { + .connect_to_addr = connect_to_addr, + .start_server = start_server, + .socket = socket, + .bind = user_bind, + .getsockname = getsockname, + .getpeername = getpeername, + .sendmsg = user_sendmsg, + .close = close, +}; + +struct sock_ops kern_ops_sock_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = sock_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_ops kern_ops_kernel_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = kernel_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_addr_test { + enum sock_addr_test_type type; + const char *name; + /* BPF prog properties */ + load_fn loadfn; + destroy_fn destroyfn; + enum bpf_attach_type attach_type; + /* Socket operations */ + struct sock_ops *ops; + /* Socket properties */ + int socket_family; + int socket_type; + /* IP:port pairs for BPF prog to override */ + const char *requested_addr; + unsigned short requested_port; + const char *expected_addr; + unsigned short expected_port; + const char *expected_src_addr; + /* Expected test result */ + enum { + LOAD_REJECT, + ATTACH_REJECT, + SYSCALL_EPERM, + SYSCALL_ENOTSUPP, + SUCCESS, + } expected_result; +}; + +#define BPF_SKEL_FUNCS_RAW(skel_name, prog_name) \ +static void *prog_name##_load_raw(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + int prog_fd = -1; \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(skel_name##__load(skel), "load")) \ + goto cleanup; \ + prog_fd = bpf_program__fd(skel->progs.prog_name); \ + if (!ASSERT_GT(prog_fd, 0, "prog_fd")) \ + goto cleanup; \ + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, \ + BPF_F_ALLOW_OVERRIDE), "bpf_prog_attach") { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ +cleanup: \ + if (prog_fd > 0) \ + bpf_prog_detach(cgroup_fd, attach_type); \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy_raw(void *progfd) \ +{ \ + /* No-op. *_load_raw does all cleanup. */ \ +} \ + +#define BPF_SKEL_FUNCS(skel_name, prog_name) \ +static void *prog_name##_load(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(bpf_program__set_expected_attach_type(skel->progs.prog_name, \ + attach_type), \ + "set_expected_attach_type")) \ + goto cleanup; \ + if (skel_name##__load(skel)) { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ + skel->links.prog_name = bpf_program__attach_cgroup( \ + skel->progs.prog_name, cgroup_fd); \ + if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ + goto cleanup; \ + return skel; \ +cleanup: \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy(void *skel) \ +{ \ + skel_name##__destroy(skel); \ } +BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind4_prog, bind_v4_deny_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_deny_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_deny_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS_RAW(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS_RAW(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); +BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS_RAW(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS_RAW(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS_RAW(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS_RAW(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS_RAW(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS_RAW(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS_RAW(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); +BPF_SKEL_FUNCS_RAW(getpeername6_prog, getpeername_v6_prog); + static struct sock_addr_test tests[] = { + /* bind - system calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (stream)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (dgram)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: load prog with wrong expected attach type", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: attach prog with wrong attach type", + bind_v4_prog_load_raw, + bind_v4_prog_destroy_raw, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (stream)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (dgram)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: load prog with wrong expected attach type", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: attach prog with wrong attach type", + bind_v6_prog_load_raw, + bind_v6_prog_destroy_raw, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + + /* bind - kernel calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (stream)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (dgram)", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (stream)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (dgram)", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, + + /* connect - system calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (stream)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (dgram)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: load prog with wrong expected attach type", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: attach prog with wrong attach type", + connect_v4_prog_load_raw, + connect_v4_prog_destroy_raw, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (stream)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, - "connect_unix", + "connect6: connect (dgram)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: load prog with wrong expected attach type", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: attach prog with wrong attach type", + connect_v6_prog_load_raw, + connect_v6_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -176,12 +1019,631 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect deny (stream)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: attach prog with wrong attach type", + connect_unix_prog_load_raw, + connect_unix_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* connect - kernel calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (stream)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (dgram)", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (stream)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (dgram)", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect (dgram)", + connect_unix_prog_load, + connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect deny (dgram)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + + /* sendmsg - system calls */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: load prog with wrong expected attach type", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: attach prog with wrong attach type", + sendmsg_v4_prog_load_raw, + sendmsg_v4_prog_destroy_raw, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg IPv4-mapped IPv6 (dgram)", + sendmsg_v6_v4mapped_prog_load, + sendmsg_v6_v4mapped_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_ENOTSUPP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram)", + sendmsg_v6_wildcard_prog_load, + sendmsg_v6_wildcard_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: load prog with wrong expected attach type", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: attach prog with wrong attach type", + sendmsg_v6_prog_load_raw, + sendmsg_v6_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: attach prog with wrong attach type", + sendmsg_unix_prog_load_raw, + sendmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* sendmsg - kernel calls (sock_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + + /* sendmsg - kernel calls (kernel_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg (dgram)", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg (dgram)", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, - "sendmsg_unix", + "sendmsg6: kernel_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_DGRAM, SERVUN_ADDRESS, @@ -189,12 +1651,97 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: kernel_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, + + /* recvmsg - system calls */ + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: recvfrom (dgram)", + recvmsg4_prog_load, + recvmsg4_prog_destroy, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: attach prog with wrong attach type", + recvmsg4_prog_load_raw, + recvmsg4_prog_destroy_raw, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + ATTACH_REJECT, }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-dgram", + "recvmsg6: recvfrom (dgram)", + recvmsg6_prog_load, + recvmsg6_prog_destroy, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: attach prog with wrong attach type", + recvmsg6_prog_load_raw, + recvmsg6_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_REWRITE_ADDRESS, @@ -202,12 +1749,15 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-stream", + "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_REWRITE_ADDRESS, @@ -215,12 +1765,227 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: attach prog with wrong attach type", + recvmsg_unix_prog_load_raw, + recvmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_ADDRESS, + ATTACH_REJECT, }, + + /* getsockname - system calls */ { SOCK_ADDR_TEST_GETSOCKNAME, - "getsockname_unix", + "getsockname4: getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: attach prog with wrong attach type", + getsockname_v4_prog_load_raw, + getsockname_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: attach prog with wrong attach type", + getsockname_v6_prog_load_raw, + getsockname_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: getsockname", + getsockname_unix_prog_load, + getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: attach prog with wrong attach type", + getsockname_unix_prog_load_raw, + getsockname_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* getsockname - kernel calls */ + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: kernel_getsockname", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -228,12 +1993,113 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, + }, + + /* getpeername - system calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_GETPEERNAME, - "getpeername_unix", + "getpeername4: attach prog with wrong attach type", + getpeername_v4_prog_load_raw, + getpeername_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: attach prog with wrong attach type", + getpeername_v6_prog_load_raw, + getpeername_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: getpeername", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -241,6 +2107,105 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: attach prog with wrong attach type", + getpeername_unix_prog_load_raw, + getpeername_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, + + /* getpeername - kernel calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: kernel_getpeername", + getpeername_unix_prog_load, + getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, }, }; @@ -294,28 +2259,40 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) +static int load_sock_addr_kern(void) { - return cmp_sock_addr(getsockname, sock1, addr2, addr2_len, cmp_port); + int err; + + skel = sock_addr_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto err; + + err = 0; + goto out; +err: + err = -1; +out: + return err; } -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) +static void unload_sock_addr_kern(void) { - return cmp_sock_addr(getpeername, sock1, addr2, addr2_len, cmp_port); + sock_addr_kern__destroy(skel); } -static void test_bind(struct sock_addr_test *test) +static int test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, client = -1, err; - serv = start_server(test->socket_family, test->socket_type, - test->requested_addr, test->requested_port, 0); - if (!ASSERT_GE(serv, 0, "start_server")) - goto cleanup; + serv = test->ops->start_server(test->socket_family, test->socket_type, + test->requested_addr, + test->requested_port, 0); + if (serv < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, @@ -323,23 +2300,28 @@ static void test_bind(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; /* Try to connect to server just in case */ - client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; cleanup: + err = 0; +err: if (client != -1) close(client); if (serv != -1) - close(serv); + test->ops->close(serv); + + return err; } -static void test_connect(struct sock_addr_test *test) +static int test_connect(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr, expected_src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -357,9 +2339,12 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); - if (!ASSERT_GE(client, 0, "connect_to_addr")) - goto cleanup; + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); + if (client < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, &expected_addr, &expected_addr_len); @@ -373,29 +2358,34 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; if (test->expected_src_addr) { - err = cmp_local_addr(client, &expected_src_addr, expected_src_addr_len, false); + err = cmp_sock_addr(test->ops->getsockname, client, + &expected_src_addr, expected_src_addr_len, + false); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; } cleanup: + err = 0; +err: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_xmsg(struct sock_addr_test *test) +static int test_xmsg(struct sock_addr_test *test) { struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), src_addr_len = sizeof(struct sockaddr_storage); - struct msghdr hdr; - struct iovec iov; char data = 'a'; int serv = -1, client = -1, err; @@ -408,7 +2398,7 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; - client = socket(test->socket_family, test->socket_type, 0); + client = test->ops->socket(test->socket_family, test->socket_type, 0); if (!ASSERT_GE(client, 0, "socket")) goto cleanup; @@ -418,7 +2408,8 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = bind(client, (const struct sockaddr *) &src_addr, src_addr_len); + err = test->ops->bind(client, (struct sockaddr *)&src_addr, + src_addr_len); if (!ASSERT_OK(err, "bind")) goto cleanup; } @@ -429,17 +2420,13 @@ static void test_xmsg(struct sock_addr_test *test) goto cleanup; if (test->socket_type == SOCK_DGRAM) { - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)&addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; + err = test->ops->sendmsg(client, (struct sockaddr *)&addr, + addr_len, &data, sizeof(data)); + if (err < 0) { + err = errno; + goto err; + } - err = sendmsg(client, &hdr, 0); if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -489,19 +2476,23 @@ static void test_xmsg(struct sock_addr_test *test) } cleanup: + err = 0; +err: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_getsockname(struct sock_addr_test *test) +static int test_getsockname(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, err; - serv = start_server(test->socket_family, test->socket_type, + serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -512,16 +2503,18 @@ static void test_getsockname(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; cleanup: if (serv != -1) - close(serv); + test->ops->close(serv); + + return 0; } -static void test_getpeername(struct sock_addr_test *test) +static int test_getpeername(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -538,7 +2531,8 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -547,19 +2541,58 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); + + return 0; +} + +static int setup_test_env(struct nstoken **tok) +{ + int err; + + SYS_NOFAIL("ip netns delete %s", TEST_NS); + SYS(fail, "ip netns add %s", TEST_NS); + *tok = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*tok, "netns token")) + goto fail; + + SYS(fail, "ip link add dev %s1 type veth peer name %s2", TEST_IF_PREFIX, + TEST_IF_PREFIX); + SYS(fail, "ip link set lo up"); + SYS(fail, "ip link set %s1 up", TEST_IF_PREFIX); + SYS(fail, "ip link set %s2 up", TEST_IF_PREFIX); + SYS(fail, "ip -4 addr add %s/8 dev %s1", TEST_IPV4, TEST_IF_PREFIX); + SYS(fail, "ip -6 addr add %s/128 nodad dev %s1", TEST_IPV6, TEST_IF_PREFIX); + + err = 0; + goto out; +fail: + err = -1; + close_netns(*tok); + *tok = NULL; + SYS_NOFAIL("ip netns delete %s", TEST_NS); +out: + return err; +} + +static void cleanup_test_env(struct nstoken *tok) +{ + close_netns(tok); + SYS_NOFAIL("ip netns delete %s", TEST_NS); } void test_sock_addr(void) { + struct nstoken *tok = NULL; int cgroup_fd = -1; void *skel; @@ -567,13 +2600,22 @@ void test_sock_addr(void) if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) goto cleanup; + if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) + goto cleanup; + + if (!ASSERT_OK(load_sock_addr_kern(), "load_sock_addr_kern")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; + int err; if (!test__start_subtest(test->name)) continue; - skel = test->loadfn(cgroup_fd); + skel = test->loadfn(cgroup_fd, test->attach_type, + test->expected_result == LOAD_REJECT || + test->expected_result == ATTACH_REJECT); if (!skel) continue; @@ -583,30 +2625,39 @@ void test_sock_addr(void) * the future. */ case SOCK_ADDR_TEST_BIND: - test_bind(test); + err = test_bind(test); break; case SOCK_ADDR_TEST_CONNECT: - test_connect(test); + err = test_connect(test); break; case SOCK_ADDR_TEST_SENDMSG: case SOCK_ADDR_TEST_RECVMSG: - test_xmsg(test); + err = test_xmsg(test); break; case SOCK_ADDR_TEST_GETSOCKNAME: - test_getsockname(test); + err = test_getsockname(test); break; case SOCK_ADDR_TEST_GETPEERNAME: - test_getpeername(test); + err = test_getpeername(test); break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); break; } + if (test->expected_result == SYSCALL_EPERM) + ASSERT_EQ(err, EPERM, "socket operation returns EPERM"); + else if (test->expected_result == SYSCALL_ENOTSUPP) + ASSERT_EQ(err, ENOTSUPP, "socket operation returns ENOTSUPP"); + else if (test->expected_result == SUCCESS) + ASSERT_OK(err, "socket operation succeeds"); + test->destroyfn(skel); } cleanup: + unload_sock_addr_kern(); + cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 77e26ecffa9d..1337153eb0ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -131,6 +131,65 @@ out: test_skmsg_load_helpers__destroy(skel); } +static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) +{ + struct bpf_program *prog, *prog_clone, *prog_clone2; + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts); + struct test_skmsg_load_helpers *skel; + struct bpf_link *link, *link2; + int err, map; + + skel = test_skmsg_load_helpers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) + return; + + prog = skel->progs.prog_msg_verdict; + prog_clone = skel->progs.prog_msg_verdict_clone; + prog_clone2 = skel->progs.prog_msg_verdict_clone2; + map = bpf_map__fd(skel->maps.sock_map); + + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + /* Fail since bpf_link for the same prog has been created. */ + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_ERR(err, "bpf_prog_attach")) + goto out; + + /* Fail since bpf_link for the same prog type has been created. */ + link2 = bpf_program__attach_sockmap(prog_clone, map); + if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { + bpf_link__detach(link2); + goto out; + } + + err = bpf_link__update_program(link, prog_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different type attempts to do update. */ + err = bpf_link__update_program(link, skel->progs.prog_skb_verdict); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + /* Fail since the old prog does not match the one in the kernel. */ + opts.old_prog_fd = bpf_program__fd(prog_clone2); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_ERR(err, "bpf_link_update")) + goto out; + + opts.old_prog_fd = bpf_program__fd(prog_clone); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_link_update")) + goto out; +out: + bpf_link__detach(link); + test_skmsg_load_helpers__destroy(skel); +} + static void test_sockmap_update(enum bpf_map_type map_type) { int err, prog, src; @@ -298,6 +357,40 @@ out: test_sockmap_skb_verdict_attach__destroy(skel); } +static void test_sockmap_skb_verdict_attach_with_link(void) +{ + struct test_sockmap_skb_verdict_attach *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + prog = skel->progs.prog_skb_verdict; + map = bpf_map__fd(skel->maps.sock_map); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + bpf_link__detach(link); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + /* Fail since attaching with the same prog/map has been done. */ + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) + bpf_link__detach(link); + + err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); + if (!ASSERT_OK(err, "bpf_prog_detach2")) + goto out; +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + static __u32 query_prog_id(int prog_fd) { struct bpf_prog_info info = {}; @@ -475,30 +568,19 @@ out: test_sockmap_drop_prog__destroy(drop); } -static void test_sockmap_skb_verdict_peek(void) +static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; - struct test_sockmap_pass_prog *pass; + int err, s, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - pass = test_sockmap_pass_prog__open_and_load(); - if (!ASSERT_OK_PTR(pass, "open_and_load")) - return; - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach")) - goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; + return; err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); if (!ASSERT_OK(err, "create_pairs(s)")) - goto out; + return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) @@ -520,7 +602,58 @@ static void test_sockmap_skb_verdict_peek(void) out_close: close(c1); close(p1); +} + +static void test_sockmap_skb_verdict_peek(void) +{ + struct test_sockmap_pass_prog *pass; + int err, map, verdict; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + +out: + test_sockmap_pass_prog__destroy(pass); +} + +static void test_sockmap_skb_verdict_peek_with_link(void) +{ + struct test_sockmap_pass_prog *pass; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + prog = pass->progs.prog_skb_verdict; + map = bpf_map__fd(pass->maps.sock_map_rx); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different attach type attempts to do update. */ + err = bpf_link__update_program(link, pass->progs.prog_skb_parser); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); out: + bpf_link__detach(link); test_sockmap_pass_prog__destroy(pass); } @@ -788,6 +921,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, BPF_SK_SKB_VERDICT); } + if (test__start_subtest("sockmap skb_verdict attach_with_link")) + test_sockmap_skb_verdict_attach_with_link(); if (test__start_subtest("sockmap msg_verdict progs query")) test_sockmap_progs_query(BPF_SK_MSG_VERDICT); if (test__start_subtest("sockmap stream_parser progs query")) @@ -804,6 +939,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) + test_sockmap_skb_verdict_peek_with_link(); if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many map entries")) @@ -812,4 +949,8 @@ void test_sockmap_basic(void) test_sockmap_many_maps(); if (test__start_subtest("sockmap same socket replace")) test_sockmap_same_sock(); + if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a92807bfcd13..e91b59366030 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int link_fd; + + link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + close(link_fd); +} + static void redir_to_listening(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + struct bpf_program *verdict = skel->progs.prog_msg_verdict; + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + struct bpf_link *link; + + link = bpf_program__attach_sockmap(verdict, sock_map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + bpf_link__detach(link); +} + static void redir_partial(int family, int sotype, int sock_map, int parser_map) { int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; @@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, TEST(test_skb_redir_to_listening), TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_connected_with_link), TEST(test_msg_redir_to_listening), + TEST(test_msg_redir_to_listening_with_link), }; const char *family_name, *map_name; const struct redir_test *t; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c index 5a4491d4edfe..eaac83a7f388 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c @@ -24,6 +24,7 @@ enum sockopt_test_error { static struct sockopt_test { const char *descr; const struct bpf_insn insns[64]; + enum bpf_prog_type prog_type; enum bpf_attach_type attach_type; enum bpf_attach_type expected_attach_type; @@ -928,9 +929,40 @@ static struct sockopt_test { .error = EPERM_SETSOCKOPT, }, + + /* ==================== prog_type ==================== */ + + { + .descr = "can attach only BPF_CGROUP_SETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_SETSOCKOPT, + .expected_attach_type = 0, + .error = DENY_ATTACH, + }, + + { + .descr = "can attach only BPF_CGROUP_GETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_GETSOCKOPT, + .expected_attach_type = 0, + .error = DENY_ATTACH, + }, }; static int load_prog(const struct bpf_insn *insns, + enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { LIBBPF_OPTS(bpf_prog_load_opts, opts, @@ -947,7 +979,7 @@ static int load_prog(const struct bpf_insn *insns, } insns_cnt++; - fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCKOPT, NULL, "GPL", insns, insns_cnt, &opts); + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); if (verbose && fd < 0) fprintf(stderr, "%s\n", bpf_log_buf); @@ -1036,13 +1068,18 @@ static int call_getsockopt(bool use_io_uring, int fd, int level, int optname, return getsockopt(fd, level, optname, optval, optlen); } -static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) +static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring, + bool use_link) { - int sock_fd, err, prog_fd; + int prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT; + int sock_fd, err, prog_fd, link_fd = -1; void *optval = NULL; int ret = 0; - prog_fd = load_prog(test->insns, test->expected_attach_type); + if (test->prog_type) + prog_type = test->prog_type; + + prog_fd = load_prog(test->insns, prog_type, test->expected_attach_type); if (prog_fd < 0) { if (test->error == DENY_LOAD) return 0; @@ -1051,7 +1088,12 @@ static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + if (use_link) { + err = bpf_link_create(prog_fd, cgroup_fd, test->attach_type, NULL); + link_fd = err; + } else { + err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + } if (err < 0) { if (test->error == DENY_ATTACH) goto close_prog_fd; @@ -1142,7 +1184,12 @@ free_optval: close_sock_fd: close(sock_fd); detach_prog: - bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + if (use_link) { + if (link_fd >= 0) + close(link_fd); + } else { + bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + } close_prog_fd: close(prog_fd); return ret; @@ -1160,10 +1207,12 @@ void test_sockopt(void) if (!test__start_subtest(tests[i].descr)) continue; - ASSERT_OK(run_test(cgroup_fd, &tests[i], false), + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, false), + tests[i].descr); + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, true), tests[i].descr); if (tests[i].io_uring_support) - ASSERT_OK(run_test(cgroup_fd, &tests[i], true), + ASSERT_OK(run_test(cgroup_fd, &tests[i], true, false), tests[i].descr); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 917f486db826..1d3a20f01b60 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include "cgroup_helpers.h" +#include "network_helpers.h" #include "sockopt_inherit.skel.h" @@ -9,35 +10,6 @@ #define CUSTOM_INHERIT2 1 #define CUSTOM_LISTENER 2 -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} - static int verify_sockopt(int fd, int optname, const char *msg, char expected) { socklen_t optlen = 1; @@ -98,47 +70,36 @@ static void *server_thread(void *arg) return (void *)(long)err; } -static int start_server(void) +static int custom_cb(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; char buf; int err; - int fd; int i; - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) { buf = 0x01; err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1); if (err) { log_err("Failed to call setsockopt(%d)", i); - close(fd); return -1; } } - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; + return 0; } static void run_test(int cgroup_fd) { struct bpf_link *link_getsockopt = NULL; struct bpf_link *link_setsockopt = NULL; + struct network_helper_opts opts = { + .post_socket_cb = custom_cb, + }; int server_fd = -1, client_fd; + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; struct sockopt_inherit *obj; void *server_err; pthread_t tid; @@ -160,7 +121,8 @@ static void run_test(int cgroup_fd) if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt")) goto close_bpf_object; - server_fd = start_server(); + server_fd = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr, + sizeof(addr), &opts); if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_bpf_object; @@ -173,7 +135,7 @@ static void run_test(int cgroup_fd) pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(server_fd, 0); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto close_server_fd; diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 5db9eec24b5b..0832fd787457 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -35,7 +35,7 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (pmu_fd < 0 && errno == ENOENT) { + if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index dbe06aeaa2b2..b1073d36d77a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void) __u64 tstamp = 0; nstoken = open_netns(NS_DST); - if (!nstoken) + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return -1; srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 8fe84da1b9b4..f2b99d95d916 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; static void send_byte(int fd) @@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, err++; } + /* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */ + if (val.mrtt_us == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us); + err++; + } + + if (val.srtt == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt); + err++; + } + return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index ee5372c7f2c7..29e183a80f49 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -4,6 +4,8 @@ #include <time.h> #include "struct_ops_module.skel.h" +#include "struct_ops_nulled_out_cb.skel.h" +#include "struct_ops_forgotten_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -66,6 +68,7 @@ static void test_struct_ops_load(void) * auto-loading, or it will fail to load. */ bpf_program__set_autoload(skel->progs.test_2, false); + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); err = struct_ops_module__load(skel); if (!ASSERT_OK(err, "struct_ops_module_load")) @@ -93,9 +96,163 @@ cleanup: struct_ops_module__destroy(skel); } +static void test_struct_ops_not_zeroed(void) +{ + struct struct_ops_module *skel; + int err; + + /* zeroed is 0, and zeroed_op is null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + skel->struct_ops.testmod_zeroed->zeroed = 0; + /* zeroed_op prog should be not loaded automatically now */ + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + + err = struct_ops_module__load(skel); + ASSERT_OK(err, "struct_ops_module_load"); + + struct_ops_module__destroy(skel); + + /* zeroed is not 0 */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed")) + return; + + /* libbpf should reject the testmod_zeroed since struct + * bpf_testmod_ops in the kernel has no "zeroed" field and the + * value of "zeroed" is non-zero. + */ + skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); + + struct_ops_module__destroy(skel); + + /* zeroed_op is not null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed_op")) + return; + + /* libbpf should reject the testmod_zeroed since the value of its + * "zeroed_op" is not null. + */ + skel->struct_ops.testmod_zeroed->zeroed_op = skel->progs.test_3; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed_op"); + + struct_ops_module__destroy(skel); +} + +/* The signature of an implementation might not match the signature of the + * function pointer prototype defined in the BPF program. This mismatch + * should be allowed as long as the behavior of the operator program + * adheres to the signature in the kernel. Libbpf should not enforce the + * signature; rather, let the kernel verifier handle the enforcement. + */ +static void test_struct_ops_incompatible(void) +{ + struct struct_ops_module *skel; + struct bpf_link *link; + int err; + + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); + + err = struct_ops_module__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); + if (ASSERT_OK_PTR(link, "attach_struct_ops")) + bpf_link__destroy(link); + +cleanup: + struct_ops_module__destroy(skel); +} + +/* validate that it's ok to "turn off" callback that kernel supports */ +static void test_struct_ops_nulled_out_cb(void) +{ + struct struct_ops_nulled_out_cb *skel; + int err; + + skel = struct_ops_nulled_out_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* kernel knows about test_1, but we still null it out */ + skel->struct_ops.ops->test_1 = NULL; + + err = struct_ops_nulled_out_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + ASSERT_FALSE(bpf_program__autoload(skel->progs.test_1_turn_off), "prog_autoload"); + ASSERT_LT(bpf_program__fd(skel->progs.test_1_turn_off), 0, "prog_fd"); + +cleanup: + struct_ops_nulled_out_cb__destroy(skel); +} + +/* validate that libbpf generates reasonable error message if struct_ops is + * not referenced in any struct_ops map + */ +static void test_struct_ops_forgotten_cb(void) +{ + struct struct_ops_forgotten_cb *skel; + char *log; + int err; + + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + start_libbpf_log_capture(); + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_ERR(err, "skel_load")) + goto cleanup; + + log = stop_libbpf_log_capture(); + ASSERT_HAS_SUBSTR(log, + "prog 'test_1_forgotten': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?", + "libbpf_log"); + free(log); + + struct_ops_forgotten_cb__destroy(skel); + + /* now let's programmatically use it, we should be fine now */ + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->struct_ops.ops->test_1 = skel->progs.test_1_forgotten; /* not anymore */ + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + +cleanup: + struct_ops_forgotten_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { - if (test__start_subtest("test_struct_ops_load")) + if (test__start_subtest("struct_ops_load")) test_struct_ops_load(); + if (test__start_subtest("struct_ops_not_zeroed")) + test_struct_ops_not_zeroed(); + if (test__start_subtest("struct_ops_incompatible")) + test_struct_ops_incompatible(); + if (test__start_subtest("struct_ops_null_out_cb")) + test_struct_ops_nulled_out_cb(); + if (test__start_subtest("struct_ops_forgotten_cb")) + test_struct_ops_forgotten_cb(); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 5f1fb0a2ea56..cec746e77cd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); if (!ASSERT_OK(err, "test_ping")) goto done; @@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); close_netns(nstoken); if (!ASSERT_OK(err, "test_ping")) diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 7b9124d506a5..e56e88596d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -5,18 +5,19 @@ #include "trace_printk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_printk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -35,16 +36,6 @@ void serial_test_trace_printk(void) if (!ASSERT_OK(err, "trace_printk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_printk_lskel__detach(skel); @@ -56,21 +47,12 @@ void serial_test_trace_printk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_printk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) goto cleanup; cleanup: trace_printk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c index 44ea2fd88f4c..2af6a6f2096a 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c @@ -5,18 +5,19 @@ #include "trace_vprintk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "1,2,3,4,5,6,7,8,9,10" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_vprintk(void) { struct trace_vprintk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_vprintk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_vprintk_lskel__open_and_load(); if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load")) @@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void) if (!ASSERT_OK(err, "trace_vprintk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_vprintk_lskel__detach(skel); @@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_vprintk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found")) goto cleanup; @@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void) cleanup: trace_vprintk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c4f9f306646e..c60db8beeb73 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -66,6 +66,7 @@ #include "verifier_sdiv.skel.h" #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" +#include "verifier_sock_addr.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -181,6 +182,7 @@ void test_verifier_scalar_ids(void) { RUN(verifier_scalar_ids); } void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } +void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..3918ecc2ee91 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> + +#include "verifier_kfunc_prog_types.skel.h" + +void test_verifier_kfunc_prog_types(void) +{ + RUN_TESTS(verifier_kfunc_prog_types); +} diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c new file mode 100644 index 000000000000..99e438fe12ac --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires */ +#include <test_progs.h> +#include "wq.skel.h" +#include "wq_failures.skel.h" + +void serial_test_wq(void) +{ + struct wq *wq_skel = NULL; + int err, prog_fd; + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq); + + /* re-run the success test to check if the timer was actually executed */ + + wq_skel = wq__open_and_load(); + if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + return; + + err = wq__attach(wq_skel); + if (!ASSERT_OK(err, "wq_attach")) + return; + + prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + + ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); + wq__destroy(wq_skel); +} + +void serial_test_failures_wq(void) +{ + RUN_TESTS(wq_failures); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 498d3bdaa4b0..bad0ea167be7 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -107,8 +107,8 @@ void test_xdp_do_redirect(void) .attach_point = BPF_TC_INGRESS); memcpy(&data[sizeof(__u64)], &pkt_udp, sizeof(pkt_udp)); - *((__u32 *)data) = 0x42; /* metadata test value */ - *((__u32 *)data + 4) = 0; + ((__u32 *)data)[0] = 0x42; /* metadata test value */ + ((__u32 *)data)[1] = 0; skel = test_xdp_do_redirect__open(); if (!ASSERT_OK_PTR(skel, "skel")) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 05edcf32f528..f76b5d67a3ee 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -384,6 +384,8 @@ void test_xdp_metadata(void) SYS(out, "ip netns add " RX_NETNS_NAME); tok = open_netns(TX_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME); @@ -400,6 +402,8 @@ void test_xdp_metadata(void) SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN); switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; SYS(out, "ip link set dev " RX_NAME " address " RX_MAC); SYS(out, "ip link set dev " RX_NAME " up"); @@ -449,6 +453,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */ tx_ifindex = if_nametoindex(TX_NAME); @@ -461,6 +467,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; /* Verify packet sent from AF_XDP has proper metadata. */ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0, @@ -468,6 +476,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; complete_tx(&tx_xsk); /* Now check metadata of packet, generated with network stack */ @@ -475,6 +485,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0, "verify_xsk_metadata")) @@ -498,6 +510,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Send packet to trigger . */ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, @@ -505,6 +519,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; while (!retries--) { if (bpf_obj2->bss->called) diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c new file mode 100644 index 000000000000..55f10563208d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 10); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u32 pid = 0; + +#undef __arena +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) +#define __arena __attribute__((address_space(1))) +#else +#define __arena SEC(".addr_space.1") +#endif + +__u64 __arena add64_value = 1; +__u64 __arena add64_result = 0; +__u32 __arena add32_value = 1; +__u32 __arena add32_result = 0; +__u64 __arena add_stack_value_copy = 0; +__u64 __arena add_stack_result = 0; +__u64 __arena add_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int add(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 __arena sub64_value = 1; +__s64 __arena sub64_result = 0; +__s32 __arena sub32_value = 1; +__s32 __arena sub32_result = 0; +__s64 __arena sub_stack_value_copy = 0; +__s64 __arena sub_stack_result = 0; +__s64 __arena sub_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int sub(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = __sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 __arena and64_value = (0x110ull << 32); +__u32 __arena and32_value = 0x110; + +SEC("raw_tp/sys_enter") +int and(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + + __sync_fetch_and_and(&and64_value, 0x011ull << 32); + __sync_fetch_and_and(&and32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena or32_value = 0x110; +__u64 __arena or64_value = (0x110ull << 32); + +SEC("raw_tp/sys_enter") +int or(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_or(&or64_value, 0x011ull << 32); + __sync_fetch_and_or(&or32_value, 0x011); +#endif + + return 0; +} + +__u64 __arena xor64_value = (0x110ull << 32); +__u32 __arena xor32_value = 0x110; + +SEC("raw_tp/sys_enter") +int xor(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + __sync_fetch_and_xor(&xor32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena cmpxchg32_value = 1; +__u32 __arena cmpxchg32_result_fail = 0; +__u32 __arena cmpxchg32_result_succeed = 0; +__u64 __arena cmpxchg64_value = 1; +__u64 __arena cmpxchg64_result_fail = 0; +__u64 __arena cmpxchg64_result_succeed = 0; + +SEC("raw_tp/sys_enter") +int cmpxchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); +#endif + + return 0; +} + +__u64 __arena xchg64_value = 1; +__u64 __arena xchg64_result = 0; +__u32 __arena xchg32_value = 1; +__u32 __arena xchg32_result = 0; + +SEC("raw_tp/sys_enter") +int xchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index c0422c58cee2..93bd0600eba0 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -49,7 +49,7 @@ int arena_list_add(void *ctx) list_head = &global_head; - for (i = zero; i < cnt; cond_break, i++) { + for (i = zero; i < cnt && can_loop; i++) { struct elem __arena *n = bpf_alloc(sizeof(*n)); test_val++; diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c index e4bfbba6c193..c8ec0d0368e4 100644 --- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c @@ -61,14 +61,15 @@ SEC("lsm.s/socket_post_create") int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { + struct sock *sk = sock->sk; struct storage *stg; __u32 pid; pid = bpf_get_current_pid_tgid() >> 32; - if (pid != bench_pid) + if (pid != bench_pid || !sk) return 0; - stg = bpf_sk_storage_get(&sk_storage_map, sock->sk, NULL, + stg = bpf_sk_storage_get(&sk_storage_map, sk, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); if (stg) diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index a487f60b73ac..b7ddf8ec4ee8 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -12,6 +12,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include "bind_prog.h" + #define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ #define SERV4_PORT 4040 #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ @@ -118,23 +120,23 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + user_ip4 |= load_byte(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 1, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 2, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 3, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + user_ip4 |= load_word(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_word(ctx->user_ip4, 1, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; @@ -156,4 +158,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind4") +int bind_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index d62cd9e9cf0e..501c3fc11d35 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -12,6 +12,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include "bind_prog.h" + #define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ #define SERV6_IP_1 0x12345678 #define SERV6_IP_2 0x00000000 @@ -129,25 +131,25 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + user_ip6 |= load_byte(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 1, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 2, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 3, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + user_ip6 |= load_word(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_word(ctx->user_ip6[i], 1, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } @@ -173,4 +175,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind6") +int bind_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind_prog.h b/tools/testing/selftests/bpf/progs/bind_prog.h new file mode 100644 index 000000000000..e830caa940c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind_prog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BIND_PROG_H__ +#define __BIND_PROG_H__ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[b] << 8 * b) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * w) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[(b) + (sizeof(src) - (s))] << 8 * ((s) - (b) - 1)) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * (((s) / 2) - (w) - 1)) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#endif diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c new file mode 100644 index 000000000000..1654a530aa3d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Highlights: + * 1. The major difference between this bpf program and tcp_cubic.c + * is that this bpf program relies on `cong_control` rather than + * `cong_avoid` in the struct tcp_congestion_ops. + * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and + * tcp_update_pacing_rate is bypassed when `cong_control` is + * defined, so moving these logic to `cong_control`. + * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c. + * The main purpose is to show use cases of the arguments in + * `cong_control`. For simplicity's sake, it reuses tcp cubic's + * kernel functions. + */ + +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define USEC_PER_SEC 1000000UL +#define TCP_PACING_SS_RATIO (200) +#define TCP_PACING_CA_RATIO (120) +#define TCP_REORDERING (12) + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define after(seq2, seq1) before(seq1, seq2) + +extern void cubictcp_init(struct sock *sk) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; + +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} + +static __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. + */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= TCP_PACING_SS_RATIO; + else + rate *= TCP_PACING_CA_RATIO; + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (tp->srtt_us) + rate = div64_u64(rate, (__u64)tp->srtt_us); + + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int newly_lost, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; + int delta = tp->snd_ssthresh - pkts_in_flight; + + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) + return; + + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; + + if (delta < 0) { + __u64 dividend = + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); + } + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); + tp->snd_cwnd = pkts_in_flight + sndcnt; +} + +/* Decide wheather to run the increase function of congestion control. */ +static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) +{ + if (tcp_sk(sk)->reordering > TCP_REORDERING) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_init, struct sock *sk) +{ + cubictcp_init(sk); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) & + (1 << inet_csk(sk)->icsk_ca_state)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); + + if (!before(tp->snd_una, tp->high_seq)) { + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_jiffies32; + } + } + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); + tp->snd_cwnd_stamp = tcp_jiffies32; + } + + tcp_update_pacing_rate(sk); +} + +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) +{ + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) +{ + cubictcp_state(sk, new_state); +} + +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); +} + +SEC(".struct_ops") +struct tcp_congestion_ops cc_cubic = { + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_control = (void *)bpf_cubic_cong_control, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, + .name = "bpf_cc_cubic", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index c997e3e3d3fb..d665b8a15cc4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -14,14 +14,22 @@ * "ca->ack_cnt / delta" operation. */ -#include <linux/bpf.h> -#include <linux/stddef.h> -#include <linux/tcp.h> -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta @@ -70,7 +78,7 @@ static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) / (bic_scale * 10); /* BIC TCP Parameters */ -struct bictcp { +struct bpf_bictcp { __u32 cnt; /* increase cwnd by 1 after ACKs */ __u32 last_max_cwnd; /* last maximum snd_cwnd */ __u32 last_cwnd; /* the last snd_cwnd */ @@ -91,7 +99,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static inline void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bpf_bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -112,7 +120,7 @@ extern unsigned long CONFIG_HZ __kconfig; #define USEC_PER_SEC 1000000UL #define USEC_PER_JIFFY (USEC_PER_SEC / HZ) -static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; } @@ -120,7 +128,7 @@ static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) #define div64_ul div64_u64 #define BITS_PER_U64 (sizeof(__u64) * 8) -static __always_inline int fls64(__u64 x) +static int fls64(__u64 x) { int num = BITS_PER_U64 - 1; @@ -153,15 +161,15 @@ static __always_inline int fls64(__u64 x) return num + 1; } -static __always_inline __u32 bictcp_clock_us(const struct sock *sk) +static __u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } -static __always_inline void bictcp_hystart_reset(struct sock *sk) +static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; @@ -169,11 +177,10 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -/* "struct_ops/" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_init") +SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); @@ -184,12 +191,11 @@ void BPF_PROG(bpf_cubic_init, struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* "struct_ops" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_cwnd_event") +SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; __s32 delta; @@ -230,7 +236,7 @@ static const __u8 v[] = { * Newton-Raphson iteration. * Avg err ~= 0.195% */ -static __always_inline __u32 cubic_root(__u64 a) +static __u32 cubic_root(__u64 a) { __u32 x, b, shift; @@ -263,8 +269,7 @@ static __always_inline __u32 cubic_root(__u64 a) /* * Compute congestion window to use. */ -static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, - __u32 acked) +static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -377,11 +382,11 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; @@ -397,10 +402,11 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acke tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -414,7 +420,8 @@ __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -433,7 +440,7 @@ void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ -static __always_inline __u32 hystart_ack_delay(struct sock *sk) +static __u32 hystart_ack_delay(struct sock *sk) { unsigned long rate; @@ -444,10 +451,10 @@ static __always_inline __u32 hystart_ack_delay(struct sock *sk) div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); } -static __always_inline void hystart_update(struct sock *sk, __u32 delay) +static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { @@ -492,11 +499,11 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) int bpf_cubic_acked_called = 0; -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 delay; bpf_cubic_acked_called = 1; @@ -524,7 +531,8 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 460682759aed..3c9ffe340312 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,15 +6,23 @@ * the kernel BPF logic. */ -#include <stddef.h> -#include <linux/bpf.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/tcp.h> -#include <errno.h> +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_tcp_helpers.h" + +#ifndef EBUSY +#define EBUSY 16 +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} char _license[] SEC("license") = "GPL"; @@ -35,7 +43,7 @@ struct { #define DCTCP_MAX_ALPHA 1024U -struct dctcp { +struct bpf_dctcp { __u32 old_delivered; __u32 old_delivered_ce; __u32 prior_rcv_nxt; @@ -48,8 +56,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static __always_inline void dctcp_reset(const struct tcp_sock *tp, - struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct bpf_dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -57,11 +64,11 @@ static __always_inline void dctcp_reset(const struct tcp_sock *tp, ca->old_delivered_ce = tp->delivered_ce; } -SEC("struct_ops/dctcp_init") +SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { @@ -104,21 +111,21 @@ void BPF_PROG(dctcp_init, struct sock *sk) dctcp_reset(tp, ca); } -SEC("struct_ops/dctcp_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -SEC("struct_ops/dctcp_update_alpha") +SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { @@ -144,16 +151,16 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) } } -static __always_inline void dctcp_react_to_loss(struct sock *sk) +static void dctcp_react_to_loss(struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); } -SEC("struct_ops/dctcp_state") +SEC("struct_ops") void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Recovery && @@ -164,7 +171,7 @@ void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) */ } -static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) +static void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); @@ -179,9 +186,8 @@ static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ -static __always_inline -void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, - __u32 *prior_rcv_nxt, __u32 *ce_state) +static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + __u32 *prior_rcv_nxt, __u32 *ce_state) { __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; @@ -201,10 +207,10 @@ void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, dctcp_ece_ack_cwr(sk, new_ce_state); } -SEC("struct_ops/dctcp_cwnd_event") +SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: @@ -220,17 +226,17 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) } } -SEC("struct_ops/dctcp_cwnd_undo") +SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + const struct bpf_dctcp *ca = inet_csk_ca(sk); return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -SEC("struct_ops/dctcp_reno_cong_avoid") +SEC("struct_ops") void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { tcp_reno_cong_avoid(sk, ack, acked); diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index d836f7c372f0..c91763f248b2 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -1,19 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include <stddef.h> -#include <linux/bpf.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/tcp.h> +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; -void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(dctcp_nouse_release, struct sock *sk) { bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic)); diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 2ecd833dcd41..8a7a4c1b54e8 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -1,14 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> -#include <linux/types.h> -#include <bpf/bpf_helpers.h> +#include "bpf_tracing_net.h" #include <bpf/bpf_tracing.h> -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "X"; -void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(nogpltcp_init, struct sock *sk) { } diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 7001965d1cc3..59843b430f76 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -2,6 +2,9 @@ #ifndef __BPF_TRACING_NET_H__ #define __BPF_TRACING_NET_H__ +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> + #define AF_INET 2 #define AF_INET6 10 @@ -22,6 +25,7 @@ #define IP_TOS 1 +#define SOL_IPV6 41 #define IPV6_TCLASS 67 #define IPV6_AUTOFLOWLABEL 70 @@ -46,6 +50,13 @@ #define TCP_CA_NAME_MAX 16 #define TCP_NAGLE_OFF 1 +#define TCP_ECN_OK 1 +#define TCP_ECN_QUEUE_CWR 2 +#define TCP_ECN_DEMAND_CWR 4 +#define TCP_ECN_SEEN 8 + +#define TCP_CONG_NEEDS_ECN 0x2 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -80,6 +91,14 @@ #define TCP_INFINITE_SSTHRESH 0x7fffffff #define TCP_PINGPONG_THRESH 3 +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_SND_UNA_ADVANCED \ + 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + #define fib_nh_dev nh_common.nhc_dev #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw6 nh_common.nhc_gw.ipv6 @@ -119,4 +138,37 @@ #define tw_v6_daddr __tw_common.skc_v6_daddr #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr +#define tcp_jiffies32 ((__u32)bpf_jiffies64()) + +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + +static inline bool tcp_is_cwnd_limited(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tcp_in_slow_start(tp)) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); +} + #endif diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 22914a70db54..73ba32e9a693 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value { struct cgroup __kptr * cgrp; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __cgrps_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 7ef49ec04838..9e9ebf27b878 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -14,8 +14,6 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_tcp_helpers.h" - #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 @@ -32,6 +30,10 @@ #define IFNAMSIZ 16 #endif +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { @@ -197,4 +199,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) return do_bind(ctx) ? 1 : 0; } +SEC("cgroup/connect4") +int connect_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c index 40266d2c737c..e98573b00ddb 100644 --- a/tools/testing/selftests/bpf/progs/connect6_prog.c +++ b/tools/testing/selftests/bpf/progs/connect6_prog.c @@ -90,4 +90,10 @@ int connect_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect6") +int connect_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c index 2ef0e0c46d17..ba60adadb335 100644 --- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c @@ -36,4 +36,10 @@ int connect_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect_unix") +int connect_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index c705d8112a35..b979e91f55f0 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -9,7 +9,7 @@ int err; -#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) +#define private(name) SEC(".bss." #name) __attribute__((aligned(8))) private(MASK) static struct bpf_cpumask __kptr * global_mask; struct __cpumask_map_value { diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index a9bf6ea336cf..a988d2823b52 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -61,11 +61,8 @@ SEC("tp_btf/task_newtask") __failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { - struct bpf_cpumask *cpumask; - /* Can't set the CPU of a non-struct bpf_cpumask. */ bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr); - __sink(cpumask); return 0; } diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c new file mode 100644 index 000000000000..8cf7168b42d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_basic.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +int status; +SEC("syscall") +int crypto_release(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +SEC("syscall") +__failure __msg("Unreleased reference") +int crypto_acquire(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + cctx = bpf_crypto_ctx_acquire(cctx); + if (!cctx) + return -EINVAL; + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c new file mode 100644 index 000000000000..e61fe0882293 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +const volatile unsigned int len = 16; +char cipher[128] = {}; +u32 key_len, authsize; +char dst[256] = {}; +u8 key[256] = {}; +long hits = 0; +int status; + +SEC("syscall") +int crypto_setup(void *args) +{ + struct bpf_crypto_ctx *cctx; + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + int err = 0; + + status = 0; + + if (!cipher[0] || !key_len || key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, cipher, sizeof(cipher)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int crypto_encrypt(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return 0; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return 0; + } + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +SEC("tc") +int crypto_decrypt(struct __sk_buff *skb) +{ + struct bpf_dynptr psrc, pdst, iv; + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + + v = crypto_ctx_value_lookup(); + if (!v) + return -ENOENT; + + ctx = v->ctx; + if (!ctx) + return -ENOENT; + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h new file mode 100644 index 000000000000..57dd7a68a8c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_common.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#ifndef _CRYPTO_COMMON_H +#define _CRYPTO_COMMON_H + +#include "errno.h" +#include <stdbool.h> + +struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params, + u32 params__sz, int *err) __ksym; +struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym; +void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym; +int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; +int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; + +struct __crypto_ctx_value { + struct bpf_crypto_ctx __kptr * ctx; +}; + +struct array_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct __crypto_ctx_value); + __uint(max_entries, 1); +} __crypto_ctx_map SEC(".maps"); + +static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&__crypto_ctx_map, &key); +} + +static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx) +{ + struct __crypto_ctx_value local, *v; + struct bpf_crypto_ctx *old; + u32 key = 0; + int err; + + local.ctx = NULL; + err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0); + if (err) { + bpf_crypto_ctx_release(ctx); + return err; + } + + v = bpf_map_lookup_elem(&__crypto_ctx_map, &key); + if (!v) { + bpf_crypto_ctx_release(ctx); + return -ENOENT; + } + + old = bpf_kptr_xchg(&v->ctx, ctx); + if (old) { + bpf_crypto_ctx_release(old); + return -EEXIST; + } + + return 0; +} + +#endif /* _CRYPTO_COMMON_H */ diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c new file mode 100644 index 000000000000..1be0a3fa5efd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +unsigned char key[256] = {}; +u16 udp_test_port = 7777; +u32 authsize, key_len; +char algo[128] = {}; +char dst[16] = {}; +int status; + +static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) +{ + struct ipv6hdr ip6h; + struct udphdr udph; + u32 offset; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return -1; + + if (ip6h.nexthdr != IPPROTO_UDP) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return -1; + + if (udph.dest != __bpf_htons(udp_test_port)) + return -1; + + offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph); + if (skb->len < offset + 16) + return -1; + + /* let's make sure that 16 bytes of payload are in the linear part of skb */ + bpf_skb_pull_data(skb, offset + 16); + bpf_dynptr_from_skb(skb, 0, psrc); + bpf_dynptr_adjust(psrc, offset, offset + 16); + + return 0; +} + +SEC("syscall") +int skb_crypto_setup(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + if (key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int decrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +SEC("tc") +int encrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + status = 0; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index 1efa746c25dc..ec0c595d47af 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -11,8 +11,17 @@ int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; - if (!state) - return 0xf2f3f4f5; + /* Check that 'state' nullable status is detected correctly. + * If 'state' argument would be assumed non-null by verifier + * the code below would be deleted as dead (which it shouldn't). + * Hide it from the compiler behind 'asm' block to avoid + * unnecessary optimizations. + */ + asm volatile ( + "if %[state] != 0 goto +2;" + "r0 = 0xf2f3f4f5;" + "exit;" + ::[state]"p"(state)); ret = state->val; state->val = 0x5a; @@ -25,7 +34,7 @@ SEC("struct_ops/test_2") int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2, char a3, unsigned long a4) { - test_2_args[0] = (unsigned long)state; + test_2_args[0] = state->val; test_2_args[1] = a1; test_2_args[2] = a2; test_2_args[3] = a3; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 7ce7e827d5f0..66a60bfb5867 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -80,7 +80,7 @@ SEC("?raw_tp") __failure __msg("Unreleased reference id=2") int ringbuf_missing_release1(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr); @@ -1385,7 +1385,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_adjust_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_adjust(&ptr, 1, 2); @@ -1398,7 +1398,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_null_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_null(&ptr); @@ -1411,7 +1411,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_rdonly_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_rdonly(&ptr); @@ -1424,7 +1424,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_size_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_size(&ptr); @@ -1437,7 +1437,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int clone_invalid1(void *ctx) { - struct bpf_dynptr ptr1; + struct bpf_dynptr ptr1 = {}; struct bpf_dynptr ptr2; /* this should fail */ diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c index c4514dd58c62..7b5dd2214ff4 100644 --- a/tools/testing/selftests/bpf/progs/fib_lookup.c +++ b/tools/testing/selftests/bpf/progs/fib_lookup.c @@ -3,8 +3,8 @@ #include <linux/types.h> #include <linux/bpf.h> +#include <linux/pkt_cls.h> #include <bpf/bpf_helpers.h> -#include "bpf_tracing_net.h" struct bpf_fib_lookup fib_params = {}; int fib_lookup_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c new file mode 100644 index 000000000000..ff0bed7d4459 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 5); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct callback_ctx { + int output; +}; + +u32 data_output = 0; +int use_array = 0; + +static __u64 +check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + if (use_array) + bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0); + else + bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0); + data_output = data.output; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/getpeername4_prog.c b/tools/testing/selftests/bpf/progs/getpeername4_prog.c new file mode 100644 index 000000000000..4c97208cd25d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include <string.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_core_read.h> +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getpeername4") +int getpeername_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getpeername6_prog.c b/tools/testing/selftests/bpf/progs/getpeername6_prog.c new file mode 100644 index 000000000000..070e4d7f636c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include <string.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_core_read.h> +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getpeername6") +int getpeername_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname4_prog.c b/tools/testing/selftests/bpf/progs/getsockname4_prog.c new file mode 100644 index 000000000000..e298487c6347 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include <string.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_core_read.h> +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getsockname4") +int getsockname_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname6_prog.c b/tools/testing/selftests/bpf/progs/getsockname6_prog.c new file mode 100644 index 000000000000..811d10cd5525 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include <string.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_core_read.h> +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getsockname6") +int getsockname_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 3db416606f2f..fe65e0952a1e 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -673,7 +673,7 @@ static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul) static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n) { - int *t, i, sum = 0;; + int *t, i, sum = 0; while ((t = bpf_iter_num_next(it))) { i = *t; diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c index f46965053acb..4d619bea9c75 100644 --- a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c +++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c @@ -4,6 +4,10 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c new file mode 100644 index 000000000000..bbba9eb46551 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_kfuncs.h" + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; + +int pid = 0; + +__u64 kprobe_session_result[8]; + +static int session_check(void *ctx) +{ + unsigned int i; + __u64 addr; + const void *kfuncs[] = { + &bpf_fentry_test1, + &bpf_fentry_test2, + &bpf_fentry_test3, + &bpf_fentry_test4, + &bpf_fentry_test5, + &bpf_fentry_test6, + &bpf_fentry_test7, + &bpf_fentry_test8, + }; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + addr = bpf_get_func_ip(ctx); + + for (i = 0; i < ARRAY_SIZE(kfuncs); i++) { + if (kfuncs[i] == (void *) addr) { + kprobe_session_result[i]++; + break; + } + } + + /* + * Force probes for function bpf_fentry_test[5-8] not to + * install and execute the return probe + */ + if (((const void *) addr == &bpf_fentry_test5) || + ((const void *) addr == &bpf_fentry_test6) || + ((const void *) addr == &bpf_fentry_test7) || + ((const void *) addr == &bpf_fentry_test8)) + return 1; + + return 0; +} + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +SEC("kprobe.session/bpf_fentry_test*") +int test_kprobe(struct pt_regs *ctx) +{ + return session_check(ctx); +} diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c new file mode 100644 index 000000000000..d49070803e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_kfuncs.h" + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_kprobe_1_result = 0; +__u64 test_kprobe_2_result = 0; +__u64 test_kprobe_3_result = 0; + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +static int check_cookie(__u64 val, __u64 *result) +{ + long *cookie; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + cookie = bpf_session_cookie(); + + if (bpf_session_is_return()) + *result = *cookie == val ? val : 0; + else + *cookie = val; + return 0; +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_1(struct pt_regs *ctx) +{ + return check_cookie(1, &test_kprobe_1_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_2(struct pt_regs *ctx) +{ + return check_cookie(2, &test_kprobe_2_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_3(struct pt_regs *ctx) +{ + return check_cookie(3, &test_kprobe_3_result); +} diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index e5e3a8b8dd07..637e75df2e14 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -140,11 +140,12 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, 0); + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); if (!storage) return 0; @@ -155,24 +156,24 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, /* This tests that we can associate multiple elements * with the local storage. */ - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map2, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap) + if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) return 0; sk_storage_result = 0; @@ -185,11 +186,12 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 02c11d16b692..d7598538aa2d 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -103,11 +103,15 @@ static __always_inline int real_bind(struct socket *sock, int addrlen) { struct sockaddr_ll sa = {}; + struct sock *sk = sock->sk; - if (sock->sk->__sk_common.skc_family != AF_PACKET) + if (!sk) + return 1; + + if (sk->__sk_common.skc_family != AF_PACKET) return 1; - if (sock->sk->sk_kern_sock) + if (sk->sk_kern_sock) return 1; bpf_probe_read_kernel(&sa, sizeof(sa), address); diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index da30f0d59364..ab0ce1d01a4a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -110,10 +110,14 @@ DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_map, array_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_map, array_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_malloc_map, array_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, lru_hash_map, array_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_array_map, array_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_hash_map, array_of_pcpu_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, array_map, hash_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_map, hash_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_malloc_map, hash_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, lru_hash_map, hash_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_array_map, hash_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_hash_map, hash_of_pcpu_hash_maps); #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) @@ -204,6 +208,8 @@ int test_map_kptr(struct __sk_buff *ctx) TEST(hash_map); TEST(hash_malloc_map); TEST(lru_hash_map); + TEST(pcpu_array_map); + TEST(pcpu_hash_map); #undef TEST return 0; @@ -281,10 +287,14 @@ int test_map_in_map_kptr(struct __sk_buff *ctx) TEST(array_of_hash_maps); TEST(array_of_hash_malloc_maps); TEST(array_of_lru_hash_maps); + TEST(array_of_pcpu_array_maps); + TEST(array_of_pcpu_hash_maps); TEST(hash_of_array_maps); TEST(hash_of_hash_maps); TEST(hash_of_hash_malloc_maps); TEST(hash_of_lru_hash_maps); + TEST(hash_of_pcpu_array_maps); + TEST(hash_of_pcpu_hash_maps); #undef TEST return 0; diff --git a/tools/testing/selftests/bpf/progs/mptcp_sock.c b/tools/testing/selftests/bpf/progs/mptcp_sock.c index 91a0d7eff2ac..f3acb90588c7 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_sock.c +++ b/tools/testing/selftests/bpf/progs/mptcp_sock.c @@ -2,9 +2,9 @@ /* Copyright (c) 2020, Tessares SA. */ /* Copyright (c) 2022, SUSE. */ -#include <linux/bpf.h> +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> -#include "bpf_tcp_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; __u32 token = 0; diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c index 53301ae8a8f7..cbdc730c3a47 100644 --- a/tools/testing/selftests/bpf/progs/mptcpify.c +++ b/tools/testing/selftests/bpf/progs/mptcpify.c @@ -6,10 +6,14 @@ #include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; +int pid; SEC("fmod_ret/update_socket_protocol") int BPF_PROG(mptcpify, int family, int type, int protocol) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return protocol; + if ((family == AF_INET || family == AF_INET6) && type == SOCK_STREAM && (!protocol || protocol == IPPROTO_TCP)) { diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c new file mode 100644 index 000000000000..672fc368d9c4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("3 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_3(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_enable(); + bpf_preempt_enable(); + return 0; +} + +static __noinline void preempt_disable(void) +{ + bpf_preempt_disable(); +} + +static __noinline void preempt_enable(void) +{ + bpf_preempt_enable(); +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + preempt_enable(); + return 0; +} + +static __noinline void preempt_balance_subprog(void) +{ + preempt_disable(); + preempt_enable(); +} + +SEC("?tc") +__success int preempt_balance(struct __sk_buff *ctx) +{ + bpf_guard_preempt(); + return 0; +} + +SEC("?tc") +__success int preempt_balance_subprog_test(struct __sk_buff *ctx) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int preempt_sleepable_helper(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + bpf_copy_from_user(&data, sizeof(data), NULL); + bpf_preempt_enable(); + return 0; +} + +int __noinline preempt_global_subprog(void) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with preemption disabled") +int preempt_global_subprog_test(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_global_subprog(); + preempt_enable(); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index 351e79aef2fa..edc159598a0e 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -49,4 +49,10 @@ int sendmsg_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg4") +int sendmsg_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index bf9b46b806f6..36a7f960799f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -20,6 +20,11 @@ #define DST_REWRITE_IP6_2 0 #define DST_REWRITE_IP6_3 1 +#define DST_REWRITE_IP6_V4_MAPPED_0 0 +#define DST_REWRITE_IP6_V4_MAPPED_1 0 +#define DST_REWRITE_IP6_V4_MAPPED_2 0x0000FFFF +#define DST_REWRITE_IP6_V4_MAPPED_3 0xc0a80004 // 192.168.0.4 + #define DST_REWRITE_PORT6 6666 SEC("cgroup/sendmsg6") @@ -59,4 +64,56 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_0); + ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_1); + ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_2); + ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_3); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_wildcard_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(0); + ctx->user_ip6[1] = bpf_htonl(0); + ctx->user_ip6[2] = bpf_htonl(0); + ctx->user_ip6[3] = bpf_htonl(0); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +int sendmsg_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c index d8869b03dda9..332d0eb1116f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c @@ -36,4 +36,10 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg_unix") +int sendmsg_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index 992b7861003a..db4abd2682fc 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX #define BPF_NO_PRESERVE_ACCESS_INDEX +#endif #include <vmlinux.h> #include <bpf/bpf_core_read.h> #include <bpf/bpf_helpers.h> diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c new file mode 100644 index 000000000000..8386bb15ccdc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +SEC("syscall") +int init_sock(struct init_sock_args *args) +{ + bpf_kfunc_init_sock(args); + + return 0; +} + +SEC("syscall") +int close_sock(void *ctx) +{ + bpf_kfunc_close_sock(); + + return 0; +} + +SEC("syscall") +int kernel_connect(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_connect(args); +} + +SEC("syscall") +int kernel_bind(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_bind(args); +} + +SEC("syscall") +int kernel_listen(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_listen(); +} + +SEC("syscall") +int kernel_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_kernel_sendmsg(args); +} + +SEC("syscall") +int sock_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_sock_sendmsg(args); +} + +SEC("syscall") +int kernel_getsockname(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getsockname(args); +} + +SEC("syscall") +int kernel_getpeername(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getpeername(args); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c index 83753b00a556..5c3614333b01 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -1,24 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include <string.h> -#include <linux/tcp.h> -#include <netinet/in.h> -#include <linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; __s32 page_size = 0; +const char cc_reno[TCP_CA_NAME_MAX] = "reno"; +const char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + SEC("cgroup/setsockopt") int sockopt_qos_to_cc(struct bpf_sockopt *ctx) { void *optval_end = ctx->optval_end; int *optval = ctx->optval; char buf[TCP_CA_NAME_MAX]; - char cc_reno[TCP_CA_NAME_MAX] = "reno"; - char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS) goto out; @@ -29,11 +25,11 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx) if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 0; - if (!tcp_cc_eq(buf, cc_cubic)) + if (bpf_strncmp(buf, sizeof(buf), cc_cubic)) return 0; if (*optval == 0x2d) { - if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, (void *)&cc_reno, sizeof(cc_reno))) return 0; } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c new file mode 100644 index 000000000000..3c822103bd40 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_forgotten) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + /* we forgot to reference test_1_forgotten above, oops */ +}; + diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 026cabfa7f1f..4c56d4a9d9f4 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -23,7 +23,7 @@ void BPF_PROG(test_2, int a, int b) test_2_result = a + b; } -SEC("struct_ops/test_3") +SEC("?struct_ops/test_3") int BPF_PROG(test_3, int a, int b) { test_2_result = a + b + 3; @@ -54,3 +54,37 @@ struct bpf_testmod_ops___v2 testmod_2 = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___zeroed { + int (*test_1)(void); + void (*test_2)(int a, int b); + int (*test_maybe_null)(int dummy, struct task_struct *task); + void (*zeroed_op)(int a, int b); + int zeroed; +}; + +SEC("struct_ops/test_3") +int BPF_PROG(zeroed_op) +{ + return 1; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops___zeroed testmod_zeroed = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2_v2, + .zeroed_op = (void *)zeroed_op, +}; + +struct bpf_testmod_ops___incompatible { + int (*test_1)(void); + void (*test_2)(int *a); + int data; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___incompatible testmod_incompatible = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2, + .data = 3, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c new file mode 100644 index 000000000000..fa2021388485 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +int rand; +int arr[1]; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_turn_off) +{ + return arr[rand]; /* potentially way out of range access */ +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + .test_1 = (void *)test_1_turn_off, +}; + diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 41f2d44f49cb..6720c4b5be41 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value { struct task_struct __kptr * task; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __tasks_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index 7bb872fb22dd..0016c90e9c13 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -1,24 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -SEC("struct_ops/incompl_cong_ops_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/incompl_cong_ops_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 000000000000..f95862f570b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_tracing.h> + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops") +void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) +{ + bbr_main(sk, ack, flag, rs); +} + +SEC("struct_ops") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + +SEC("struct_ops") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void *)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c index c06f4a41c21a..54f916a931c6 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -7,7 +7,7 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/unsupp_cong_op_get_info") +SEC("struct_ops") size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index b93a0ed33057..e4bd82bc0d01 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> @@ -10,36 +9,31 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -SEC("struct_ops/ca_update_1_init") +SEC("struct_ops") void BPF_PROG(ca_update_1_init, struct sock *sk) { ca1_cnt++; } -SEC("struct_ops/ca_update_2_init") +SEC("struct_ops") void BPF_PROG(ca_update_2_init, struct sock *sk) { ca2_cnt++; } -SEC("struct_ops/ca_update_cong_control") +SEC("struct_ops") void BPF_PROG(ca_update_cong_control, struct sock *sk, const struct rate_sample *rs) { } -SEC("struct_ops/ca_update_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(ca_update_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/ca_update_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 0724a79cec78..a58b5194fc89 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> @@ -11,22 +10,17 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static inline unsigned int tcp_left_out(const struct tcp_sock *tp) +static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } -static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } -SEC("struct_ops/write_sk_pacing_init") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_init, struct sock *sk) { #ifdef ENABLE_ATOMICS_TESTS @@ -37,7 +31,7 @@ void BPF_PROG(write_sk_pacing_init, struct sock *sk) #endif } -SEC("struct_ops/write_sk_pacing_cong_control") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, const struct rate_sample *rs) { @@ -49,13 +43,13 @@ void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ?: 1; } -SEC("struct_ops/write_sk_pacing_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/write_sk_pacing_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 0988d79f1587..42c729f85524 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; struct { @@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx) storage->delivered_ce = tcp_sk->delivered_ce; storage->icsk_retransmits = tcp_sk->icsk_retransmits; + storage->mrtt_us = ctx->args[0]; + storage->srtt = ctx->args[1]; + return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_access_variable_array.c b/tools/testing/selftests/bpf/progs/test_access_variable_array.c index 808c49b79889..326b7d1f496a 100644 --- a/tools/testing/selftests/bpf/progs/test_access_variable_array.c +++ b/tools/testing/selftests/bpf/progs/test_access_variable_array.c @@ -7,7 +7,7 @@ unsigned long span = 0; -SEC("fentry/load_balance") +SEC("fentry/sched_balance_rq") int BPF_PROG(fentry_fentry, int this_cpu, struct rq *this_rq, struct sched_domain *sd) { diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 5a3a80f751c4..c83142b55f47 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -15,6 +15,8 @@ __u64 uprobe_res; __u64 uretprobe_res; __u64 tp_res; __u64 pe_res; +__u64 raw_tp_res; +__u64 tp_btf_res; __u64 fentry_res; __u64 fexit_res; __u64 fmod_ret_res; @@ -87,6 +89,20 @@ int handle_pe(struct pt_regs *ctx) return 0; } +SEC("raw_tp/sys_enter") +int handle_raw_tp(void *ctx) +{ + update(ctx, &raw_tp_res); + return 0; +} + +SEC("tp_btf/sys_enter") +int handle_tp_btf(void *ctx) +{ + update(ctx, &tp_btf_res); + return 0; +} + SEC("fentry/bpf_fentry_test1") int BPF_PROG(fentry_test1, int a) { diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c index e2bea4da194b..f0759efff6ef 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c @@ -1,19 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include <string.h> -#include <errno.h> -#include <netinet/in.h> -#include <linux/stddef.h> -#include <linux/bpf.h> -#include <linux/ipv6.h> -#include <linux/tcp.h> -#include <linux/if_ether.h> -#include <linux/pkt_cls.h> - +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_tcp_helpers.h" + +#ifndef ENOENT +#define ENOENT 2 +#endif struct sockaddr_in6 srv_sa6 = {}; __u16 listen_tp_sport = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 8fba3f3649e2..5da001ca57a5 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -4,6 +4,10 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + struct Small { long x; }; diff --git a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c index 8c895122f293..83439b87b766 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c @@ -3,7 +3,7 @@ #include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> #include <linux/ip.h> -#include "bpf_tracing_net.h" +#include <linux/if_ether.h> /* We don't care about whether the packet can be received by network stack. * Just care if the packet is sent to the correct device at correct direction diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 8a1b50f3a002..cc1a012d038f 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -73,6 +73,29 @@ int BPF_PROG(handle_fentry_manual, return 0; } +__u32 fentry_explicit_read_sz = 0; + +SEC("fentry/bpf_testmod:bpf_testmod_test_read") +int BPF_PROG(handle_fentry_explicit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_read_sz = len; + return 0; +} + + +__u32 fentry_explicit_manual_read_sz = 0; + +SEC("fentry") +int BPF_PROG(handle_fentry_explicit_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_manual_read_sz = len; + return 0; +} + __u32 fexit_read_sz = 0; int fexit_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index 0763d49f9c42..386315afad65 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -5,23 +5,48 @@ #include <stdint.h> #include <bpf/bpf_helpers.h> +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); +} sock_map SEC(".maps"); + __u64 user_pid = 0; __u64 user_tgid = 0; __u64 dev = 0; __u64 ino = 0; -SEC("tracepoint/syscalls/sys_enter_nanosleep") -int handler(const void *ctx) +static void get_pid_tgid(void) { struct bpf_pidns_info nsdata; if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info))) - return 0; + return; user_pid = nsdata.pid; user_tgid = nsdata.tgid; +} +SEC("?tracepoint/syscalls/sys_enter_nanosleep") +int tp_handler(const void *ctx) +{ + get_pid_tgid(); return 0; } +SEC("?cgroup/bind4") +int cgroup_bind4(struct bpf_sock_addr *ctx) +{ + get_pid_tgid(); + return 1; +} + +SEC("?sk_msg") +int sk_msg(struct sk_msg_md *msg) +{ + get_pid_tgid(); + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c new file mode 100644 index 000000000000..7eb8b6de7a57 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +uintptr_t ip; + +SEC("perf_event") +int handler(struct bpf_perf_event_data *data) +{ + /* Skip events that have the correct ip. */ + return ip != PT_REGS_IP(&data->regs); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c new file mode 100644 index 000000000000..8669eb42dbe0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Andrea Righi <andrea.righi@canonical.com> + +#include <linux/bpf.h> +#include <sched.h> +#include <unistd.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define TASK_COMM_LEN 16 + +struct sample { + int pid; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +int pid = 0; +long value = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_n(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + sample->pid = pid; + sample->value = value; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + + bpf_ringbuf_submit(sample, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index 45e8fc75a739..996b177324ba 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -24,8 +24,7 @@ struct { __type(value, __u64); } socket_storage SEC(".maps"); -SEC("sk_msg") -int prog_msg_verdict(struct sk_msg_md *msg) +static int prog_msg_verdict_common(struct sk_msg_md *msg) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); int verdict = SK_PASS; @@ -44,4 +43,28 @@ int prog_msg_verdict(struct sk_msg_md *msg) return verdict; } +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone2(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index f75e531bf36f..196844be349c 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -7,7 +7,6 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_tcp_helpers.h" enum bpf_linum_array_idx { EGRESS_LINUM_IDX, @@ -42,6 +41,10 @@ struct { __type(value, struct bpf_spinlock_cnt); } sk_pkt_out_cnt10 SEC(".maps"); +struct tcp_sock { + __u32 lsndtime; +} __attribute__((preserve_access_index)); + struct bpf_tcp_sock listen_tp = {}; struct sockaddr_in6 srv_sa6 = {}; struct bpf_tcp_sock cli_tp = {}; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 1d86a717a290..69aacc96db36 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -23,10 +23,25 @@ struct { __type(value, int); } sock_map_msg SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/stream_verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_PASS; } +int clone_called; + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_clone(struct __sk_buff *skb) +{ + clone_called = 1; + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c index 3c69aa971738..d25b0bb30fc0 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -9,7 +9,7 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_DROP; diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index a3f3f43fc195..6935f32eeb8f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -1,18 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include <stddef.h> -#include <string.h> -#include <netinet/in.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/tcp.h> +#include "bpf_tracing_net.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_tcp_helpers.h" #include "test_tcpbpf.h" struct tcpbpf_globals global = {}; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3e436e6f7312..3f5abcf3ff13 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -567,12 +567,18 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb) return TC_ACT_OK; } +struct local_geneve_opt { + struct geneve_opt gopt; + int data; +}; + SEC("tc") int geneve_set_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; __builtin_memset(&key, 0x0, sizeof(key)); key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ @@ -580,14 +586,14 @@ int geneve_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xdeadbeef); ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); @@ -596,7 +602,7 @@ int geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(local_gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -631,7 +637,8 @@ SEC("tc") int ip6geneve_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; int ret; __builtin_memset(&key, 0x0, sizeof(key)); @@ -647,16 +654,16 @@ int ip6geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xfeedbeef); - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 5c7e4758a0ca..fad94e41cef9 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -318,6 +318,14 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC push_options +/* GCC optimization collapses functions and increases the number of arguments + * beyond the compatible amount supported by BPF. + */ +#pragma GCC optimize("-fno-ipa-sra") +#endif + static __attribute__ ((noinline)) bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct packet_description *pckt, @@ -372,6 +380,10 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC pop_options +#endif + static __attribute__ ((noinline)) int swap_mac_and_send(void *data, void *data_end) { @@ -588,12 +600,13 @@ static void connection_table_lookup(struct real_definition **real, __attribute__ ((noinline)) static int process_l3_headers_v6(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct ipv6hdr *ip6h; __u64 iph_len; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; ip6h = data + off; if (ip6h + 1 > data_end) @@ -619,11 +632,12 @@ static int process_l3_headers_v6(struct packet_description *pckt, __attribute__ ((noinline)) static int process_l3_headers_v4(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct iphdr *iph; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; iph = data + off; if (iph + 1 > data_end) @@ -666,13 +680,14 @@ static int process_packet(void *data, __u64 off, void *data_end, __u8 protocol; __u32 vip_num; int action; + void *extra_args[2] = { data, data_end }; if (is_ipv6) action = process_l3_headers_v6(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); else action = process_l3_headers_v4(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); if (action >= 0) return action; protocol = pckt.flow.proto; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c index f3ec8086482d..a7588302268d 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c @@ -160,7 +160,7 @@ int xdp_prognum1(struct xdp_md *ctx) /* Modifying VLAN, preserve top 4 bits */ vlan_hdr->h_vlan_TCI = - bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) + bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000U) | TO_VLAN); } diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index f615da97df26..4c677c001258 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -2,9 +2,10 @@ /* Copyright (c) 2021 Facebook */ #include <linux/bpf.h> #include <time.h> +#include <stdbool.h> #include <errno.h> #include <bpf/bpf_helpers.h> -#include "bpf_tcp_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c index 0996c2486f05..5a2e9dabf1c6 100644 --- a/tools/testing/selftests/bpf/progs/timer_failure.c +++ b/tools/testing/selftests/bpf/progs/timer_failure.c @@ -5,8 +5,8 @@ #include <time.h> #include <errno.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include "bpf_misc.h" -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c index 2fee7ab105ef..50ebc3f68522 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim.c +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -4,7 +4,7 @@ #include <time.h> #include <errno.h> #include <bpf/bpf_helpers.h> -#include "bpf_tcp_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c index 5d648e3d8a41..dd3f1ed6d6e6 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim_reject.c +++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -4,7 +4,7 @@ #include <time.h> #include <errno.h> #include <bpf/bpf_helpers.h> -#include "bpf_tcp_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 5fda43901033..2619ed193c65 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include <linux/bpf.h> #include <asm/unistd.h> #include <bpf/bpf_helpers.h> @@ -9,82 +8,126 @@ char _license[] SEC("license") = "GPL"; -long hits = 0; +#define CPU_MASK 255 +#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */ -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) +/* matches struct counter in bench.h */ +struct counter { + long value; +} __attribute__((aligned(128))); + +struct counter hits[MAX_CPUS]; + +static __always_inline void inc_counter(void) +{ + int cpu = bpf_get_smp_processor_id(); + + __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); +} + +SEC("?uprobe") +int bench_trigger_uprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +const volatile int batch_iters = 0; + +SEC("?raw_tp") +int trigger_count(void *ctx) { - if (id == __NR_getpgid) - __sync_add_and_fetch(&hits, 1); + int i; + + for (i = 0; i < batch_iters; i++) + inc_counter(); + return 0; } -SEC("kprobe/" SYS_PREFIX "sys_getpgid") +SEC("?raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + + return 0; +} + +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */ + + return 0; +} + +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry/" SYS_PREFIX "sys_getpgid") +SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fexit/" SYS_PREFIX "sys_getpgid") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return 0; + inc_counter(); + return -22; } -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) { - __sync_add_and_fetch(&hits, 1); - return -22; + inc_counter(); + return 0; } -SEC("uprobe") -int bench_trigger_uprobe(void *ctx) +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 960998f16306..a0bb7fb40ea5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -886,6 +886,69 @@ l1_%=: r0 = 0; \ } SEC("socket") +__description("bounds check for non const xor src dst") +__success __log_level(2) +__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_xor_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 ^= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const or src dst") +__success __log_level(2) +__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_or_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 |= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const mul regs") +__success __log_level(2) +__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__naked void non_const_mul_regs(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xff; \ + r0 &= 0x0f; \ + r0 *= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") __retval(0) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index baff5ffe9405..a9fc30ed4d73 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -8,6 +8,13 @@ #include "xdp_metadata.h" #include "bpf_kfuncs.h" +/* The compiler may be able to detect the access to uninitialized + memory in the routines performing out of bound memory accesses and + emit warnings about it. This is the case of GCC. */ +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + int arr[1]; int unkn_idx; const volatile bool call_dead_subprog = false; diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd090c..059aa716e3d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 99e561f18f9b..bd676d7e615f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -318,7 +318,7 @@ int cond_break1(const void *ctx) unsigned long i; unsigned int sum = 0; - for (i = zero; i < ARR_SZ; cond_break, i++) + for (i = zero; i < ARR_SZ && can_loop; i++) sum += i; for (i = zero; i < ARR_SZ; i++) { barrier_var(i); @@ -336,12 +336,11 @@ int cond_break2(const void *ctx) int i, j; int sum = 0; - for (i = zero; i < 1000; cond_break, i++) + for (i = zero; i < 1000 && can_loop; i++) for (j = zero; j < 1000; j++) { sum += i + j; cond_break; - } - + } return sum; } @@ -349,7 +348,7 @@ static __noinline int loop(void) { int i, sum = 0; - for (i = zero; i <= 1000000; i++, cond_break) + for (i = zero; i <= 1000000 && can_loop; i++) sum += i; return sum; diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..cb32b0cfc84b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> + +#include "bpf_misc.h" +#include "cgrp_kfunc_common.h" +#include "cpumask_common.h" +#include "task_kfunc_common.h" + +char _license[] SEC("license") = "GPL"; + +/*************** + * Task kfuncs * + ***************/ + +static void task_kfunc_load_test(void) +{ + struct task_struct *current, *ref_1, *ref_2; + + current = bpf_get_current_task_btf(); + ref_1 = bpf_task_from_pid(current->pid); + if (!ref_1) + return; + + ref_2 = bpf_task_acquire(ref_1); + if (ref_2) + bpf_task_release(ref_2); + bpf_task_release(ref_1); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(task_kfunc_raw_tp) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(task_kfunc_syscall) +{ + task_kfunc_load_test(); + return 0; +} + +/***************** + * cgroup kfuncs * + *****************/ + +static void cgrp_kfunc_load_test(void) +{ + struct cgroup *cgrp, *ref; + + cgrp = bpf_cgroup_from_id(0); + if (!cgrp) + return; + + ref = bpf_cgroup_acquire(cgrp); + if (!ref) { + bpf_cgroup_release(cgrp); + return; + } + + bpf_cgroup_release(ref); + bpf_cgroup_release(cgrp); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cgrp_kfunc_raw_tp) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cgrp_kfunc_syscall) +{ + cgrp_kfunc_load_test(); + return 0; +} + +/****************** + * cpumask kfuncs * + ******************/ + +static void cpumask_kfunc_load_test(void) +{ + struct bpf_cpumask *alloc, *ref; + + alloc = bpf_cpumask_create(); + if (!alloc) + return; + + ref = bpf_cpumask_acquire(alloc); + bpf_cpumask_set_cpu(0, alloc); + bpf_cpumask_test_cpu(0, (const struct cpumask *)ref); + + bpf_cpumask_release(ref); + bpf_cpumask_release(alloc); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cpumask_kfunc_raw_tp) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cpumask_kfunc_syscall) +{ + cpumask_kfunc_load_test(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c new file mode 100644 index 000000000000..9c31448a0f52 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf_sockopt_helpers.h> +#include "bpf_misc.h" + +SEC("cgroup/recvmsg4") +__success +int recvmsg4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg6") +__success +int recvmsg6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg_unix") +__success +int recvmsg_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/getpeername4") +__success +int getpeername4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername6") +__success +int getpeername6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername_unix") +__success +int getpeername_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname4") +__success +int getsockname4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname6") +__success +int getsockname6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname_unix") +__success +int getsockname_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname_unix_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind4") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind6") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6f5d19665cf6..4a58e0398e72 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -6,6 +6,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#include <../../../tools/include/linux/filter.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -76,6 +77,94 @@ __naked int subprog_result_precise(void) ); } +__naked __noinline __used +static unsigned long fp_leaking_subprog() +{ + asm volatile ( + ".8byte %[r0_eq_r10_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8)) + ); +} + +__naked __noinline __used +static unsigned long sneaky_fp_leaking_subprog() +{ + asm volatile ( + "r1 = r10;" + ".8byte %[r0_eq_r1_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8)) + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") +__msg("7: R0_w=scalar") +__naked int fp_precise_subprog_result(void) +{ + asm volatile ( + "call fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") +/* here r1 is marked precise, even though it's fp register, but that's fine + * because by the time we get out of subprogram it has to be derived from r10 + * anyways, at which point we'll break precision chain + */ +__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") +__msg("7: R0_w=scalar") +__naked int sneaky_fp_precise_subprog_result(void) +{ + asm volatile ( + "call sneaky_fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r0") diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c new file mode 100644 index 000000000000..49e712acbf60 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct hmap_elem { + int counter; + struct bpf_timer timer; /* unused */ + struct bpf_spin_lock lock; /* unused */ + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +__u32 ok; +__u32 ok_sleepable; + +static int test_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (map == &lru && + bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->w; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +static int test_hmap_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) +{ + struct hmap_elem init = {}, *val; + struct bpf_wq *wq; + + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + + if (bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + wq = &val->work; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + if (bpf_wq_start(wq, 0)) + return -5; + + return 0; +} + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + ok |= (1 << *key); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + ok_sleepable |= (1 << *key); + return 0; +} + +SEC("tc") +/* test that workqueues can be used from an array */ +__retval(0) +long test_call_array_sleepable(void *ctx) +{ + int key = 0; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("syscall") +/* Same test than above but from a sleepable context. */ +__retval(0) +long test_syscall_array_sleepable(void *ctx) +{ + int key = 1; + + return test_elem_callback(&array, &key, wq_cb_sleepable); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap */ +__retval(0) +long test_call_hash_sleepable(void *ctx) +{ + int key = 2; + + return test_hmap_elem_callback(&hmap, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap with NO_PREALLOC. */ +__retval(0) +long test_call_hash_malloc_sleepable(void *ctx) +{ + int key = 3; + + return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback); +} + +SEC("tc") +/* test that workqueues can be used from a LRU map */ +__retval(0) +long test_call_lru_sleepable(void *ctx) +{ + int key = 4; + + return test_elem_callback(&lru, &key, wq_callback); +} diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c new file mode 100644 index 000000000000..4cbdb425f223 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + return 0; +} + +SEC("tc") +/* test that bpf_wq_init takes a map as a second argument + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("pointer in R2 isn't map pointer") +long test_wq_init_nomap(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &key, 0) != 0) + return -3; + + return 0; +} + +SEC("tc") +/* test that the workqueue is part of the map in bpf_wq_init + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0") +long test_wq_init_wrong_map(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &lru, 0) != 0) + return -3; + + return 0; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("arg#0 doesn't point to a map value") +long test_wrong_wq_pointer(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)&wq, wq_callback, 0)) + return 3; + + return -22; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") +long test_wrong_wq_pointer_offset(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0)) + return 3; + + return -22; +} diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index f4936834f76f..dde0bb16e782 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -7,6 +7,7 @@ #include <bpf/bpf.h> #include <bpf/btf.h> #include "test_core_extern.skel.h" +#include "struct_ops_module.skel.h" template <typename T> class Skeleton { @@ -98,6 +99,7 @@ int main(int argc, char *argv[]) { struct btf_dump_opts opts = { }; struct test_core_extern *skel; + struct struct_ops_module *skel2; struct btf *btf; int fd; @@ -118,6 +120,9 @@ int main(int argc, char *argv[]) skel = test_core_extern__open_and_load(); test_core_extern__destroy(skel); + skel2 = struct_ops_module__open_and_load(); + struct_ops_module__destroy(skel2); + fd = bpf_enable_stats(BPF_STATS_RUN_TIME); if (fd < 0) std::cout << "FAILED to enable stats: " << fd << std::endl; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c deleted file mode 100644 index 80c42583f597..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ /dev/null @@ -1,1434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> - -#include <arpa/inet.h> -#include <netinet/in.h> -#include <sys/types.h> -#include <sys/select.h> -#include <sys/socket.h> - -#include <linux/filter.h> - -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "cgroup_helpers.h" -#include "testing_helpers.h" -#include "bpf_util.h" - -#ifndef ENOTSUPP -# define ENOTSUPP 524 -#endif - -#define CG_PATH "/foo" -#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o" -#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o" -#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o" -#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o" -#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o" -#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o" -#define BIND4_PROG_PATH "./bind4_prog.bpf.o" -#define BIND6_PROG_PATH "./bind6_prog.bpf.o" - -#define SERV4_IP "192.168.1.254" -#define SERV4_REWRITE_IP "127.0.0.1" -#define SRC4_IP "172.16.0.1" -#define SRC4_REWRITE_IP "127.0.0.4" -#define SERV4_PORT 4040 -#define SERV4_REWRITE_PORT 4444 - -#define SERV6_IP "face:b00c:1234:5678::abcd" -#define SERV6_REWRITE_IP "::1" -#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" -#define SRC6_IP "::1" -#define SRC6_REWRITE_IP "::6" -#define WILDCARD6_IP "::" -#define SERV6_PORT 6060 -#define SERV6_REWRITE_PORT 6666 - -#define INET_NTOP_BUF 40 - -struct sock_addr_test; - -typedef int (*load_fn)(const struct sock_addr_test *test); -typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -struct sock_addr_test { - const char *descr; - /* BPF prog properties */ - load_fn loadfn; - enum bpf_attach_type expected_attach_type; - enum bpf_attach_type attach_type; - /* Socket properties */ - int domain; - int type; - /* IP:port pairs for BPF prog to override */ - const char *requested_ip; - unsigned short requested_port; - const char *expected_ip; - unsigned short expected_port; - const char *expected_src_ip; - /* Expected test result */ - enum { - LOAD_REJECT, - ATTACH_REJECT, - ATTACH_OKAY, - SYSCALL_EPERM, - SYSCALL_ENOTSUPP, - SUCCESS, - } expected_result; -}; - -static int bind4_prog_load(const struct sock_addr_test *test); -static int bind6_prog_load(const struct sock_addr_test *test); -static int connect4_prog_load(const struct sock_addr_test *test); -static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_allow_prog_load(const struct sock_addr_test *test); -static int sendmsg_deny_prog_load(const struct sock_addr_test *test); -static int recvmsg_allow_prog_load(const struct sock_addr_test *test); -static int recvmsg_deny_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); - -static struct sock_addr_test tests[] = { - /* bind */ - { - "bind4: load prog with wrong expected attach type", - bind4_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "bind4: attach prog with wrong attach type", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind4: rewrite IP & TCP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind4: rewrite IP & UDP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: load prog with wrong expected attach type", - bind6_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "bind6: attach prog with wrong attach type", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind6: rewrite IP & TCP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: rewrite IP & UDP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - - /* connect */ - { - "connect4: load prog with wrong expected attach type", - connect4_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "connect4: attach prog with wrong attach type", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect4: rewrite IP & TCP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect4: rewrite IP & UDP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect6: load prog with wrong expected attach type", - connect6_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "connect6: attach prog with wrong attach type", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect6: rewrite IP & TCP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "connect6: rewrite IP & UDP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - - /* sendmsg */ - { - "sendmsg4: load prog with wrong expected attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "sendmsg4: attach prog with wrong attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "sendmsg4: rewrite IP & port (asm)", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg4: rewrite IP & port (C)", - sendmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg4: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SYSCALL_EPERM, - }, - { - "sendmsg6: load prog with wrong expected attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "sendmsg6: attach prog with wrong attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "sendmsg6: rewrite IP & port (asm)", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: rewrite IP & port (C)", - sendmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: IPv4-mapped IPv6", - sendmsg6_rw_v4mapped_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_ENOTSUPP, - }, - { - "sendmsg6: set dst IP = [::] (BSD'ism)", - sendmsg6_rw_wildcard_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: preserve dst IP = [::] (BSD'ism)", - sendmsg_allow_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - WILDCARD6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_PORT, - SRC6_IP, - SUCCESS, - }, - { - "sendmsg6: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_EPERM, - }, - - /* recvmsg */ - { - "recvmsg4: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg4: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg6: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg6: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg4: rewrite IP & port (C)", - recvmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_IP, - SUCCESS, - }, - { - "recvmsg6: rewrite IP & port (C)", - recvmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_IP, - SUCCESS, - }, -}; - -static int mk_sockaddr(int domain, const char *ip, unsigned short port, - struct sockaddr *addr, socklen_t addr_len) -{ - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - return -1; - } - - memset(addr, 0, addr_len); - - if (domain == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return -1; - addr4 = (struct sockaddr_in *)addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { - log_err("Invalid IPv4: %s", ip); - return -1; - } - } else if (domain == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) - return -1; - addr6 = (struct sockaddr_in6 *)addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - } - - return 0; -} - -static int load_insns(const struct sock_addr_test *test, - const struct bpf_insn *insns, size_t insns_cnt) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int ret; - - opts.expected_attach_type = test->expected_attach_type; - opts.log_buf = bpf_log_buf; - opts.log_size = BPF_LOG_BUF_SIZE; - - ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts); - if (ret < 0 && test->expected_result != LOAD_REJECT) { - log_err(">>> Loading program error.\n" - ">>> Verifier output:\n%s\n-------\n", bpf_log_buf); - } - - return ret; -} - -static int load_path(const struct sock_addr_test *test, const char *path) -{ - struct bpf_object *obj; - struct bpf_program *prog; - int err; - - obj = bpf_object__open_file(path, NULL); - err = libbpf_get_error(obj); - if (err) { - log_err(">>> Opening BPF object (%s) error.\n", path); - return -1; - } - - prog = bpf_object__next_program(obj, NULL); - if (!prog) - goto err_out; - - bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); - bpf_program__set_expected_attach_type(prog, test->expected_attach_type); - bpf_program__set_flags(prog, testing_prog_flags()); - - err = bpf_object__load(obj); - if (err) { - if (test->expected_result != LOAD_REJECT) - log_err(">>> Loading program (%s) error.\n", path); - goto err_out; - } - - return bpf_program__fd(prog); -err_out: - bpf_object__close(obj); - return -1; -} - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND4_PROG_PATH); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND6_PROG_PATH); -} - -static int connect4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT4_PROG_PATH); -} - -static int connect6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT6_PROG_PATH); -} - -static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, - int32_t rc) -{ - struct bpf_insn insns[] = { - /* return rc */ - BPF_MOV64_IMM(BPF_REG_0, rc), - BPF_EXIT_INSN(), - }; - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int sendmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - -static int recvmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int recvmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in dst4_rw_addr; - struct in_addr src4_rw_ip; - - if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) { - log_err("Invalid IPv4: %s", SRC4_REWRITE_IP); - return -1; - } - - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&dst4_rw_addr, - sizeof(dst4_rw_addr)) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8), - - /* sk.type == SOCK_DGRAM) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6), - - /* msg_src_ip4 = src4_rw_ip */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, msg_src_ip4)), - - /* user_ip4 = dst4_rw_addr.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = dst4_rw_addr.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG4_PROG_PATH); -} - -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG4_PROG_PATH); -} - -static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, - const char *rw_dst_ip) -{ - struct sockaddr_in6 dst6_rw_addr; - struct in6_addr src6_rw_ip; - - if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) { - log_err("Invalid IPv6: %s", SRC6_REWRITE_IP); - return -1; - } - - if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr *)&dst6_rw_addr, - sizeof(dst6_rw_addr)) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - -#define STORE_IPV6_WORD_N(DST, SRC, N) \ - BPF_MOV32_IMM(BPF_REG_7, SRC[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, DST[N])) - -#define STORE_IPV6(DST, SRC) \ - STORE_IPV6_WORD_N(DST, SRC, 0), \ - STORE_IPV6_WORD_N(DST, SRC, 1), \ - STORE_IPV6_WORD_N(DST, SRC, 2), \ - STORE_IPV6_WORD_N(DST, SRC, 3) - - STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32), - STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32), - - /* user_port = dst6_rw_addr.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); -} - -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG6_PROG_PATH); -} - -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); -} - -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); -} - -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG6_PROG_PATH); -} - -static int cmp_addr(const struct sockaddr_storage *addr1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - const struct sockaddr_in *four1, *four2; - const struct sockaddr_in6 *six1, *six2; - - if (addr1->ss_family != addr2->ss_family) - return -1; - - if (addr1->ss_family == AF_INET) { - four1 = (const struct sockaddr_in *)addr1; - four2 = (const struct sockaddr_in *)addr2; - return !((four1->sin_port == four2->sin_port || !cmp_port) && - four1->sin_addr.s_addr == four2->sin_addr.s_addr); - } else if (addr1->ss_family == AF_INET6) { - six1 = (const struct sockaddr_in6 *)addr1; - six2 = (const struct sockaddr_in6 *)addr2; - return !((six1->sin6_port == six2->sin6_port || !cmp_port) && - !memcmp(&six1->sin6_addr, &six2->sin6_addr, - sizeof(struct in6_addr))); - } - - return -1; -} - -static int cmp_sock_addr(info_fn fn, int sock1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - struct sockaddr_storage addr1; - socklen_t len1 = sizeof(addr1); - - memset(&addr1, 0, len1); - if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0) - return -1; - - return cmp_addr(&addr1, addr2, cmp_port); -} - -static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0); -} - -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); -} - -static int start_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int fd; - - fd = socket(addr->ss_family, type, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (type == SOCK_STREAM) { - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - } - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Fail to connect to server"); - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -int init_pktinfo(int domain, struct cmsghdr *cmsg) -{ - struct in6_pktinfo *pktinfo6; - struct in_pktinfo *pktinfo4; - - if (domain == AF_INET) { - cmsg->cmsg_level = SOL_IP; - cmsg->cmsg_type = IP_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); - pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo4, 0, sizeof(struct in_pktinfo)); - if (inet_pton(domain, SRC4_IP, - (void *)&pktinfo4->ipi_spec_dst) != 1) - return -1; - } else if (domain == AF_INET6) { - cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = IPV6_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); - pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo6, 0, sizeof(struct in6_pktinfo)); - if (inet_pton(domain, SRC6_IP, - (void *)&pktinfo6->ipi6_addr) != 1) - return -1; - } else { - return -1; - } - - return 0; -} - -static int sendmsg_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len, int set_cmsg, int flags, - int *syscall_err) -{ - union { - char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))]; - struct cmsghdr align; - } control6; - union { - char buf[CMSG_SPACE(sizeof(struct in_pktinfo))]; - struct cmsghdr align; - } control4; - struct msghdr hdr; - struct iovec iov; - char data = 'a'; - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - if (set_cmsg) { - if (domain == AF_INET) { - hdr.msg_control = &control4; - hdr.msg_controllen = sizeof(control4.buf); - } else if (domain == AF_INET6) { - hdr.msg_control = &control6; - hdr.msg_controllen = sizeof(control6.buf); - } - if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) { - log_err("Fail to init pktinfo"); - goto err; - } - } - - if (sendmsg(fd, &hdr, flags) != sizeof(data)) { - log_err("Fail to send message to server"); - *syscall_err = errno; - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -static int fastconnect_to_server(const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int sendmsg_err; - - return sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0, - MSG_FASTOPEN, &sendmsg_err); -} - -static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr) -{ - struct timeval tv; - struct msghdr hdr; - struct iovec iov; - char data[64]; - fd_set rfds; - - FD_ZERO(&rfds); - FD_SET(sockfd, &rfds); - - tv.tv_sec = 2; - tv.tv_usec = 0; - - if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 || - !FD_ISSET(sockfd, &rfds)) - return -1; - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = src_addr; - hdr.msg_namelen = sizeof(struct sockaddr_storage); - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - return recvmsg(sockfd, &hdr, 0); -} - -static int init_addrs(const struct sock_addr_test *test, - struct sockaddr_storage *requested_addr, - struct sockaddr_storage *expected_addr, - struct sockaddr_storage *expected_src_addr) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - - if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port, - (struct sockaddr *)expected_addr, addr_len) == -1) - goto err; - - if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port, - (struct sockaddr *)requested_addr, addr_len) == -1) - goto err; - - if (test->expected_src_ip && - mk_sockaddr(test->domain, test->expected_src_ip, 0, - (struct sockaddr *)expected_src_addr, addr_len) == -1) - goto err; - - return 0; -err: - return -1; -} - -static int run_bind_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, NULL)) - goto err; - - servfd = start_server(test->type, &requested_addr, addr_len); - if (servfd == -1) - goto err; - - if (cmp_local_addr(servfd, &expected_addr)) - goto err; - - /* Try to connect to server just in case */ - clientfd = connect_to_server(test->type, &expected_addr, addr_len); - if (clientfd == -1) - goto err; - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_connect_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_src_addr; - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, - &expected_src_addr)) - goto err; - - /* Prepare server to connect to */ - servfd = start_server(test->type, &expected_addr, addr_len); - if (servfd == -1) - goto err; - - clientfd = connect_to_server(test->type, &requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - - if (test->type == SOCK_STREAM) { - /* Test TCP Fast Open scenario */ - clientfd = fastconnect_to_server(&requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_addr; - struct sockaddr_storage server_addr; - struct sockaddr_storage sendmsg_addr; - struct sockaddr_storage recvmsg_addr; - int clientfd = -1; - int servfd = -1; - int set_cmsg; - int err = 0; - - if (test->type != SOCK_DGRAM) - goto err; - - if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr)) - goto err; - - /* Prepare server to sendmsg to */ - servfd = start_server(test->type, &server_addr, addr_len); - if (servfd == -1) - goto err; - - for (set_cmsg = 0; set_cmsg <= max_cmsg; ++set_cmsg) { - if (clientfd >= 0) - close(clientfd); - - clientfd = sendmsg_to_server(test->type, &sendmsg_addr, - addr_len, set_cmsg, /*flags*/0, - &err); - if (err) - goto out; - else if (clientfd == -1) - goto err; - - /* Try to receive message on server instead of using - * getpeername(2) on client socket, to check that client's - * destination address was rewritten properly, since - * getpeername(2) doesn't work with unconnected datagram - * sockets. - * - * Get source address from recvmsg(2) as well to make sure - * source was rewritten properly: getsockname(2) can't be used - * since socket is unconnected and source defined for one - * specific packet may differ from the one used by default and - * returned by getsockname(2). - */ - if (recvmsg_from_client(servfd, &recvmsg_addr) == -1) - goto err; - - if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_test_case(int cgfd, const struct sock_addr_test *test) -{ - int progfd = -1; - int err = 0; - - printf("Test case: %s .. ", test->descr); - - progfd = test->loadfn(test); - if (test->expected_result == LOAD_REJECT && progfd < 0) - goto out; - else if (test->expected_result == LOAD_REJECT || progfd < 0) - goto err; - - err = bpf_prog_attach(progfd, cgfd, test->attach_type, - BPF_F_ALLOW_OVERRIDE); - if (test->expected_result == ATTACH_REJECT && err) { - err = 0; /* error was expected, reset it */ - goto out; - } else if (test->expected_result == ATTACH_REJECT || err) { - goto err; - } else if (test->expected_result == ATTACH_OKAY) { - err = 0; - goto out; - } - - switch (test->attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - err = run_bind_test_case(test); - break; - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - err = run_connect_test_case(test); - break; - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - err = run_xmsg_test_case(test, 1); - break; - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - err = run_xmsg_test_case(test, 0); - break; - default: - goto err; - } - - if (test->expected_result == SYSCALL_EPERM && err == EPERM) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (err || test->expected_result != SUCCESS) - goto err; - - goto out; -err: - err = -1; -out: - /* Detaching w/o checking return code: best effort attempt. */ - if (progfd != -1) - bpf_prog_detach(cgfd, test->attach_type); - close(progfd); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} - -static int run_tests(int cgfd) -{ - int passes = 0; - int fails = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - if (run_test_case(cgfd, &tests[i])) - ++fails; - else - ++passes; - } - printf("Summary: %d PASSED, %d FAILED\n", passes, fails); - return fails ? -1 : 0; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 2) { - fprintf(stderr, - "%s has to be run via %s.sh. Skip direct run.\n", - argv[0], argv[0]); - exit(err); - } - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh deleted file mode 100755 index 3b9fdb8094aa..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -set -eu - -ping_once() -{ - type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}" - $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 -} - -wait_for_ip() -{ - local _i - echo -n "Wait for testing IPv4/IPv6 to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then - echo " OK" - return - fi - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. - ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - ip -4 addr add ${TEST_IPv4} dev ${TEST_IF} - ip -6 addr add ${TEST_IPv6} dev ${TEST_IF} - wait_for_ip -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ./test_sock_addr setup_done -} - -BASENAME=$(basename $0 .sh) -TEST_IF="${BASENAME}1" -TEST_IF_PEER="${BASENAME}2" -TEST_IPv4="127.0.0.4/8" -TEST_IPv6="::6/128" -MAX_PING_TRIES=5 - -main diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 024a0faafb3b..92752f5eeded 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1887,10 +1887,13 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } @@ -1907,10 +1910,13 @@ static int check_blacklist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } @@ -2104,9 +2110,9 @@ out: free(options.whitelist); if (options.blacklist) free(options.blacklist); + close(cg_fd); if (cg_created) cleanup_cgroup_environment(); - close(cg_fd); return err; } diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 910044f08908..7989ec608454 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -72,7 +72,6 @@ cleanup() { server_listen() { ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! - sleep 0.2 } client_connect() { @@ -93,6 +92,16 @@ verify_data() { fi } +wait_for_port() { + for i in $(seq 20); do + if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then + return 0 + fi + sleep 0.1 + done + return 1 +} + set -e # no arguments: automated test, run all @@ -193,6 +202,7 @@ setup # basic communication works echo "test basic connectivity" server_listen +wait_for_port ${port} ${netcat_opt} client_connect verify_data @@ -204,6 +214,7 @@ ip netns exec "${ns1}" tc filter add dev veth1 egress \ section "encap_${tuntype}_${mac}" echo "test bpf encap without decap (expect failure)" server_listen +wait_for_port ${port} ${netcat_opt} ! client_connect if [[ "$tuntype" =~ "udp" ]]; then diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index 32df93747095..7b5fc98838cd 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -16,68 +16,7 @@ #include <bpf/libbpf.h> #include "cgroup_helpers.h" - -static int start_server(const struct sockaddr *addr, socklen_t len, bool dual) -{ - int mode = !dual; - int fd; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (addr->sa_family == AF_INET6) { - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode, - sizeof(mode)) == -1) { - log_err("Failed to set the dual-stack mode"); - goto close_out; - } - } - - if (bind(fd, addr, len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(const struct sockaddr *addr, socklen_t len) -{ - int fd = -1; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)addr, len) == -1) { - log_err("Fail to connect to server"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} +#include "network_helpers.h" static int get_map_fd_by_prog_id(int prog_id, bool *xdp) { @@ -117,8 +56,7 @@ err: return map_fd; } -static int run_test(int server_fd, int results_fd, bool xdp, - const struct sockaddr *addr, socklen_t len) +static int run_test(int server_fd, int results_fd, bool xdp) { int client = -1, srv_client = -1; int ret = 0; @@ -144,7 +82,7 @@ static int run_test(int server_fd, int results_fd, bool xdp, goto err; } - client = connect_to_server(addr, len); + client = connect_to_fd(server_fd, 0); if (client == -1) goto err; @@ -201,23 +139,23 @@ out: return ret; } -static bool get_port(int server_fd, in_port_t *port) +static int v6only_true(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr; - socklen_t len = sizeof(addr); + int mode = true; - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - return false; - } + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + +static int v6only_false(int fd, const struct post_socket_opts *opts) +{ + int mode = false; - /* sin_port and sin6_port are located at the same offset. */ - *port = addr.sin_port; - return true; + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); } int main(int argc, char **argv) { + struct network_helper_opts opts = { 0 }; struct sockaddr_in addr4; struct sockaddr_in6 addr6; struct sockaddr_in addr4dual; @@ -259,31 +197,30 @@ int main(int argc, char **argv) addr6dual.sin6_addr = in6addr_any; addr6dual.sin6_port = 0; - server = start_server((const struct sockaddr *)&addr4, sizeof(addr4), - false); - if (server == -1 || !get_port(server, &addr4.sin_port)) + server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, + sizeof(addr4), NULL); + if (server == -1) goto err; - server_v6 = start_server((const struct sockaddr *)&addr6, - sizeof(addr6), false); - if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) + opts.post_socket_cb = v6only_true; + server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, + sizeof(addr6), &opts); + if (server_v6 == -1) goto err; - server_dual = start_server((const struct sockaddr *)&addr6dual, - sizeof(addr6dual), true); - if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) + opts.post_socket_cb = v6only_false; + server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, + sizeof(addr6dual), &opts); + if (server_dual == -1) goto err; - if (run_test(server, results, xdp, - (const struct sockaddr *)&addr4, sizeof(addr4))) + if (run_test(server, results, xdp)) goto err; - if (run_test(server_v6, results, xdp, - (const struct sockaddr *)&addr6, sizeof(addr6))) + if (run_test(server_v6, results, xdp)) goto err; - if (run_test(server_dual, results, xdp, - (const struct sockaddr *)&addr4dual, sizeof(addr4dual))) + if (run_test(server_dual, results, xdp)) goto err; printf("ok\n"); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 28b6646662af..d5379a0e6da8 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -368,9 +368,23 @@ int delete_module(const char *name, int flags) int unload_bpf_testmod(bool verbose) { + int ret, cnt = 0; + if (kern_sync_rcu()) fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n"); - if (delete_module("bpf_testmod", 0)) { + + for (;;) { + ret = delete_module("bpf_testmod", 0); + if (!ret || errno != EAGAIN) + break; + if (++cnt > 10000) { + fprintf(stdout, "Unload of bpf_testmod timed out\n"); + break; + } + usleep(100); + } + + if (ret) { if (errno == ENOENT) { if (verbose) fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e4b0..70e29f316fe7 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,21 @@ error: return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -148,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) @@ -201,29 +233,6 @@ out: return err; } -void read_trace_pipe(void) -{ - int trace_fd; - - if (access(TRACEFS_PIPE, F_OK) == 0) - trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0); - else - trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - ssize_t get_uprobe_offset(const void *addr) { size_t start, end, base; @@ -381,3 +390,43 @@ out: close(fd); return err; } + +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter) +{ + size_t buflen, n; + char *buf = NULL; + FILE *fp = NULL; + + if (access(TRACEFS_PIPE, F_OK) == 0) + fp = fopen(TRACEFS_PIPE, "r"); + else + fp = fopen(DEBUGFS_PIPE, "r"); + if (!fp) + return -1; + + /* We do not want to wait forever when iter is specified. */ + if (iter) + fcntl(fileno(fp), F_SETFL, O_NONBLOCK); + + while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { + if (n > 0) + cb(buf, data); + if (iter && !(--iter)) + break; + } + + free(buf); + if (fp) + fclose(fp); + return 0; +} + +static void trace_pipe_cb(const char *str, void *data) +{ + printf("%s", str); +} + +void read_trace_pipe(void) +{ + read_trace_pipe_iter(trace_pipe_cb, NULL, 0); +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da7079d..2ce873c9f9aa 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,9 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); @@ -22,10 +25,16 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. */ int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), + void *data, int iter); ssize_t get_uprobe_offset(const void *addr); ssize_t get_rel_offset(uintptr_t addr); diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index a61ceab60b68..7ffa563ffeba 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -9,7 +9,7 @@ #define NAME(name, idx) PASTE(name, idx) -#define DEF(name, idx) int NAME(name, idx)(void) { return 0; } +#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; } #define CALL(name, idx) NAME(name, idx)(); #define F(body, name, idx) body(name, idx) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 244d4996e06e..b2854238d4a0 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -792,10 +792,13 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs) while ((next = strtok_r(state ? NULL : input, ",", &state))) { err = parse_stat(next, specs); - if (err) + if (err) { + free(input); return err; + } } + free(input); return 0; } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index bdf5d8180067..6f9956eed797 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -495,20 +495,6 @@ peek: return 0; } -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - static int rxq_num(const char *ifname) { struct ethtool_channels ch = { @@ -595,6 +581,8 @@ static void cleanup(void) if (bpf_obj) xdp_hw_metadata__destroy(bpf_obj); + + free((void *)saved_hwtstamp_ifname); } static void handle_signal(int sig) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index b1102ee13faa..2eac0895b0a1 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -81,6 +81,7 @@ #include <linux/mman.h> #include <linux/netdev.h> #include <linux/bitmap.h> +#include <linux/ethtool.h> #include <arpa/inet.h> #include <net/if.h> #include <locale.h> @@ -105,11 +106,15 @@ #include "../kselftest.h" #include "xsk_xdp_common.h" +#include <network_helpers.h> + static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; +void test__fail(void) { /* for network_helpers.c */ } + static void __exit_with_error(int error, const char *file, const char *func, int line) { ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, @@ -239,7 +244,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); - sock_opt = BATCH_SIZE; + sock_opt = xsk->batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); @@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +static int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) { @@ -439,6 +471,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; if (i == 0) ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; else @@ -451,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + test->ifobj_tx = ifobj_tx; test->ifobj_rx = ifobj_rx; test->current_step = 0; test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->set_ring = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -1087,7 +1124,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_CONTINUE; } - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); if (!rcvd) return TEST_CONTINUE; @@ -1239,7 +1276,8 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1249,7 +1287,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b fds.fd = xsk_socket__fd(xsk->xsk); fds.events = POLLOUT; - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (timeout) { @@ -1269,10 +1307,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } - for (i = 0; i < BATCH_SIZE; i++) { + for (i = 0; i < xsk->batch_size; i++) { struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); u32 nb_frags_left, nb_frags, bytes_written = 0; @@ -1280,9 +1318,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b break; nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > BATCH_SIZE - i) { + if (nb_frags > xsk->batch_size - i) { pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); break; } nb_frags_left = nb_frags; @@ -1370,7 +1408,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk) return TEST_FAILURE; } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } return TEST_PASS; @@ -1860,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test) return TEST_SKIP; } + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) + return set_ring_size(ifobj_tx); + + ksft_test_result_skip("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -2373,6 +2419,50 @@ static int testapp_xdp_metadata_mb(struct test_spec *test) return testapp_xdp_metadata_copy(test); } +static int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + +static int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->xsk->rxqsize = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 4095 */ + test->ifobj_tx->xsk->batch_size = max_descs - 1; + test->ifobj_rx->xsk->batch_size = max_descs - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2477,7 +2567,9 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, -}; + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + }; static void print_tests(void) { @@ -2497,6 +2589,7 @@ int main(int argc, char **argv) int modes = TEST_MODE_SKB + 1; struct test_spec test; bool shared_netdev; + int ret; /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); @@ -2534,6 +2627,13 @@ int main(int argc, char **argv) modes++; } + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + init_iface(ifobj_rx, worker_testapp_validate_rx); init_iface(ifobj_tx, worker_testapp_validate_tx); @@ -2581,6 +2681,9 @@ int main(int argc, char **argv) } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + pkt_stream_delete(tx_pkt_stream_default); pkt_stream_delete(rx_pkt_stream_default); xsk_unload_xdp_programs(ifobj_tx); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index f174df2d693f..906de5fab7a3 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -44,7 +44,7 @@ #define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define DEFAULT_BATCH_SIZE 64 #define POLL_TMOUT 1000 #define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 * 1024) @@ -91,6 +91,7 @@ struct xsk_socket_info { struct pkt_stream *pkt_stream; u32 outstanding_tx; u32 rxqsize; + u32 batch_size; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; @@ -113,6 +114,11 @@ struct pkt_stream { bool verbatim; }; +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + struct ifobject; struct test_spec; typedef int (*validation_func_t)(struct ifobject *ifobj); @@ -129,6 +135,8 @@ struct ifobject { struct xsk_xdp_progs *xdp_progs; struct bpf_map *xskmap; struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; enum test_mode mode; int ifindex; int mtu; @@ -145,6 +153,7 @@ struct ifobject { bool unaligned_supp; bool multi_buff_supp; bool multi_buff_zc_supp; + bool hw_ring_size_supp; }; struct test_spec { @@ -162,6 +171,7 @@ struct test_spec { u16 current_step; u16 nb_sockets; bool fail; + bool set_ring; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c index 7cde07a5df78..47bad7ddc5bc 100644 --- a/tools/testing/selftests/capabilities/test_execve.c +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -82,7 +82,7 @@ static bool create_and_enter_ns(uid_t inner_uid) { uid_t outer_uid; gid_t outer_gid; - int i; + int i, ret; bool have_outer_privilege; outer_uid = getuid(); @@ -97,7 +97,10 @@ static bool create_and_enter_ns(uid_t inner_uid) ksft_exit_fail_msg("setresuid - %s\n", strerror(errno)); // Re-enable effective caps - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); + for (i = 0; i < CAP_LAST_CAP; i++) if (capng_have_capability(CAPNG_PERMITTED, i)) capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); @@ -207,6 +210,7 @@ static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) static int do_tests(int uid, const char *our_path) { + int ret; bool have_outer_privilege = create_and_enter_ns(uid); int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); @@ -250,7 +254,9 @@ static int do_tests(int uid, const char *our_path) ksft_exit_fail_msg("chmod - %s\n", strerror(errno)); } - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); /* Make sure that i starts out clear */ capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c index 60b4e7b716a7..65f2a1c89239 100644 --- a/tools/testing/selftests/capabilities/validate_cap.c +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -28,6 +28,7 @@ static bool bool_arg(char **argv, int i) int main(int argc, char **argv) { const char *atsec = ""; + int ret; /* * Be careful just in case a setgid or setcapped copy of this @@ -44,7 +45,11 @@ int main(int argc, char **argv) atsec = " (AT_SECURE is not set)"; #endif - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) { + ksft_print_msg("capng_get_caps_process failed\n"); + return 1; + } if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { ksft_print_msg("Wrong effective state%s\n", atsec); diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 00b441928909..16461dc0ffdf 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -pthread all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 0340d4ca8f51..432db923bced 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -195,10 +195,10 @@ int cg_write_numeric(const char *cgroup, const char *control, long value) return cg_write(cgroup, control, buf); } -int cg_find_unified_root(char *root, size_t len) +int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) { char buf[10 * PAGE_SIZE]; - char *fs, *mount, *type; + char *fs, *mount, *type, *options; const char delim[] = "\n\t "; if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) @@ -211,12 +211,14 @@ int cg_find_unified_root(char *root, size_t len) for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { mount = strtok(NULL, delim); type = strtok(NULL, delim); - strtok(NULL, delim); + options = strtok(NULL, delim); strtok(NULL, delim); strtok(NULL, delim); if (strcmp(type, "cgroup2") == 0) { strncpy(root, mount, len); + if (nsdelegate) + *nsdelegate = !!strstr(options, "nsdelegate"); return 0; } } diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 1df7f202214a..e8d04ac9e3d2 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -18,10 +18,10 @@ */ static inline int values_close(long a, long b, int err) { - return abs(a - b) <= (a + b) / 100 * err; + return labs(a - b) <= (a + b) / 100 * err; } -extern int cg_find_unified_root(char *root, size_t len); +extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); extern char *cg_name(const char *root, const char *name); extern char *cg_name_indexed(const char *root, const char *name, int index); extern char *cg_control(const char *cgroup, const char *control); diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 80aa6b2373b9..a5672a91d273 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -18,6 +18,8 @@ #include "../kselftest.h" #include "cgroup_util.h" +static bool nsdelegate; + static int touch_anon(char *buf, size_t size) { int fd; @@ -775,6 +777,9 @@ static int test_cgcore_lesser_ns_open(const char *root) pid_t pid; int status; + if (!nsdelegate) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_b"); @@ -862,7 +867,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 24020a2c68dc..dad2ed82f3ef 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -237,7 +237,7 @@ run_cpucg_weight_test( { int ret = KSFT_FAIL, i; char *parent = NULL; - struct cpu_hogger children[3] = {NULL}; + struct cpu_hogger children[3] = {}; parent = cg_name(root, "cpucg_test_0"); if (!parent) @@ -408,7 +408,7 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned) { int ret = KSFT_FAIL, i; char *parent = NULL, *child = NULL; - struct cpu_hogger leaf[3] = {NULL}; + struct cpu_hogger leaf[3] = {}; long nested_leaf_usage, child_usage; int nprocs = get_nprocs(); @@ -700,7 +700,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "cpu")) diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c index b061ed1e05b4..4034d14ba69a 100644 --- a/tools/testing/selftests/cgroup/test_cpuset.c +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -249,7 +249,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset")) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh new file mode 100755 index 000000000000..3f45512fb512 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -0,0 +1,46 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Test the special cpuset v1 hotplug case where a cpuset become empty of +# CPUs will force migration of tasks out to an ancestor. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, put a CPU and a task there and offline that CPU +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR +echo 1 > $CPUSET/$TDIR/cpuset.cpus +echo 0 > $CPUSET/$TDIR/cpuset.mems +sleep 10& +TASK=$! +echo $TASK > $CPUSET/$TDIR/tasks +NEWCS=$(cat /proc/$TASK/cpuset) +[[ $NEWCS != "/$TDIR" ]] && { + echo "Unexpected cpuset $NEWCS, test FAILED!" + exit 1 +} + +echo 0 > /sys/devices/system/cpu/cpu1/online +sleep 0.5 +echo 1 > /sys/devices/system/cpu/cpu1/online +NEWCS=$(cat /proc/$TASK/cpuset) +rmdir $CPUSET/$TDIR +[[ $NEWCS != "/" ]] && { + echo "cpuset $NEWCS, test FAILED!" + exit 1 +} +echo "Test PASSED" +exit 0 diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8845353aca53..8730645d363a 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -827,7 +827,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index f0fefeb4cc24..856f9508ea56 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -214,7 +214,7 @@ int main(int argc, char **argv) return ret; } - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); switch (test_hugetlb_memcg(root)) { diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c index 6153690319c9..0e5bb6c7307a 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -276,7 +276,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index c82f974b85c9..96693d8772be 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -192,7 +192,7 @@ static int test_kmem_memcg_deletion(const char *root) goto cleanup; sum = anon + file + kernel + sock; - if (abs(sum - current) < MAX_VMSTAT_ERROR) { + if (labs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { printf("memory.current = %ld\n", current); @@ -380,7 +380,7 @@ static int test_percpu_basic(const char *root) current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); - if (current > 0 && percpu > 0 && abs(current - percpu) < + if (current > 0 && percpu > 0 && labs(current - percpu) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else @@ -420,7 +420,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index c7c9572003a8..41ae8047b889 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -716,7 +716,9 @@ static bool reclaim_until(const char *memcg, long goal) */ static int test_memcg_reclaim(const char *root) { - int ret = KSFT_FAIL, fd, retries; + int ret = KSFT_FAIL; + int fd = -1; + int retries; char *memcg; long current, expected_usage; @@ -1314,7 +1316,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, proc_status, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index f0e488ed90d8..190096017f80 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,7 +50,7 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_cg_wb_count(const char *cg) +static long get_cg_wb_count(const char *cg) { return cg_read_key_long(cg, "memory.stat", "zswpwb"); } @@ -249,6 +249,132 @@ out: } /* + * Attempt writeback with the following steps: + * 1. Allocate memory. + * 2. Reclaim memory equal to the amount that was allocated in step 1. + This will move it into zswap. + * 3. Save current zswap usage. + * 4. Move the memory allocated in step 1 back in from zswap. + * 5. Set zswap.max to half the amount that was recorded in step 3. + * 6. Attempt to reclaim memory equal to the amount that was allocated, + this will either trigger writeback if it's enabled, or reclamation + will fail if writeback is disabled as there isn't enough zswap space. + */ +static int attempt_writeback(const char *cgroup, void *arg) +{ + long pagesize = sysconf(_SC_PAGESIZE); + char *test_group = arg; + size_t memsize = MB(4); + char buf[pagesize]; + long zswap_usage; + bool wb_enabled; + int ret = -1; + char *mem; + + wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); + mem = (char *)malloc(memsize); + if (!mem) + return ret; + + /* + * Fill half of each page with increasing data, and keep other + * half empty, this will result in data that is still compressible + * and ends up in zswap, with material zswap usage. + */ + for (int i = 0; i < pagesize; i++) + buf[i] = i < pagesize/2 ? (char) i : 0; + + for (int i = 0; i < memsize; i += pagesize) + memcpy(&mem[i], buf, pagesize); + + /* Try and reclaim allocated memory */ + if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + ksft_print_msg("Failed to reclaim all of the requested memory\n"); + goto out; + } + + zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + + /* zswpin */ + for (int i = 0; i < memsize; i += pagesize) { + if (memcmp(&mem[i], buf, pagesize)) { + ksft_print_msg("invalid memory\n"); + goto out; + } + } + + if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + goto out; + + /* + * If writeback is enabled, trying to reclaim memory now will trigger a + * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * If writeback is disabled, memory reclaim will fail as zswap is limited and + * it can't writeback to swap. + */ + ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + if (!wb_enabled) + ret = (ret == -EAGAIN) ? 0 : -1; + +out: + free(mem); + return ret; +} + +/* Test to verify the zswap writeback path */ +static int test_zswap_writeback(const char *root, bool wb) +{ + long zswpwb_before, zswpwb_after; + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "zswap_writeback_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) + goto out; + + zswpwb_before = get_cg_wb_count(test_group); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + goto out; + } + + if (cg_run(test_group, attempt_writeback, (void *) test_group)) + goto out; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(test_group); + if (zswpwb_after < 0) + goto out; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +static int test_zswap_writeback_enabled(const char *root) +{ + return test_zswap_writeback(root, true); +} + +static int test_zswap_writeback_disabled(const char *root) +{ + return test_zswap_writeback(root, false); +} + +/* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that * memcg. @@ -257,7 +383,7 @@ static int test_no_invasive_cgroup_shrink(const char *root) { int ret = KSFT_FAIL; size_t control_allocation_size = MB(10); - char *control_allocation, *wb_group = NULL, *control_group = NULL; + char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) @@ -342,7 +468,7 @@ static int test_no_kmem_bypass(const char *root) struct sysinfo sys_info; int ret = KSFT_FAIL; int child_status; - char *test_group; + char *test_group = NULL; pid_t child_pid; /* Read sys info and compute test values accordingly */ @@ -364,8 +490,6 @@ static int test_no_kmem_bypass(const char *root) trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ - if (cg_write(root, "cgroup.subtree_control", "+memory")) - goto out; test_group = cg_name(root, "kmem_bypass_test"); if (!test_group) goto out; @@ -425,6 +549,8 @@ struct zswap_test { T(test_zswap_usage), T(test_swapin_nozswap), T(test_zswapin), + T(test_zswap_writeback_enabled), + T(test_zswap_writeback_disabled), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; @@ -440,7 +566,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (!zswap_configured()) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 3c9bf0cd82a8..e61f07973ce5 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -95,9 +95,14 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) getpid(), pid); if (waitpid(-1, &status, __WALL) < 0) { - ksft_print_msg("Child returned %s\n", strerror(errno)); + ksft_print_msg("waitpid() returned %s\n", strerror(errno)); return -errno; } + if (!WIFEXITED(status)) { + ksft_print_msg("Child did not exit normally, status 0x%x\n", + status); + return EXIT_FAILURE; + } if (WEXITSTATUS(status)) return WEXITSTATUS(status); diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index 54a8b2445be9..ce0426786828 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -120,5 +120,5 @@ int main(int argc, char **argv) test_clone3_clear_sighand(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index ed785afb6077..bfb0da2b4fdd 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -114,7 +114,8 @@ static int call_clone3_set_tid(pid_t *set_tid, return WEXITSTATUS(status); } -static void test_clone3_set_tid(pid_t *set_tid, +static void test_clone3_set_tid(const char *desc, + pid_t *set_tid, size_t set_tid_size, int flags, int expected, @@ -129,17 +130,13 @@ static void test_clone3_set_tid(pid_t *set_tid, ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid, wait_for_it); ksft_print_msg( - "[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n", + "[%d] clone3() with CLONE_SET_TID %d says: %d - expected %d\n", getpid(), set_tid[0], ret, expected); - if (ret != expected) - ksft_test_result_fail( - "[%d] Result (%d) is different than expected (%d)\n", - getpid(), ret, expected); - else - ksft_test_result_pass( - "[%d] Result (%d) matches expectation (%d)\n", - getpid(), ret, expected); + + ksft_test_result(ret == expected, "%s with %zu TIDs and flags 0x%x\n", + desc, set_tid_size, flags); } + int main(int argc, char *argv[]) { FILE *f; @@ -172,73 +169,91 @@ int main(int argc, char *argv[]) /* Try invalid settings */ memset(&set_tid, 0, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. */ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0xff, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. */ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0, sizeof(set_tid)); /* Try with an invalid PID */ set_tid[0] = 0; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, 0 TID", + set_tid, 1, 0, -EINVAL, 0, 0); set_tid[0] = -1; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, 0, -EINVAL, 0, 0); /* Claim that the set_tid array actually contains 2 elements. */ - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("2 TIDs, -1 and 0", + set_tid, 2, 0, -EINVAL, 0, 0); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try with a valid PID (1) this should return -EEXIST. */ set_tid[0] = 1; if (uid == 0) - test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EEXIST, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, CLONE_NEWPID, 0, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* pid_max should fail everywhere */ set_tid[0] = pid_max; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, 0, -EINVAL, 0, 0); if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); @@ -262,10 +277,12 @@ int main(int argc, char *argv[]) /* After the child has finished, its PID should be free. */ set_tid[0] = pid; - test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0); + test_clone3_set_tid("reallocate child TID", + set_tid, 1, 0, 0, 0, 0); /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate child TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); /* * Creating a process with PID 1 in the newly created most nested @@ -274,7 +291,8 @@ int main(int argc, char *argv[]) */ set_tid[0] = 1; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0); + test_clone3_set_tid("create PID 1 in new NS", + set_tid, 2, CLONE_NEWPID, 0, pid, 0); ksft_print_msg("unshare PID namespace\n"); if (unshare(CLONE_NEWPID) == -1) @@ -284,7 +302,8 @@ int main(int argc, char *argv[]) set_tid[0] = pid; /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EINVAL, 0, 0); /* Let's create a PID 1 */ ns_pid = fork(); @@ -295,21 +314,25 @@ int main(int argc, char *argv[]) */ set_tid[0] = 43; set_tid[1] = -1; - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("check leak on invalid TID -1", + set_tid, 2, 0, -EINVAL, 0, 0); set_tid[0] = 43; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0); + test_clone3_set_tid("check leak on invalid specific TID", + set_tid, 2, 0, 0, 43, 0); ksft_print_msg("Child in PID namespace has PID %d\n", getpid()); set_tid[0] = 2; - test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0); + test_clone3_set_tid("create PID 2 in child NS", + set_tid, 1, 0, 0, 2, 0); set_tid[0] = 1; set_tid[1] = -1; set_tid[2] = pid; /* This should fail as there is invalid PID at level '1'. */ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to invalid TID at level 1", + set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); set_tid[0] = 1; set_tid[1] = 42; @@ -319,13 +342,15 @@ int main(int argc, char *argv[]) * namespaces. Again assuming this is running in the host's * PID namespace. Not yet nested. */ - test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to too few active PID NSs", + set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); /* * This should work and from the parent we should see * something like 'NSpid: pid 42 1'. */ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); + test_clone3_set_tid("verify that we have 3 PID NSs", + set_tid, 3, CLONE_NEWPID, 0, 42, true); child_exit(ksft_cnt.ksft_fail); } @@ -380,16 +405,14 @@ int main(int argc, char *argv[]) ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); ksft_cnt.ksft_fail = WEXITSTATUS(status); - if (ns3 == pid && ns2 == 42 && ns1 == 1) - ksft_test_result_pass( - "PIDs in all namespaces as expected (%d,%d,%d)\n", - ns3, ns2, ns1); - else - ksft_test_result_fail( - "PIDs in all namespaces not as expected (%d,%d,%d)\n", - ns3, ns2, ns1); + ksft_print_msg("Expecting PIDs %d, 42, 1\n", pid); + ksft_print_msg("Have PIDs in namespaces: %d, %d, %d\n", ns3, ns2, ns1); + ksft_test_result(ns3 == pid && ns2 == 42 && ns1 == 1, + "PIDs in all namespaces as expected\n"); out: ret = 0; - return !ret ? ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index c59e4adb905d..991c473e3859 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -17,6 +17,15 @@ #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_DUPFD_QUERY +#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -45,6 +54,15 @@ TEST(core_close_range) SKIP(return, "close_range() syscall not supported"); } + for (i = 0; i < 100; i++) { + ret = fcntl(open_fds[i], F_DUPFD_QUERY, open_fds[i + 1]); + if (ret < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(ret, 0); + } + } + EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0)); for (i = 0; i <= 50; i++) @@ -358,7 +376,7 @@ TEST(close_range_cloexec_unshare) */ TEST(close_range_cloexec_syzbot) { - int fd1, fd2, fd3, flags, ret, status; + int fd1, fd2, fd3, fd4, flags, ret, status; pid_t pid; struct __clone_args args = { .flags = CLONE_FILES, @@ -372,6 +390,13 @@ TEST(close_range_cloexec_syzbot) fd2 = dup2(fd1, 1000); EXPECT_GT(fd2, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd2); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); @@ -396,6 +421,15 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + + /* * Duplicating the file descriptor must remove the * FD_CLOEXEC flag. @@ -426,6 +460,24 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + fd4 = open("/dev/null", O_RDWR); + EXPECT_GT(fd4, 0); + + /* Same inode, different file pointers. */ + flags = fcntl(fd1, F_DUPFD_QUERY, fd4); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 0); + } + flags = fcntl(fd3, F_GETFD); EXPECT_GT(flags, -1); EXPECT_EQ(flags & FD_CLOEXEC, 0); @@ -433,6 +485,7 @@ TEST(close_range_cloexec_syzbot) EXPECT_EQ(close(fd1), 0); EXPECT_EQ(close(fd2), 0); EXPECT_EQ(close(fd3), 0); + EXPECT_EQ(close(fd4), 0); } /* diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index b583a2fb4504..a8b1dbc0a3a5 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -178,8 +178,7 @@ cpufreq_basic_tests() count=$(count_cpufreq_managed_cpus) if [ $count = 0 ]; then - printf "No cpu is managed by cpufreq core, exiting\n" - exit; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting\n" else printf "CPUFreq manages: $count CPUs\n\n" fi diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index 60ce18ed0666..a0eb84cf7167 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -7,15 +7,15 @@ source governor.sh source module.sh source special-tests.sh +DIR="$(dirname $(readlink -f "$0"))" +source "${DIR}"/../kselftest/ktap_helpers.sh + FUNC=basic # do basic tests by default OUTFILE=cpufreq_selftest SYSFS= CPUROOT= CPUFREQROOT= -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - helpme() { printf "Usage: $0 [-h] [-todg args] @@ -32,7 +32,7 @@ helpme() [-d <driver's module name: only with \"-t modtest>\"] [-g <governor's module name: only with \"-t modtest>\"] \n" - exit 2 + exit "${KSFT_FAIL}" } prerequisite() @@ -40,8 +40,8 @@ prerequisite() msg="skip all tests:" if [ $UID != 0 ]; then - echo $msg must be run as root >&2 - exit $ksft_skip + ktap_skip_all "$msg must be run as root" + exit "${KSFT_SKIP}" fi taskset -p 01 $$ @@ -49,21 +49,21 @@ prerequisite() SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'` if [ ! -d "$SYSFS" ]; then - echo $msg sysfs is not mounted >&2 - exit 2 + ktap_skip_all "$msg sysfs is not mounted" + exit "${KSFT_SKIP}" fi CPUROOT=$SYSFS/devices/system/cpu CPUFREQROOT="$CPUROOT/cpufreq" if ! ls $CPUROOT/cpu* > /dev/null 2>&1; then - echo $msg cpus not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpus not available in sysfs" + exit "${KSFT_SKIP}" fi if ! ls $CPUROOT/cpufreq > /dev/null 2>&1; then - echo $msg cpufreq directory not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpufreq directory not available in sysfs" + exit "${KSFT_SKIP}" fi } @@ -105,8 +105,7 @@ do_test() count=$(count_cpufreq_managed_cpus) if [ $count = 0 -a $FUNC != "modtest" ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi case "$FUNC" in @@ -125,8 +124,7 @@ do_test() "modtest") # Do we have modules in place? if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then - echo "No driver or governor module passed with -d or -g" - exit 2; + ktap_exit_fail_msg "No driver or governor module passed with -d or -g" fi if [ $DRIVER_MOD ]; then @@ -137,8 +135,7 @@ do_test() fi else if [ $count = 0 ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi module_governor_test $GOVERNOR_MOD @@ -162,7 +159,7 @@ do_test() ;; *) - echo "Invalid [-f] function type" + ktap_print_msg "Invalid [-f] function type" helpme ;; esac @@ -186,13 +183,25 @@ dmesg_dumps() dmesg >> $1.dmesg_full.txt } +ktap_print_header + # Parse arguments parse_arguments $@ +ktap_set_plan 1 + # Make sure all requirements are met prerequisite # Run requested functions clear_dumps $OUTFILE do_test | tee -a $OUTFILE.txt +if [ "${PIPESTATUS[0]}" -ne 0 ]; then + exit ${PIPESTATUS[0]}; +fi dmesg_dumps $OUTFILE + +ktap_test_pass "Completed successfully" + +ktap_print_totals +exit "${KSFT_PASS}" diff --git a/tools/testing/selftests/cpufreq/module.sh b/tools/testing/selftests/cpufreq/module.sh index 22563cd122e7..7f2667e0ae2d 100755 --- a/tools/testing/selftests/cpufreq/module.sh +++ b/tools/testing/selftests/cpufreq/module.sh @@ -24,16 +24,14 @@ test_basic_insmod_rmmod() # insert module insmod $1 if [ $? != 0 ]; then - printf "Insmod $1 failed\n" - exit; + ktap_exit_fail_msg "Insmod $1 failed\n" fi printf "Removing $1 module\n" # remove module rmmod $1 if [ $? != 0 ]; then - printf "rmmod $1 failed\n" - exit; + ktap_exit_fail_msg "rmmod $1 failed\n" fi printf "\n" diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 789d6949c247..29a22f50e762 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,16 +7,21 @@ TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh + +# functionality tests TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py +TEST_PROGS += reclaim.sh lru_sort.sh + +# regression tests (reproducers of previously found bugs) TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += debugfs_target_ids_pid_leak.sh -TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py -TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py damos_apply_interval.py -TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index d23d7398a27a..2bd44c32be1b 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -2,7 +2,18 @@ import os -sysfs_root = '/sys/kernel/mm/damon/admin' +ksft_skip=4 + +sysfs_root = None +with open('/proc/mounts', 'r') as f: + for line in f: + dev_name, mount_point, dev_fs = line.split()[:3] + if dev_fs == 'sysfs': + sysfs_root = '%s/kernel/mm/damon/admin' % mount_point + break +if sysfs_root is None: + print('Seems sysfs not mounted?') + exit(ksft_skip) def write_file(path, string): "Returns error string if failed, or None otherwise" @@ -34,11 +45,11 @@ class DamosAccessPattern: self.nr_accesses = nr_accesses self.age = age - if self.size == None: + if self.size is None: self.size = [0, 2**64 - 1] - if self.nr_accesses == None: + if self.nr_accesses is None: self.nr_accesses = [0, 2**64 - 1] - if self.age == None: + if self.age is None: self.age = [0, 2**64 - 1] def sysfs_dir(self): @@ -47,55 +58,109 @@ class DamosAccessPattern: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), self.nr_accesses[0]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), self.nr_accesses[1]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'max'), self.age[1]) - if err != None: + if err is not None: return err +qgoal_metric_user_input = 'user_input' +qgoal_metric_some_mem_psi_us = 'some_mem_psi_us' +qgoal_metrics = [qgoal_metric_user_input, qgoal_metric_some_mem_psi_us] + +class DamosQuotaGoal: + metric = None + target_value = None + current_value = None + effective_bytes = None + quota = None # owner quota + idx = None + + def __init__(self, metric, target_value=10000, current_value=0): + self.metric = metric + self.target_value = target_value + self.current_value = current_value + + def sysfs_dir(self): + return os.path.join(self.quota.sysfs_dir(), 'goals', '%d' % self.idx) + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'target_metric'), + self.metric) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'target_value'), + self.target_value) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'current_value'), + self.current_value) + if err is not None: + return err + return None + class DamosQuota: sz = None # size quota, in bytes ms = None # time quota + goals = None # quota goals reset_interval_ms = None # quota reset interval scheme = None # owner scheme - def __init__(self, sz=0, ms=0, reset_interval_ms=0): + def __init__(self, sz=0, ms=0, goals=None, reset_interval_ms=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms + self.goals = goals if goals is not None else [] + for idx, goal in enumerate(self.goals): + goal.idx = idx + goal.quota = self def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), self.reset_interval_ms) - if err != None: + if err is not None: return err + nr_goals_file = os.path.join(self.sysfs_dir(), 'goals', 'nr_goals') + content, err = read_file(nr_goals_file) + if err is not None: + return err + if int(content) != len(self.goals): + err = write_file(nr_goals_file, len(self.goals)) + if err is not None: + return err + for goal in self.goals: + err = goal.stage() + if err is not None: + return err + return None + class DamosStats: nr_tried = None sz_tried = None @@ -136,30 +201,30 @@ class Damos: def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) - if err != None: + if err is not None: return err err = self.access_pattern.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), '%d' % self.apply_interval_us) - if err != None: + if err is not None: return err err = self.quota.stage() - if err != None: + if err is not None: return err # disable watermarks err = write_file( os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') - if err != None: + if err is not None: return err # disable filters err = write_file( os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') - if err != None: + if err is not None: return err class DamonTarget: @@ -178,7 +243,7 @@ class DamonTarget: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') - if err != None: + if err is not None: return err return write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) @@ -210,27 +275,27 @@ class DamonAttrs: def stage(self): err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), self.sample_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), self.aggr_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), self.update_us) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), self.min_nr_regions) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), self.max_nr_regions) - if err != None: + if err is not None: return err class DamonCtx: @@ -264,36 +329,38 @@ class DamonCtx: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'operations'), self.ops) - if err != None: + if err is not None: return err err = self.monitoring_attrs.stage() - if err != None: + if err is not None: return err nr_targets_file = os.path.join( self.sysfs_dir(), 'targets', 'nr_targets') content, err = read_file(nr_targets_file) - if err != None: + if err is not None: return err if int(content) != len(self.targets): err = write_file(nr_targets_file, '%d' % len(self.targets)) - if err != None: + if err is not None: return err for target in self.targets: err = target.stage() - if err != None: + if err is not None: return err nr_schemes_file = os.path.join( self.sysfs_dir(), 'schemes', 'nr_schemes') content, err = read_file(nr_schemes_file) + if err is not None: + return err if int(content) != len(self.schemes): err = write_file(nr_schemes_file, '%d' % len(self.schemes)) - if err != None: + if err is not None: return err for scheme in self.schemes: err = scheme.stage() - if err != None: + if err is not None: return err return None @@ -317,16 +384,16 @@ class Kdamond: nr_contexts_file = os.path.join(self.sysfs_dir(), 'contexts', 'nr_contexts') content, err = read_file(nr_contexts_file) - if err != None: + if err is not None: return err if int(content) != len(self.contexts): err = write_file(nr_contexts_file, '%d' % len(self.contexts)) - if err != None: + if err is not None: return err for context in self.contexts: err = context.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err @@ -334,20 +401,20 @@ class Kdamond: def update_schemes_tried_bytes(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_bytes') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: content, err = read_file(os.path.join(scheme.sysfs_dir(), 'tried_regions', 'total_bytes')) - if err != None: + if err is not None: return err scheme.tried_bytes = int(content) def update_schemes_stats(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_stats') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: @@ -356,11 +423,39 @@ class Kdamond: 'sz_applied', 'qt_exceeds']: content, err = read_file( os.path.join(scheme.sysfs_dir(), 'stats', stat)) - if err != None: + if err is not None: return err stat_values.append(int(content)) scheme.stats = DamosStats(*stat_values) + def update_schemes_effective_quotas(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_effective_quotas') + if err is not None: + return err + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + content, err = read_file( + os.path.join(scheme.quota.sysfs_dir(), + 'effective_bytes')) + if err is not None: + return err + goal.effective_bytes = int(content) + return None + + def commit_schemes_quota_goals(self): + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + err = goal.stage() + if err is not None: + print('commit_schemes_quota_goals failed stagign: %s'% + err) + exit(1) + return write_file(os.path.join(self.sysfs_dir(), 'state'), + 'commit_schemes_quota_goals') + class Kdamonds: kdamonds = [] @@ -376,10 +471,10 @@ class Kdamonds: def start(self): err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), '%s' % len(self.kdamonds)) - if err != None: + if err is not None: return err for kdamond in self.kdamonds: err = kdamond.start() - if err != None: + if err is not None: return err return None diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py new file mode 100644 index 000000000000..18246f3b62f7 --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota_goal.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + goal = _damon_sysfs.DamosQuotaGoal( + metric=_damon_sysfs.qgoal_metric_user_input, target_value=10000) + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + action='stat', + quota=_damon_sysfs.DamosQuota( + goals=[goal], reset_interval_ms=100), + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + score_values_to_test = [0, 15000, 5000, 18000] + while proc.poll() == None: + if len(score_values_to_test) == 0: + time.sleep(0.1) + continue + + goal.current_value = score_values_to_test.pop(0) + expect_increase = goal.current_value < goal.target_value + + err = kdamonds.kdamonds[0].commit_schemes_quota_goals() + if err is not None: + print('commit_schemes_quota_goals failed: %s' % err) + exit(1) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('before-update_schemes_effective_quotas failed: %s' % err) + exit(1) + last_effective_bytes = goal.effective_bytes + + time.sleep(0.5) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('after-update_schemes_effective_quotas failed: %s' % err) + exit(1) + + print('score: %s, effective quota: %d -> %d (%.3fx)' % ( + goal.current_value, last_effective_bytes, goal.effective_bytes, + goal.effective_bytes / last_effective_bytes + if last_effective_bytes != 0 else -1.0)) + + if last_effective_bytes == goal.effective_bytes: + print('efective bytes not changed: %d' % goal.effective_bytes) + exit(1) + + increased = last_effective_bytes < goal.effective_bytes + if expect_increase != increased: + print('expectation of increase (%s) != increased (%s)' % + (expect_increase, increased)) + exit(1) + last_effective_bytes = goal.effective_bytes + +if __name__ == '__main__': + main() diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 890a8236a8ba..5f541522364f 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -15,6 +15,7 @@ #include <linux/dma-buf.h> #include <linux/dma-heap.h> #include <drm/drm.h> +#include "../kselftest.h" #define DEVPATH "/dev/dma_heap" @@ -90,14 +91,13 @@ static int dmabuf_heap_open(char *name) char buf[256]; ret = snprintf(buf, 256, "%s/%s", DEVPATH, name); - if (ret < 0) { - printf("snprintf failed!\n"); - return ret; - } + if (ret < 0) + ksft_exit_fail_msg("snprintf failed! %d\n", ret); fd = open(buf, O_RDWR); if (fd < 0) - printf("open %s failed!\n", buf); + ksft_exit_fail_msg("open %s failed: %s\n", buf, strerror(errno)); + return fd; } @@ -140,7 +140,7 @@ static int dmabuf_sync(int fd, int start_stop) #define ONE_MEG (1024 * 1024) -static int test_alloc_and_import(char *heap_name) +static void test_alloc_and_import(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1, importer_fd = -1; uint32_t handle = 0; @@ -148,27 +148,19 @@ static int test_alloc_and_import(char *heap_name) int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing allocation and importing: "); + ksft_print_msg("Testing allocation and importing:\n"); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0, &dmabuf_fd); if (ret) { - printf("FAIL (Allocation Failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (Allocation Failed!) %d\n", ret); + return; } + /* mmap and write a simple pattern */ - p = mmap(NULL, - ONE_MEG, - PROT_READ | PROT_WRITE, - MAP_SHARED, - dmabuf_fd, - 0); + p = mmap(NULL, ONE_MEG, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd, 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed): %s\n", strerror(errno)); + goto close_and_return; } dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); @@ -178,71 +170,64 @@ static int test_alloc_and_import(char *heap_name) importer_fd = open_vgem(); if (importer_fd < 0) { - ret = importer_fd; - printf("(Could not open vgem - skipping): "); + ksft_test_result_skip("Could not open vgem %d\n", importer_fd); } else { ret = import_vgem_fd(importer_fd, dmabuf_fd, &handle); - if (ret < 0) { - printf("FAIL (Failed to import buffer)\n"); - goto out; - } + ksft_test_result(ret >= 0, "Import buffer %d\n", ret); } ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_START failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_START failed!) %d\n", ret); goto out; } memset(p, 0xff, ONE_MEG); ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_END failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_END failed!) %d\n", ret); goto out; } close_handle(importer_fd, handle); - ret = 0; - printf(" OK\n"); + ksft_test_result_pass("%s dmabuf sync succeeded\n", __func__); + return; + out: - if (p) - munmap(p, ONE_MEG); - if (importer_fd >= 0) - close(importer_fd); - if (dmabuf_fd >= 0) - close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result_fail("%s dmabuf sync failed\n", __func__); + munmap(p, ONE_MEG); + close(importer_fd); - return ret; +close_and_return: + close(dmabuf_fd); + close(heap_fd); } -static int test_alloc_zeroed(char *heap_name, size_t size) +static void test_alloc_zeroed(char *heap_name, size_t size) { int heap_fd = -1, dmabuf_fd[32]; - int i, j, ret; + int i, j, k, ret; void *p = NULL; char *c; - printf(" Testing alloced %ldk buffers are zeroed: ", size / 1024); + ksft_print_msg("Testing alloced %ldk buffers are zeroed:\n", size / 1024); heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; /* Allocate and fill a bunch of buffers */ for (i = 0; i < 32; i++) { ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); - if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + if (ret) { + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } + /* mmap and fill with simple pattern */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); memset(p, 0xff, size); dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); @@ -251,48 +236,47 @@ static int test_alloc_zeroed(char *heap_name, size_t size) /* close them all */ for (i = 0; i < 32; i++) close(dmabuf_fd[i]); + ksft_test_result_pass("Allocate and fill a bunch of buffers\n"); /* Allocate and validate all buffers are zeroed */ for (i = 0; i < 32; i++) { ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } /* mmap and validate everything is zero */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); c = (char *)p; for (j = 0; j < size; j++) { if (c[j] != 0) { - printf("FAIL (Allocated buffer not zeroed @ %i)\n", j); - break; + ksft_print_msg("FAIL (Allocated buffer not zeroed @ %i)\n", j); + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); + munmap(p, size); + goto out; } } dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); munmap(p, size); } - /* close them all */ - for (i = 0; i < 32; i++) - close(dmabuf_fd[i]); - - close(heap_fd); - printf("OK\n"); - return 0; out: - while (i > 0) { - close(dmabuf_fd[i]); - i--; - } + ksft_test_result(i == 32, "Allocate and validate all buffers are zeroed\n"); + +close_and_return: + /* close them all */ + for (k = 0; k < i; k++) + close(dmabuf_fd[k]); + close(heap_fd); - return ret; + return; } /* Test the ioctl version compatibility w/ a smaller structure then expected */ @@ -360,126 +344,97 @@ static int dmabuf_heap_alloc_newer(int fd, size_t len, unsigned int flags, return ret; } -static int test_alloc_compat(char *heap_name) +static void test_alloc_compat(char *heap_name) { - int heap_fd = -1, dmabuf_fd = -1; - int ret; + int ret, heap_fd = -1, dmabuf_fd = -1; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing (theoretical)older alloc compat: "); + ksft_print_msg("Testing (theoretical) older alloc compat:\n"); ret = dmabuf_heap_alloc_older(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Older compat allocation failed!)\n"); - ret = -1; - goto out; - } - close(dmabuf_fd); - printf("OK\n"); + if (dmabuf_fd >= 0) + close(dmabuf_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_older\n"); - printf(" Testing (theoretical)newer alloc compat: "); + ksft_print_msg("Testing (theoretical) newer alloc compat:\n"); ret = dmabuf_heap_alloc_newer(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Newer compat allocation failed!)\n"); - ret = -1; - goto out; - } - printf("OK\n"); -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_newer\n"); - return ret; + close(heap_fd); } -static int test_alloc_errors(char *heap_name) +static void test_alloc_errors(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1; int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing expected error cases: "); + ksft_print_msg("Testing expected error cases:\n"); ret = dmabuf_heap_alloc(0, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid fd %d\n", ret); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid heap flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); ret = dmabuf_heap_alloc_fdflags(heap_fd, ONE_MEG, ~(O_RDWR | O_CLOEXEC), 0, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); - printf("OK\n"); - ret = 0; -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + close(heap_fd); +} - return ret; +static int numer_of_heaps(void) +{ + DIR *d = opendir(DEVPATH); + struct dirent *dir; + int heaps = 0; + + while ((dir = readdir(d))) { + if (!strncmp(dir->d_name, ".", 2)) + continue; + if (!strncmp(dir->d_name, "..", 3)) + continue; + heaps++; + } + + return heaps; } int main(void) { - DIR *d; struct dirent *dir; - int ret = -1; + DIR *d; + + ksft_print_header(); d = opendir(DEVPATH); if (!d) { - printf("No %s directory?\n", DEVPATH); - return -1; + ksft_print_msg("No %s directory?\n", DEVPATH); + return KSFT_SKIP; } - while ((dir = readdir(d)) != NULL) { + ksft_set_plan(11 * numer_of_heaps()); + + while ((dir = readdir(d))) { if (!strncmp(dir->d_name, ".", 2)) continue; if (!strncmp(dir->d_name, "..", 3)) continue; - printf("Testing heap: %s\n", dir->d_name); - printf("=======================================\n"); - ret = test_alloc_and_import(dir->d_name); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, 4 * 1024); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, ONE_MEG); - if (ret) - break; - - ret = test_alloc_compat(dir->d_name); - if (ret) - break; - - ret = test_alloc_errors(dir->d_name); - if (ret) - break; + ksft_print_msg("Testing heap: %s\n", dir->d_name); + ksft_print_msg("=======================================\n"); + test_alloc_and_import(dir->d_name); + test_alloc_zeroed(dir->d_name, 4 * 1024); + test_alloc_zeroed(dir->d_name, ONE_MEG); + test_alloc_compat(dir->d_name); + test_alloc_errors(dir->d_name); } closedir(d); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile new file mode 100644 index 000000000000..e54f382bcb02 --- /dev/null +++ b/tools/testing/selftests/drivers/net/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := $(wildcard lib/py/*.py) + +TEST_PROGS := \ + ping.py \ + queues.py \ + stats.py \ +# end of TEST_PROGS + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst new file mode 100644 index 000000000000..3b6a29e6564b --- /dev/null +++ b/tools/testing/selftests/drivers/net/README.rst @@ -0,0 +1,136 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Running driver tests +==================== + +Networking driver tests are executed within kselftest framework like any +other tests. They support testing both real device drivers and emulated / +software drivers (latter mostly to test the core parts of the stack). + +SW mode +~~~~~~~ + +By default, when no extra parameters are set or exported, tests execute +against software drivers such as netdevsim. No extra preparation is required +the software devices are created and destroyed as part of the test. +In this mode the tests are indistinguishable from other selftests and +(for example) can be run under ``virtme-ng`` like the core networking selftests. + +HW mode +~~~~~~~ + +Executing tests against a real device requires external preparation. +The netdevice against which tests will be run must exist, be running +(in UP state) and be configured with an IP address. + +Refer to list of :ref:`Variables` later in this file to set up running +the tests against a real device. + +Both modes required +~~~~~~~~~~~~~~~~~~~ + +All tests in drivers/net must support running both against a software device +and a real device. SW-only tests should instead be placed in net/ or +drivers/net/netdevsim, HW-only tests in drivers/net/hw. + +Variables +========= + +The variables can be set in the environment or by creating a net.config +file in the same directory as this README file. Example:: + + $ NETIF=eth0 ./some_test.sh + +or:: + + $ cat tools/testing/selftests/drivers/net/net.config + # Variable set in a file + NETIF=eth0 + +Local test (which don't require endpoint for sending / receiving traffic) +need only the ``NETIF`` variable. Remaining variables define the endpoint +and communication method. + +NETIF +~~~~~ + +Name of the netdevice against which the test should be executed. +When empty or not set software devices will be used. + +LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local and remote endpoint IP addresses. + +REMOTE_TYPE +~~~~~~~~~~~ + +Communication method used to run commands on the remote endpoint. +Test framework has built-in support for ``netns`` and ``ssh`` channels. +``netns`` assumes the "remote" interface is part of the same +host, just moved to the specified netns. +``ssh`` communicates with remote endpoint over ``ssh`` and ``scp``. +Using persistent SSH connections is strongly encouraged to avoid +the latency of SSH connection setup on every command. + +Communication methods are defined by classes in ``lib/py/remote_{name}.py``. +It should be possible to add a new method without modifying any of +the framework, by simply adding an appropriately named file to ``lib/py``. + +REMOTE_ARGS +~~~~~~~~~~~ + +Arguments used to construct the communication channel. +Communication channel dependent:: + + for netns - name of the "remote" namespace + for ssh - name/address of the remote host + +Example +======= + +Build the selftests:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" + +"Install" the tests and copy them over to the target machine:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" \ + install INSTALL_PATH=/tmp/ksft-net-drv + + # rsync -ra --delete /tmp/ksft-net-drv root@192.168.1.1:/root/ + +On the target machine, running the tests will use netdevsim by default:: + + [/root] # ./ksft-net-drv/run_kselftest.sh -t drivers/net:ping.py + TAP version 13 + 1..1 + # timeout set to 45 + # selftests: drivers/net: ping.py + # KTAP version 1 + # 1..3 + # ok 1 ping.test_v4 + # ok 2 ping.test_v6 + # ok 3 ping.test_tcp + # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 + ok 1 selftests: drivers/net: ping.py + +Create a config with remote info:: + + [/root] # cat > ./ksft-net-drv/drivers/net/net.config <<EOF + NETIF=eth0 + LOCAL_V4=192.168.1.1 + REMOTE_V4=192.168.1.2 + REMOTE_TYPE=ssh + REMOTE_ARGS=root@192.168.1.2 + EOF + +Run the test:: + + [/root] # ./ksft-net-drv/drivers/net/ping.py + KTAP version 1 + 1..3 + ok 1 ping.test_v4 + ok 2 ping.test_v6 # SKIP Test requires IPv6 connectivity + ok 3 ping.test_tcp + # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:1 error:0 diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config new file mode 100644 index 000000000000..f6a58ce8a230 --- /dev/null +++ b/tools/testing/selftests/drivers/net/config @@ -0,0 +1,2 @@ +CONFIG_IPV6=y +CONFIG_NETDEVSIM=m diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile new file mode 100644 index 000000000000..4933d045ab66 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = \ + csum.py \ + devlink_port_split.py \ + ethtool.sh \ + ethtool_extended_state.sh \ + ethtool_mm.sh \ + ethtool_rmon.sh \ + hw_stats_l3.sh \ + hw_stats_l3_gre.sh \ + loopback.sh \ + pp_alloc_fail.py \ + # + +TEST_FILES := \ + ethtool_lib.sh \ + # + +TEST_INCLUDES := \ + $(wildcard lib/py/*.py ../lib/py/*.py) \ + ../../../net/lib.sh \ + ../../../net/forwarding/lib.sh \ + ../../../net/forwarding/ipip_lib.sh \ + ../../../net/forwarding/tc_common.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py new file mode 100755 index 000000000000..cb40497faee4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/csum.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Run the tools/testing/selftests/net/csum testsuite.""" + +from os import path + +from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import EthtoolFamily, NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen + +def test_receive(cfg, ipv4=False, extra_args=None): + """Test local nic checksum receive. Remote host sends crafted packets.""" + if not cfg.have_rx_csum: + raise KsftSkipEx(f"Test requires rx checksum offload on {cfg.ifname}") + + if ipv4: + ip_args = f"-4 -S {cfg.remote_v4} -D {cfg.v4}" + else: + ip_args = f"-6 -S {cfg.remote_v6} -D {cfg.v6}" + + rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}" + tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}" + + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(34000, proto="udp") + cmd(tx_cmd, host=cfg.remote) + + +def test_transmit(cfg, ipv4=False, extra_args=None): + """Test local nic checksum transmit. Remote host verifies packets.""" + if (not cfg.have_tx_csum_generic and + not (cfg.have_tx_csum_ipv4 and ipv4) and + not (cfg.have_tx_csum_ipv6 and not ipv4)): + raise KsftSkipEx(f"Test requires tx checksum offload on {cfg.ifname}") + + if ipv4: + ip_args = f"-4 -S {cfg.v4} -D {cfg.remote_v4}" + else: + ip_args = f"-6 -S {cfg.v6} -D {cfg.remote_v6}" + + # Cannot randomize input when calculating zero checksum + if extra_args != "-U -Z": + extra_args += " -r 1" + + rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}" + tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}" + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(34000, proto="udp", host=cfg.remote) + cmd(tx_cmd) + + +def test_builder(name, cfg, ipv4=False, tx=False, extra_args=""): + """Construct specific tests from the common template. + + Most tests follow the same basic pattern, differing only in + Direction of the test and optional flags passed to csum.""" + def f(cfg): + if ipv4: + cfg.require_v4() + else: + cfg.require_v6() + + if tx: + test_transmit(cfg, ipv4, extra_args) + else: + test_receive(cfg, ipv4, extra_args) + + if ipv4: + f.__name__ = "ipv4_" + name + else: + f.__name__ = "ipv6_" + name + return f + + +def check_nic_features(cfg) -> None: + """Test whether Tx and Rx checksum offload are enabled. + + If the device under test has either off, then skip the relevant tests.""" + cfg.have_tx_csum_generic = False + cfg.have_tx_csum_ipv4 = False + cfg.have_tx_csum_ipv6 = False + cfg.have_rx_csum = False + + ethnl = EthtoolFamily() + features = ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) + for f in features["active"]["bits"]["bit"]: + if f["name"] == "tx-checksum-ip-generic": + cfg.have_tx_csum_generic = True + elif f["name"] == "tx-checksum-ipv4": + cfg.have_tx_csum_ipv4 = True + elif f["name"] == "tx-checksum-ipv6": + cfg.have_tx_csum_ipv6 = True + elif f["name"] == "rx-checksum": + cfg.have_rx_csum = True + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + check_nic_features(cfg) + + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../net/lib/csum") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + cases = [] + for ipv4 in [True, False]: + cases.append(test_builder("rx_tcp", cfg, ipv4, False, "-t")) + cases.append(test_builder("rx_tcp_invalid", cfg, ipv4, False, "-t -E")) + + cases.append(test_builder("rx_udp", cfg, ipv4, False, "")) + cases.append(test_builder("rx_udp_invalid", cfg, ipv4, False, "-E")) + + cases.append(test_builder("tx_udp_csum_offload", cfg, ipv4, True, "-U")) + cases.append(test_builder("tx_udp_zero_checksum", cfg, ipv4, True, "-U -Z")) + + ksft_run(cases=cases, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py index 2d84c7a0be6b..2d84c7a0be6b 100755 --- a/tools/testing/selftests/net/devlink_port_split.py +++ b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh index aa2eafb7b243..fa6953de6b6d 100755 --- a/tools/testing/selftests/net/forwarding/ethtool.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -10,7 +10,8 @@ ALL_TESTS=" different_speeds_autoneg_on " NUM_NETIFS=2 -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh source ethtool_lib.sh h1_create() @@ -64,9 +65,8 @@ same_speeds_autoneg_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "speed $speed autoneg off" - log_test "force of same speed autoneg off" - log_info "speed = $speed" + check_err $? "ping with speed $speed autoneg off" + log_test "force speed $speed on both ends" done ethtool -s $h2 autoneg on @@ -111,9 +111,8 @@ combination_of_neg_on_and_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1-speed=$speed autoneg off, h2 autoneg on" - log_test "one side with autoneg off and another with autoneg on" - log_info "force speed = $speed" + check_err $? "ping with h1-speed=$speed autoneg off, h2 autoneg on" + log_test "force speed $speed vs. autoneg" done ethtool -s $h1 autoneg on @@ -206,10 +205,9 @@ advertise_subset_of_speeds() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + check_err $? "ping with h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" - log_test "advertise subset of speeds" - log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" + log_test "advertise $speed_1_to_advertise vs. $speed_2_to_advertise" done ethtool -s $h2 autoneg on @@ -286,8 +284,6 @@ different_speeds_autoneg_on() ethtool -s $h1 autoneg on } -skip_on_veth - trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh index 17f89c3b7c02..a7584448416e 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh @@ -8,7 +8,8 @@ ALL_TESTS=" " NUM_NETIFS=2 -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh source ethtool_lib.sh TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms @@ -108,8 +109,6 @@ no_cable() ip link set dev $swp3 down } -skip_on_veth - setup_prepare tests_run diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh index b9bfb45085af..b9bfb45085af 100644 --- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh index 50d5bfb17ef1..c301e735c8ab 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh @@ -14,7 +14,8 @@ ALL_TESTS=" NUM_NETIFS=2 REQUIRE_MZ=no PREEMPTIBLE_PRIO=0 -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh traffic_test() { diff --git a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh index 41a34a61f763..8f60c1685ad4 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -7,7 +7,8 @@ ALL_TESTS=" " NUM_NETIFS=2 -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh ETH_FCS_LEN=4 ETH_HLEN=$((6+6+2)) @@ -43,6 +44,7 @@ bucket_test() # Mausezahn does not include FCS bytes in its length - but the # histogram counters do len=$((len - ETH_FCS_LEN)) + len=$((len > 0 ? len : 0)) before=$(ethtool --json -S $iface --groups rmon | \ jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") @@ -78,7 +80,7 @@ rmon_histogram() for if in $iface $neigh; do if ! ensure_mtu $if ${bucket[0]}; then - log_test_skip "$if does not support the required MTU for $step" + log_test_xfail "$if does not support the required MTU for $step" return fi done @@ -93,7 +95,7 @@ rmon_histogram() jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) if [ $nbuckets -eq 0 ]; then - log_test_skip "$iface does not support $set histogram counters" + log_test_xfail "$iface does not support $set histogram counters" return fi } diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh index 48584a51388f..67fafefc80be 100755 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -48,7 +48,9 @@ ALL_TESTS=" test_double_enable " NUM_NETIFS=4 -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh h1_create() { @@ -324,17 +326,9 @@ setup_wait used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | jq '.[].info.l3_stats.used') -kind=$(ip -j -d link show dev $rp1 | - jq -r '.[].linkinfo.info_kind') -if [[ $used != true ]]; then - if [[ $kind == veth ]]; then - log_test_skip "l3_stats not offloaded on veth interface" - EXIT_STATUS=$ksft_skip - else - RET=1 log_test "l3_stats not offloaded" - fi -else - tests_run -fi +[[ $used = true ]] +check_err $? "hw_stats_info.used=$used" +log_test "l3_stats offloaded" +tests_run exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh index 7594bbb49029..a94d92e1abce 100755 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -12,8 +12,10 @@ ALL_TESTS=" test_stats_tx " NUM_NETIFS=6 -source lib.sh -source ipip_lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/ipip_lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh setup_prepare() { @@ -99,8 +101,6 @@ test_stats_rx() test_stats g2a rx } -skip_on_veth - trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py new file mode 100644 index 000000000000..b582885786f5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * + from drivers.net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/drivers/net/hw/loopback.sh index 8f4057310b5b..5acc3ff820aa 100755 --- a/tools/testing/selftests/net/forwarding/loopback.sh +++ b/tools/testing/selftests/drivers/net/hw/loopback.sh @@ -6,8 +6,9 @@ ksft_skip=4 ALL_TESTS="loopback_test" NUM_NETIFS=2 -source tc_common.sh -source lib.sh +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/tc_common.sh +source "$lib_dir"/../../../net/forwarding/lib.sh h1_create() { diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py new file mode 100755 index 000000000000..026d98976c35 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +import os +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import NetdevFamily, NlError +from lib.py import NetDrvEpEnv +from lib.py import cmd, tool, GenerateTraffic + + +def _write_fail_config(config): + for key, value in config.items(): + with open("/sys/kernel/debug/fail_function/" + key, "w") as fp: + fp.write(str(value) + "\n") + + +def _enable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + raise KsftSkipEx("Kernel built without function error injection (or DebugFS)") + + if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("page_pool_alloc_pages\n") + + _write_fail_config({ + "verbose": 0, + "interval": 511, + "probability": 100, + "times": -1, + }) + + +def _disable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + return + + if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("\n") + + _write_fail_config({ + "probability": 0, + "times": 0, + }) + + +def test_pp_alloc(cfg, netdevnl): + def get_stats(): + return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + + def check_traffic_flowing(): + stat1 = get_stats() + time.sleep(1) + stat2 = get_stats() + if stat2['rx-packets'] - stat1['rx-packets'] < 15000: + raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets']) + + + try: + stats = get_stats() + except NlError as e: + if e.nl_msg.error == -95: + stats = {} + else: + raise + if 'rx-alloc-fail' not in stats: + raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats") + + set_g = False + traffic = None + try: + traffic = GenerateTraffic(cfg) + + check_traffic_flowing() + + _enable_pp_allocation_fail() + + s1 = get_stats() + time.sleep(3) + s2 = get_stats() + + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1: + raise KsftSkipEx("Allocation failures not increasing") + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100: + raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'], + "packets:", s2['rx-packets'] - s1['rx-packets']) + + # Basic failures are fine, try to wobble some settings to catch extra failures + check_traffic_flowing() + g = tool("ethtool", "-g " + cfg.ifname, json=True)[0] + if 'rx' in g and g["rx"] * 2 <= g["rx-max"]: + new_g = g['rx'] * 2 + elif 'rx' in g: + new_g = g['rx'] // 2 + else: + new_g = None + + if new_g: + set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0 + if set_g: + ksft_pr("ethtool -G change retval: success") + else: + ksft_pr("ethtool -G change retval: did not succeed", new_g) + else: + ksft_pr("ethtool -G change retval: did not try") + + time.sleep(0.1) + check_traffic_flowing() + finally: + _disable_pp_allocation_fail() + if traffic: + traffic.stop() + time.sleep(0.1) + if set_g: + cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}") + + +def main() -> None: + netdevnl = NetdevFamily() + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + + ksft_run([test_pp_alloc], args=(cfg, netdevnl, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/settings b/tools/testing/selftests/drivers/net/hw/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py new file mode 100644 index 000000000000..401e70f7f136 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +from .env import * +from .load import * +from .remote import Remote diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py new file mode 100644 index 000000000000..edcedd7bffab --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +from pathlib import Path +from lib.py import KsftSkipEx, KsftXfailEx +from lib.py import cmd, ip +from lib.py import NetNS, NetdevSimDev +from .remote import Remote + + +def _load_env_file(src_path): + env = os.environ.copy() + + src_dir = Path(src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return env + + with open((src_dir / "net.config").as_posix(), 'r') as fp: + for line in fp.readlines(): + full_file = line + # Strip comments + pos = line.find("#") + if pos >= 0: + line = line[:pos] + line = line.strip() + if not line: + continue + pair = line.split('=', maxsplit=1) + if len(pair) != 2: + raise Exception("Can't parse configuration line:", full_file) + env[pair[0]] = pair[1] + return env + + +class NetDrvEnv: + """ + Class for a single NIC / host env, with no remote end + """ + def __init__(self, src_path, **kwargs): + self._ns = None + + self.env = _load_env_file(src_path) + + if 'NETIF' in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + else: + self._ns = NetdevSimDev(**kwargs) + self.dev = self._ns.nsims[0].dev + self.ifindex = self.dev['ifindex'] + + def __enter__(self): + ip(f"link set dev {self.dev['ifname']} up") + + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + + +class NetDrvEpEnv: + """ + Class for an environment with a local device and "remote endpoint" + which can be used to send traffic in. + + For local testing it creates two network namespaces and a pair + of netdevsim devices. + """ + + # Network prefixes used for local tests + nsim_v4_pfx = "192.0.2." + nsim_v6_pfx = "2001:db8::" + + def __init__(self, src_path, nsim_test=None): + + self.env = _load_env_file(src_path) + + # Things we try to destroy + self.remote = None + # These are for local testing state + self._netns = None + self._ns = None + self._ns_peer = None + + if "NETIF" in self.env: + if nsim_test is True: + raise KsftXfailEx("Test only works on netdevsim") + self._check_env() + + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + + self.v4 = self.env.get("LOCAL_V4") + self.v6 = self.env.get("LOCAL_V6") + self.remote_v4 = self.env.get("REMOTE_V4") + self.remote_v6 = self.env.get("REMOTE_V6") + kind = self.env["REMOTE_TYPE"] + args = self.env["REMOTE_ARGS"] + else: + if nsim_test is False: + raise KsftXfailEx("Test does not work on netdevsim") + + self.create_local() + + self.dev = self._ns.nsims[0].dev + + self.v4 = self.nsim_v4_pfx + "1" + self.v6 = self.nsim_v6_pfx + "1" + self.remote_v4 = self.nsim_v4_pfx + "2" + self.remote_v6 = self.nsim_v6_pfx + "2" + kind = "netns" + args = self._netns.name + + self.remote = Remote(kind, args, src_path) + + self.addr = self.v6 if self.v6 else self.v4 + self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + + self.addr_ipver = "6" if self.v6 else "4" + # Bracketed addresses, some commands need IPv6 to be inside [] + self.baddr = f"[{self.v6}]" if self.v6 else self.v4 + self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4 + + self.ifname = self.dev['ifname'] + self.ifindex = self.dev['ifindex'] + + self._required_cmd = {} + + def create_local(self): + self._netns = NetNS() + self._ns = NetdevSimDev() + self._ns_peer = NetdevSimDev(ns=self._netns) + + with open("/proc/self/ns/net") as nsfd0, \ + open("/var/run/netns/" + self._netns.name) as nsfd1: + ifi0 = self._ns.nsims[0].ifindex + ifi1 = self._ns_peer.nsims[0].ifindex + NetdevSimDev.ctrl_write('link_device', + f'{nsfd0.fileno()}:{ifi0} {nsfd1.fileno()}:{ifi1}') + + ip(f" addr add dev {self._ns.nsims[0].ifname} {self.nsim_v4_pfx}1/24") + ip(f"-6 addr add dev {self._ns.nsims[0].ifname} {self.nsim_v6_pfx}1/64 nodad") + ip(f" link set dev {self._ns.nsims[0].ifname} up") + + ip(f" addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v4_pfx}2/24", ns=self._netns) + ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns) + ip(f" link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns) + + def _check_env(self): + vars_needed = [ + ["LOCAL_V4", "LOCAL_V6"], + ["REMOTE_V4", "REMOTE_V6"], + ["REMOTE_TYPE"], + ["REMOTE_ARGS"] + ] + missing = [] + + for choice in vars_needed: + for entry in choice: + if entry in self.env: + break + else: + missing.append(choice) + # Make sure v4 / v6 configs are symmetric + if ("LOCAL_V6" in self.env) != ("REMOTE_V6" in self.env): + missing.append(["LOCAL_V6", "REMOTE_V6"]) + if ("LOCAL_V4" in self.env) != ("REMOTE_V4" in self.env): + missing.append(["LOCAL_V4", "REMOTE_V4"]) + if missing: + raise Exception("Invalid environment, missing configuration:", missing, + "Please see tools/testing/selftests/drivers/net/README.rst") + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + if self._ns_peer: + self._ns_peer.remove() + self._ns_peer = None + if self._netns: + del self._netns + self._netns = None + if self.remote: + del self.remote + self.remote = None + + def require_v4(self): + if not self.v4 or not self.remote_v4: + raise KsftSkipEx("Test requires IPv4 connectivity") + + def require_v6(self): + if not self.v6 or not self.remote_v6: + raise KsftSkipEx("Test requires IPv6 connectivity") + + def _require_cmd(self, comm, key, host=None): + cached = self._required_cmd.get(comm, {}) + if cached.get(key) is None: + cached[key] = cmd("command -v -- " + comm, fail=False, + shell=True, host=host).ret == 0 + self._required_cmd[comm] = cached + return cached[key] + + def require_cmd(self, comm, local=True, remote=False): + if local: + if not self._require_cmd(comm, "local"): + raise KsftSkipEx("Test requires command: " + comm) + if remote: + if not self._require_cmd(comm, "remote"): + raise KsftSkipEx("Test requires (remote) command: " + comm) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py new file mode 100644 index 000000000000..abdb677bdb1c --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 + +import time + +from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen + +class GenerateTraffic: + def __init__(self, env): + env.require_cmd("iperf3", remote=True) + + self.env = env + + port = rand_port() + self._iperf_server = cmd(f"iperf3 -s -p {port}", background=True) + wait_port_listen(port) + time.sleep(0.1) + self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400", + background=True, host=env.remote) + + # Wait for traffic to ramp up + pkt = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + for _ in range(50): + time.sleep(0.1) + now = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + if now - pkt > 1000: + return + pkt = now + self.stop(verbose=True) + raise Exception("iperf3 traffic did not ramp up") + + def stop(self, verbose=None): + self._iperf_client.process(terminate=True) + if verbose: + ksft_pr(">> Client:") + ksft_pr(self._iperf_client.stdout) + ksft_pr(self._iperf_client.stderr) + self._iperf_server.process(terminate=True) + if verbose: + ksft_pr(">> Server:") + ksft_pr(self._iperf_server.stdout) + ksft_pr(self._iperf_server.stderr) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote.py b/tools/testing/selftests/drivers/net/lib/py/remote.py new file mode 100644 index 000000000000..b1780b987722 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import importlib + +_modules = {} + +def Remote(kind, args, src_path): + global _modules + + if kind not in _modules: + _modules[kind] = importlib.import_module("..remote_" + kind, __name__) + + dir_path = os.path.abspath(src_path + "/../") + return getattr(_modules[kind], "Remote")(args, dir_path) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_netns.py b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py new file mode 100644 index 000000000000..7d5eeb0271bc --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import subprocess + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + + def cmd(self, comm): + return subprocess.Popen(["ip", "netns", "exec", self.name, "bash", "-c", comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def deploy(self, what): + if os.path.isabs(what): + return what + return os.path.abspath(self.dir_path + "/" + what) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py new file mode 100644 index 000000000000..924addde19a3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import string +import subprocess +import random + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + self._tmpdir = None + + def __del__(self): + if self._tmpdir: + cmd("rm -rf " + self._tmpdir, host=self) + self._tmpdir = None + + def cmd(self, comm): + return subprocess.Popen(["ssh", "-q", self.name, comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def _mktmp(self): + return ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + + def deploy(self, what): + if not self._tmpdir: + self._tmpdir = "/tmp/" + self._mktmp() + cmd("mkdir " + self._tmpdir, host=self) + file_name = self._tmpdir + "/" + self._mktmp() + os.path.basename(what) + + if not os.path.isabs(what): + what = os.path.abspath(self.dir_path + "/" + what) + + cmd(f"scp {what} {self.name}:{file_name}") + return file_name diff --git a/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh new file mode 100755 index 000000000000..82be5d013330 --- /dev/null +++ b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh @@ -0,0 +1,668 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +# The script is adopted to work with the Microchip KSZ switch driver. + +ETH_FCS_LEN=4 + +WAIT_TIME=1 +NUM_NETIFS=4 +REQUIRE_JQ="yes" +REQUIRE_MZ="yes" +STABLE_MAC_ADDRS=yes +NETIF_CREATE=no +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +require_command dcb + +h1=${NETIFS[p1]} +swp1=${NETIFS[p2]} +swp2=${NETIFS[p3]} +h2=${NETIFS[p4]} + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +# On h1_ and h2_create do not set IP addresses to avoid interaction with the +# system, to keep packet counters clean. +h1_create() +{ + simple_if_init $h1 + sysctl_set net.ipv6.conf.${h1}.disable_ipv6 1 + # Get the MAC address of the interface to use it with mausezahn + h1_mac=$(ip -j link show dev ${h1} | jq -e '.[].address') +} + +h1_destroy() +{ + sysctl_restore net.ipv6.conf.${h1}.disable_ipv6 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + sysctl_set net.ipv6.conf.${h2}.disable_ipv6 1 + h2_mac=$(ip -j link show dev ${h2} | jq -e '.[].address') +} + +h2_destroy() +{ + sysctl_restore net.ipv6.conf.${h2}.disable_ipv6 + simple_if_fini $h2 +} + +switch_create() +{ + ip link set ${swp1} up + ip link set ${swp2} up + sysctl_set net.ipv6.conf.${swp1}.disable_ipv6 1 + sysctl_set net.ipv6.conf.${swp2}.disable_ipv6 1 + + # Ports should trust VLAN PCP even with vlan_filtering=0 + ip link add br0 type bridge + ip link set ${swp1} master br0 + ip link set ${swp2} master br0 + ip link set br0 up + sysctl_set net.ipv6.conf.br0.disable_ipv6 1 +} + +switch_destroy() +{ + sysctl_restore net.ipv6.conf.${swp2}.disable_ipv6 + sysctl_restore net.ipv6.conf.${swp1}.disable_ipv6 + + ip link del br0 +} + +setup_prepare() +{ + vrf_prepare + + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + switch_destroy + + vrf_cleanup +} + +set_apptrust_order() +{ + local if_name=$1 + local order=$2 + + dcb apptrust set dev ${if_name} order ${order} +} + +# Function to extract a specified field from a given JSON stats string +extract_network_stat() { + local stats_json=$1 + local field_name=$2 + + echo $(echo "$stats_json" | jq -r "$field_name") +} + +run_test() +{ + local test_name=$1; + local apptrust_order=$2; + local port_prio=$3; + local dscp_ipv=$4; + local dscp=$5; + local have_vlan=$6; + local pcp_ipv=$7; + local vlan_pcp=$8; + local ip_v6=$9 + + local rx_ipv + local tx_ipv + + RET=0 + + # Send some packet to populate the switch MAC table + $MZ ${h2} -a ${h2_mac} -b ${h1_mac} -p 64 -t icmp echores -c 1 + + # Based on the apptrust order, set the expected Internal Priority values + # for the RX and TX paths. + if [ "${apptrust_order}" == "" ]; then + echo "Apptrust order not set." + rx_ipv=${port_prio} + tx_ipv=${port_prio} + elif [ "${apptrust_order}" == "dscp" ]; then + echo "Apptrust order is DSCP." + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + elif [ "${apptrust_order}" == "pcp" ]; then + echo "Apptrust order is PCP." + rx_ipv=${pcp_ipv} + tx_ipv=${pcp_ipv} + elif [ "${apptrust_order}" == "pcp dscp" ]; then + echo "Apptrust order is PCP DSCP." + if [ ${have_vlan} -eq 1 ]; then + rx_ipv=$((dscp_ipv > pcp_ipv ? dscp_ipv : pcp_ipv)) + tx_ipv=${pcp_ipv} + else + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + fi + else + RET=1 + echo "Error: Unknown apptrust order ${apptrust_order}" + log_test "${test_name}" + return + fi + + # Most/all? of the KSZ switches do not provide per-TC counters. There + # are only tx_hi and rx_hi counters, which are used to count packets + # which are considered as high priority and most likely not assigned + # to the queue 0. + # On the ingress path, packets seem to get high priority status + # independently of the DSCP or PCP global mapping. On the egress path, + # the high priority status is assigned based on the DSCP or PCP global + # map configuration. + # The thresholds for the high priority status are not documented, but + # it seems that the switch considers packets as high priority on the + # ingress path if detected Internal Priority is greater than 0. On the + # egress path, the switch considers packets as high priority if + # detected Internal Priority is greater than 1. + if [ ${rx_ipv} -ge 1 ]; then + local expect_rx_high_prio=1 + else + local expect_rx_high_prio=0 + fi + + if [ ${tx_ipv} -ge 2 ]; then + local expect_tx_high_prio=1 + else + local expect_tx_high_prio=0 + fi + + # Use ip tool to get the current switch packet counters. ethool stats + # need to be recalculated to get the correct values. + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + local swp1_rx_packets_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + local swp1_rx_hi_before=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_before=$(ethtool_stats_get ${swp2} "tx_hi") + + # Assamble the mausezahn command based on the test parameters + # For the testis with ipv4 or ipv6, use icmp response packets, + # to avoid interaction with the system, to keep packet counters + # clean. + if [ ${ip_v6} -eq 0 ]; then + local ip="-a ${h1_mac} -b ${h2_mac} -A ${H1_IPV4} \ + -B ${H2_IPV4} -t icmp unreach,code=1,dscp=${dscp}" + else + local ip="-6 -a ${h1_mac} -b ${h2_mac} -A ${H1_IPV6} \ + -B ${H2_IPV6} -t icmp6 type=1,code=0,dscp=${dscp}" + fi + + if [ ${have_vlan} -eq 1 ]; then + local vlan_pcp_opt="-Q ${vlan_pcp}:0" + else + local vlan_pcp_opt="" + fi + $MZ ${h1} ${ip} -c ${PING_COUNT} -d 10msec ${vlan_pcp_opt} + + # Wait until the switch packet counters are updated + sleep 6 + + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + + local swp1_rx_packets_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + + local swp1_rx_packets_diff=$((${swp1_rx_packets_after} - \ + ${swp1_rx_packets_before})) + local swp2_tx_packets_diff=$((${swp2_tx_packets_after} - \ + ${swp2_tx_packets_before})) + + local swp1_rx_hi_after=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_after=$(ethtool_stats_get ${swp2} "tx_hi") + + # Test if any packets were received on swp1, we will rx before and after + if [ ${swp1_rx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of received packets on ${swp1}" + echo "before ${swp1_rx_packets_before} after ${swp1_rx_packets_after}" + RET=1 + fi + + # Test if any packets were transmitted on swp2, we will tx before and after + if [ ${swp2_tx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of transmitted packets on ${swp2}" + echo "before ${swp2_tx_packets_before} after ${swp2_tx_packets_after}" + RET=1 + fi + + # tx/rx_hi counted in bytes. So, we need to compare the difference in bytes + local swp1_rx_bytes_diff=$(($swp1_rx_bytes_after - $swp1_rx_bytes_before)) + local swp2_tx_bytes_diff=$(($swp2_tx_bytes_after - $swp2_tx_bytes_before)) + local swp1_rx_hi_diff=$(($swp1_rx_hi_after - $swp1_rx_hi_before)) + local swp2_tx_hi_diff=$(($swp2_tx_hi_after - $swp2_tx_hi_before)) + + if [ ${expect_rx_high_prio} -eq 1 ]; then + swp1_rx_hi_diff=$((${swp1_rx_hi_diff} - \ + ${swp1_rx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp1_rx_hi_diff} -ne ${swp1_rx_bytes_diff} ]; then + echo "Not expected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected RX bytes diff: ${swp1_rx_bytes_diff}" + RET=1 + fi + else + if [ ${swp1_rx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected 0" + RET=1 + fi + fi + + if [ ${expect_tx_high_prio} -eq 1 ]; then + swp2_tx_hi_diff=$((${swp2_tx_hi_diff} - \ + ${swp2_tx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp2_tx_hi_diff} -ne ${swp2_tx_bytes_diff} ]; then + echo "Not expected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected TX bytes diff: ${swp2_tx_bytes_diff}" + RET=1 + fi + else + if [ ${swp2_tx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected 0" + RET=1 + fi + fi + + log_test "${test_name}" +} + +run_test_dscp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 1 +} + +run_test_dscp_pcp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 1 +} + +port_default_prio_get() +{ + local if_name=$1 + local prio + + prio="$(dcb -j app show dev ${if_name} default-prio | \ + jq '.default_prio[]')" + if [ -z "${prio}" ]; then + prio=0 + fi + + echo ${prio} +} + +test_port_default() +{ + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="" + + RET=0 + + # Make sure no other priority sources will interfere with the test + set_apptrust_order ${swp1} "${apptrust_order}" + + for val in $(seq 0 7); do + dcb app replace dev ${swp1} default-prio ${val} + if [ $val -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + break + fi + + run_test_dscp "Port-default QoS classification, prio: ${val}" \ + "${apptrust_order}" ${val} 0 0 + done + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_prio} + if [ $orig_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Port-default QoS classification" +} + +port_get_default_apptrust() +{ + local if_name=$1 + + dcb -j apptrust show dev ${if_name} | jq -r '.order[]' | \ + tr '\n' ' ' | xargs +} + +test_port_apptrust() +{ + local original_dscp_prios_swp1=$(get_dscp_prios ${swp1}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local order_variants=("pcp dscp" "dscp" "pcp") + local apptrust_order + local port_prio + local dscp_prio + local pcp_prio + local dscp + local pcp + + RET=0 + + # First, test if apptrust configuration as taken by the kernel + for order in "${order_variants[@]}"; do + set_apptrust_order ${swp1} "${order}" + if [[ "$order" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + break + fi + done + + log_test "Apptrust, supported variants" + + # To test if the apptrust configuration is working as expected, we need + # to set DSCP priorities for the switch port. + init_dscp_prios "${swp1}" "${original_dscp_prios_swp1}" + + # Start with a simple test where all apptrust sources are disabled + # default port priority is 0, DSCP priority is mapped to 7. + # No high priority packets should be received or transmitted. + port_prio=0 + dscp_prio=7 + dscp=4 + + dcb app replace dev ${swp1} default-prio ${port_prio} + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + + apptrust_order="" + set_apptrust_order ${swp1} "${apptrust_order}" + # Test with apptrust sources disabled, Packets should get port default + # priority which is 0 + run_test_dscp "Apptrust, all disabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP is enabled, packets should get PCP priority, which is not + # set in this test (no VLAN tags are present in the packet). No high + # priority packets should be received or transmitted. + run_test_dscp "Apptrust, PCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If DSCP is enabled, packets should get DSCP priority which is set to 7 + # in this test. High priority packets should be received and transmitted. + run_test_dscp "Apptrust, DSCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP and DSCP are enabled, PCP would have higher apptrust priority + # so packets should get PCP priority. But in this test VLAN PCP is not + # set, so it should get DSCP priority which is set to 7. High priority + # packets should be received and transmitted. + run_test_dscp "Apptrust, PCP and DSCP are enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + # If VLAN PCP is set, it should have higher apptrust priority than DSCP + # so packets should get VLAN PCP priority. Send packets with VLAN PCP + # set to 0, DSCP set to 7. Packets should get VLAN PCP priority. + # No high priority packets should be transmitted. Due to nature of the + # switch, high priority packets will be received. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # If VLAN PCP is set to 7, it should have higher apptrust priority than + # DSCP so packets should get VLAN PCP priority. Send packets with VLAN + # PCP set to 7, DSCP set to 7. Packets should get VLAN PCP priority. + # High priority packets should be received and transmitted. + pcp_prio=7 + pcp=7 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # Now make sure that the switch is able to handle the case where DSCP + # priority is set to 0 and PCP priority is set to 7. Packets should get + # PCP priority. High priority packets should be received and transmitted. + dscp_prio=0 + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # If both VLAN PCP and DSCP are set to 0, packets should get 0 priority. + # No high priority packets should be received or transmitted. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # Restore original priorities + if ! restore_priorities "${swp1}" "${original_dscp_prios_swp1}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Apptrust, restore original settings" +} + +# Function to get current DSCP priorities +get_dscp_prios() { + local if_name=$1 + dcb -j app show dev ${if_name} | jq -c '.dscp_prio' +} + +# Function to set a specific DSCP priority on a device +replace_dscp_prio() { + local if_name=$1 + local dscp=$2 + local prio=$3 + dcb app replace dev ${if_name} dscp-prio ${dscp}:${prio} +} + +# Function to compare DSCP maps +compare_dscp_maps() { + local old_json=$1 + local new_json=$2 + local dscp=$3 + local prio=$4 + + # Create a modified old_json with the expected change for comparison + local modified_old_json=$(echo "$old_json" | + jq --argjson dscp $dscp --argjson prio $prio \ + 'map(if .[0] == $dscp then [$dscp, $prio] else . end)' | + tr -d " \n") + + # Compare new_json with the modified_old_json + if [[ "$modified_old_json" == "$new_json" ]]; then + return 0 + else + return 1 + fi +} + +# Function to set DSCP priorities +set_and_verify_dscp() { + local port=$1 + local dscp=$2 + local new_prio=$3 + + local old_prios=$(get_dscp_prios $port) + + replace_dscp_prio "$port" $dscp $new_prio + + # Fetch current settings and compare + local current_prios=$(get_dscp_prios $port) + if ! compare_dscp_maps "$old_prios" "$current_prios" $dscp $new_prio; then + echo "Error: Unintended changes detected in DSCP map for $port after setting DSCP $dscp to $new_prio." + return 1 + fi + return 0 +} + +# Function to restore original priorities +restore_priorities() { + local port=$1 + local original_prios=$2 + + echo "Removing test artifacts for $port" + local current_prios=$(get_dscp_prios $port) + local prio_str=$(echo "$current_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + echo "Restoring original DSCP priorities for $port" + local restore_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app add dev $port dscp-prio $restore_str + + local current_prios=$(get_dscp_prios $port) + if [[ "$original_prios" != "$current_prios" ]]; then + echo "Error: Failed to restore original DSCP priorities for $port" + return 1 + fi + return 0 +} + +# Initialize DSCP priorities. Set them to predictable values for testing. +init_dscp_prios() { + local port=$1 + local original_prios=$2 + + echo "Removing any existing DSCP priority mappins for $port" + local prio_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + # Initialize DSCP priorities list + local dscp_prios="" + for dscp in {0..63}; do + dscp_prios+=("$dscp:0") + done + + echo "Setting initial DSCP priorities map to 0 for $port" + dcb app add dev $port dscp-prio ${dscp_prios[@]} +} + +# Main function to test global DSCP map across specified ports +test_global_dscp_map() { + local ports=("$swp1" "$swp2") + local original_dscp_prios_port0=$(get_dscp_prios ${ports[0]}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="dscp" + local port_prio=0 + local dscp_prio + local dscp + + RET=0 + + set_apptrust_order ${swp1} "${apptrust_order}" + dcb app replace dev ${swp1} default-prio ${port_prio} + + # Initialize DSCP priorities + init_dscp_prios "${ports[0]}" "$original_dscp_prios_port0" + + # Loop over each DSCP index + for dscp in {0..63}; do + # and test each Internal Priority value + for dscp_prio in {0..7}; do + # do it for each port. This is to test if the global DSCP map + # is accessible from all ports. + for port in "${ports[@]}"; do + if ! set_and_verify_dscp "$port" $dscp $dscp_prio; then + RET=1 + fi + done + + # Test if the DSCP priority is correctly applied to the packets + run_test_dscp "DSCP (${dscp}) QoS classification, prio: ${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + if [ ${RET} -eq 1 ]; then + break + fi + done + done + + # Restore original priorities + if ! restore_priorities "${ports[0]}" "${original_dscp_prios_port0}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "DSCP global map" +} + +trap cleanup EXIT + +ALL_TESTS=" + test_port_default + test_port_apptrust + test_global_dscp_map +" + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 91891b9418d7..877cd6df94a1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -24,8 +24,8 @@ setup_prepare() busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 check_err $? "ports did not come up" - local lanes_exist=$(ethtool $swp1 | grep 'Lanes:') - if [[ -z $lanes_exist ]]; then + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + if [[ $? -ne 0 ]]; then log_test "SKIP: driver does not support lanes setting" exit 1 fi @@ -122,8 +122,9 @@ autoneg() ethtool_set $swp1 speed $max_speed lanes $lanes ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "$lanes lanes is autonegotiated" @@ -160,8 +161,9 @@ autoneg_force_mode() ethtool_set $swp2 speed $max_speed lanes $lanes autoneg off ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "Autoneg off, $lanes lanes detected during force mode" diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh index 6369927e9c37..48395cfd4f95 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh @@ -42,7 +42,7 @@ __mlxsw_only_on_spectrum() local src=$1; shift if ! mlxsw_on_spectrum "$rev"; then - log_test_skip $src:$caller "(Spectrum-$rev only)" + log_test_xfail $src:$caller "(Spectrum-$rev only)" return 1 fi } diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index a88d8a8c85f2..899b6892603f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -47,7 +47,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' should_fail=$should_fail test" continue fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index f981c957f097..482ebb744eba 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -52,7 +52,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' [$profile] should_fail=$should_fail test" continue fi ${current_test}_setup_prepare diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py new file mode 100755 index 000000000000..eb83e7b48797 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ping.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen, rand_port + + +def test_v4(cfg) -> None: + cfg.require_v4() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") + cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) + + +def test_v6(cfg) -> None: + cfg.require_v6() + + cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}") + cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) + + +def test_tcp(cfg) -> None: + cfg.require_cmd("socat", remote=True) + + port = rand_port() + listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT" + + with bkg(listen_cmd, exit_wait=True) as nc: + wait_port_listen(port) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.baddr}:{port}", + shell=True, host=cfg.remote) + ksft_eq(nc.stdout.strip(), "ping") + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc: + wait_port_listen(port, host=cfg.remote) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.remote_baddr}:{port}", shell=True) + ksft_eq(nc.stdout.strip(), "ping") + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py new file mode 100755 index 000000000000..30f29096e27c --- /dev/null +++ b/tools/testing/selftests/drivers/net/queues.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import NetDrvEnv +from lib.py import cmd +import glob + + +def sys_get_queues(ifname) -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') + return len(folders) + + +def nl_get_queues(cfg, nl): + queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) + if queues: + return len([q for q in queues if q['type'] == 'rx']) + return None + + +def get_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname']) + ksft_eq(queues, expected) + + +def addremove_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + curr_queues = sys_get_queues(cfg.dev['ifname']) + if curr_queues == 1: + raise KsftSkipEx('cannot decrement queue: already at 1') + + netnl = EthtoolFamily() + channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + + expected = curr_queues - 1 + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + expected = curr_queues + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + +def main() -> None: + with NetDrvEnv(__file__, queue_count=3) as cfg: + ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py new file mode 100755 index 000000000000..7a7b16b180e2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/stats.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError +from lib.py import NetDrvEnv + +ethnl = EthtoolFamily() +netfam = NetdevFamily() +rtnl = RtnlFamily() + + +def check_pause(cfg) -> None: + global ethnl + + try: + ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("pause not supported by the device") + raise + + data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def check_fec(cfg) -> None: + global ethnl + + try: + ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("FEC not supported by the device") + raise + + data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def pkt_byte_sum(cfg) -> None: + global netfam, rtnl + + def get_qstat(test): + global netfam + stats = netfam.qstats_get({}, dump=True) + if stats: + for qs in stats: + if qs["ifindex"]== test.ifindex: + return qs + + qstat = get_qstat(cfg) + if qstat is None: + raise KsftSkipEx("qstats not supported by the device") + + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + ksft_in(key, qstat, "Drivers should always report basic keys") + + # Compare stats, rtnl stats and qstats must match, + # but the interface may be up, so do a series of dumps + # each time the more "recent" stats must be higher or same. + def stat_cmp(rstat, qstat): + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + if rstat[key] != qstat[key]: + return rstat[key] - qstat[key] + return 0 + + for _ in range(10): + rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats'] + if stat_cmp(rtstat, qstat) < 0: + raise Exception("RTNL stats are lower, fetched later") + qstat = get_qstat(cfg) + if stat_cmp(rtstat, qstat) > 0: + raise Exception("Qstats are lower, fetched later") + + +def qstat_by_ifindex(cfg) -> None: + global netfam + global rtnl + + # Construct a map ifindex -> [dump, by-index, dump] + ifindexes = {} + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']] = [entry, None, None] + + for ifindex in ifindexes.keys(): + entry = netfam.qstats_get({"ifindex": ifindex}, dump=True) + ksft_eq(len(entry), 1) + ifindexes[entry[0]['ifindex']][1] = entry[0] + + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']][2] = entry + + if len(ifindexes) == 0: + raise KsftSkipEx("No ifindex supports qstats") + + # Now make sure the stats match/make sense + for ifindex, triple in ifindexes.items(): + all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys() + + for key in all_keys: + ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) + ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + + # Test invalid dumps + # 0 is invalid + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 0}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -34) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # loopback has no stats + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 1}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # Try to get stats for lowest unused ifindex but not 0 + devs = rtnl.getlink({}, dump=True) + all_ifindexes = set([dev["ifi-index"] for dev in devs]) + lowest = 2 + while lowest in all_ifindexes: + lowest += 1 + + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": lowest}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -19) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + +def main() -> None: + with NetDrvEnv(__file__) as cfg: + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/virtio_net/Makefile b/tools/testing/selftests/drivers/net/virtio_net/Makefile new file mode 100644 index 000000000000..7ec7cd3ab2cc --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = basic_features.sh \ + # + +TEST_FILES = \ + virtio_net_common.sh \ + # + +TEST_INCLUDES = \ + ../../../net/forwarding/lib.sh \ + ../../../net/lib.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh new file mode 100755 index 000000000000..cf8cf816ed48 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# See virtio_net_common.sh comments for more details about assumed setup + +ALL_TESTS=" + initial_ping_test + f_mac_test +" + +source virtio_net_common.sh + +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1=${NETIFS[p1]} +h2=${NETIFS[p2]} + +h1_create() +{ + simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_destroy() +{ + simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h2_create() +{ + simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_destroy() +{ + simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +initial_ping_test() +{ + setup_cleanup + setup_prepare + ping_test $h1 $H2_IPV4 " simple" +} + +f_mac_test() +{ + RET=0 + local test_name="mac feature filtered" + + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h1 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h2 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + + setup_cleanup + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_err $? "Permanent address assign type for $h1 is not set" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_err $? "Permanent address assign type for $h2 is not set" + + setup_cleanup + virtio_filter_feature_add $h1 $VIRTIO_NET_F_MAC + virtio_filter_feature_add $h2 $VIRTIO_NET_F_MAC + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_fail $? "Permanent address assign type for $h1 is set when F_MAC feature is filtered" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_fail $? "Permanent address assign type for $h2 is set when F_MAC feature is filtered" + + ping_do $h1 $H2_IPV4 + check_err $? "Ping failed" + + log_test "$test_name" +} + +setup_prepare() +{ + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 + + vrf_prepare + + h1_create + h2_create +} + +setup_cleanup() +{ + h2_destroy + h1_destroy + + vrf_cleanup + + virtio_filter_features_clear $h1 + virtio_filter_features_clear $h2 + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 +} + +cleanup() +{ + pre_cleanup + setup_cleanup +} + +check_driver $h1 "virtio_net" +check_driver $h2 "virtio_net" +check_virtio_debugfs $h1 +check_virtio_debugfs $h2 + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config new file mode 100644 index 000000000000..f35de0542b60 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/config @@ -0,0 +1,2 @@ +CONFIG_VIRTIO_NET=y +CONFIG_VIRTIO_DEBUG=y diff --git a/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh new file mode 100644 index 000000000000..57bd8055e2e5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This assumes running on a host with two virtio interfaces connected +# back to back. Example script to do such wire-up of tap devices would +# look like this: +# +# ======================================================================================================= +# #!/bin/bash +# +# DEV1="$1" +# DEV2="$2" +# +# sudo tc qdisc add dev $DEV1 clsact +# sudo tc qdisc add dev $DEV2 clsact +# sudo tc filter add dev $DEV1 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV2 +# sudo tc filter add dev $DEV2 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV1 +# sudo ip link set $DEV1 up +# sudo ip link set $DEV2 up +# ======================================================================================================= + +REQUIRE_MZ="no" +NETIF_CREATE="no" +NETIF_FIND_DRIVER="virtio_net" +NUM_NETIFS=2 + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +VIRTIO_NET_F_MAC=5 + +virtio_device_get() +{ + local dev=$1; shift + local device_path="/sys/class/net/$dev/device/" + + basename `realpath $device_path` +} + +virtio_device_rebind() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo "$device" > /sys/bus/virtio/drivers/virtio_net/unbind + echo "$device" > /sys/bus/virtio/drivers/virtio_net/bind +} + +virtio_debugfs_get() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo /sys/kernel/debug/virtio/$device/ +} + +check_virtio_debugfs() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + if [ ! -f "$debugfs/device_features" ] || + [ ! -f "$debugfs/filter_feature_add" ] || + [ ! -f "$debugfs/filter_feature_del" ] || + [ ! -f "$debugfs/filter_features" ] || + [ ! -f "$debugfs/filter_features_clear" ]; then + echo "SKIP: not possible to access debugfs for $dev" + exit $ksft_skip + fi +} + +virtio_feature_present() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + cat $debugfs/device_features |grep "^$feature$" &> /dev/null + return $? +} + +virtio_filter_features_clear() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "1" > $debugfs/filter_features_clear +} + +virtio_filter_feature_add() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "$feature" > $debugfs/filter_feature_add +} diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c index b2f37d86a5f6..438c8ff2fd26 100644 --- a/tools/testing/selftests/exec/recursion-depth.c +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -37,25 +37,25 @@ int main(void) ksft_test_result_skip("error: unshare, errno %d\n", errno); ksft_finished(); } - ksft_exit_fail_msg("error: unshare, errno %d\n", errno); + ksft_exit_fail_perror("error: unshare"); } if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) == -1) - ksft_exit_fail_msg("error: mount '/', errno %d\n", errno); + ksft_exit_fail_perror("error: mount '/'"); /* Require "exec" filesystem. */ if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) - ksft_exit_fail_msg("error: mount ramfs, errno %d\n", errno); + ksft_exit_fail_perror("error: mount ramfs"); #define FILENAME "/tmp/1" fd = creat(FILENAME, 0700); if (fd == -1) - ksft_exit_fail_msg("error: creat, errno %d\n", errno); + ksft_exit_fail_perror("error: creat"); #define S "#!" FILENAME "\n" if (write(fd, S, strlen(S)) != strlen(S)) - ksft_exit_fail_msg("error: write, errno %d\n", errno); + ksft_exit_fail_perror("error: write"); close(fd); diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile index c2f7cef919c0..eb4c3b411934 100644 --- a/tools/testing/selftests/filesystems/binderfs/Makefile +++ b/tools/testing/selftests/filesystems/binderfs/Makefile @@ -3,6 +3,4 @@ CFLAGS += $(KHDR_INCLUDES) -pthread TEST_GEN_PROGS := binderfs_test -binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h - include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 3eafd7da58e2..e6d7c4f1c85b 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include <assert.h> +#include <stddef.h> #include <stdint.h> #include <sched.h> #include <fcntl.h> diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 25d4e0fca385..cce72f8b03dc 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -255,7 +255,13 @@ prlog() { # messages [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE } catlog() { #file - cat $1 + if [ "${KTAP}" = "1" ]; then + cat $1 | while read line ; do + echo "# $line" + done + else + cat $1 + fi [ "$LOG_FILE" ] && cat $1 | strip_esc >> $LOG_FILE } prlog "=== Ftrace unit tests ===" diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap index b3284679ef3a..14e62ef3f3b9 100755 --- a/tools/testing/selftests/ftrace/ftracetest-ktap +++ b/tools/testing/selftests/ftrace/ftracetest-ktap @@ -5,4 +5,4 @@ # # Copyright (C) Arm Ltd., 2023 -./ftracetest -K +./ftracetest -K -v diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc index b9c21a81d248..c0cdad4c400e 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc @@ -53,7 +53,7 @@ fi echo > dynamic_events -if [ "$FIELDS" ] ; then +if [ "$FIELDS" -a "$FPROBES" ] ; then echo "t:tpevent ${TP2} obj_size=s->object_size" >> dynamic_events echo "f:fpevent ${TP3}%return path=\$retval->name:string" >> dynamic_events echo "t:tpevent2 ${TP4} p->se.group_node.next->prev" >> dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 000000000000..c6a9d2466a71 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,41 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: dynamic_events "%pd/%pD":README "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README + + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc index d183b8a8ecf8..1e251ce2998e 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc @@ -11,7 +11,7 @@ echo 1 > events/tests/enable echo > trace cat trace > /dev/null -function streq() { +streq() { test $1 = $2 } diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 25432b8cd5bd..073a748b9380 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="scheduler_tick" +FUNC2="sched_tick" ALL_FUNCS="#### all functions enabled ####" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 000000000000..21a54be6894c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc index 53b82f36a1d0..e50470b53164 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc @@ -11,7 +11,7 @@ echo 1 > events/kprobes/enable echo > trace cat trace > /dev/null -function streq() { +streq() { test $1 = $2 } diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common index 0f456dbab62f..45b5570441ce 100644 --- a/tools/testing/selftests/hid/config.common +++ b/tools/testing/selftests/hid/config.common @@ -238,3 +238,4 @@ CONFIG_VLAN_8021Q=y CONFIG_XFRM_SUB_POLICY=y CONFIG_XFRM_USER=y CONFIG_ZEROPLUS_FF=y +CONFIG_KASAN=y diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 2cf96f818f25..f825623e3edc 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -16,6 +16,11 @@ #define SHOW_UHID_DEBUG 0 +#define min(a, b) \ + ({ __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; }) + static unsigned char rdesc[] = { 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ @@ -111,6 +116,10 @@ struct hid_hw_request_syscall_args { static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; +static unsigned char output_report[10]; + /* no need to protect uhid_stopped, only one thread accesses it */ static bool uhid_stopped; @@ -205,6 +214,13 @@ static int uhid_event(struct __test_metadata *_metadata, int fd) break; case UHID_OUTPUT: UHID_LOG("UHID_OUTPUT from uhid-dev"); + + pthread_mutex_lock(&uhid_output_mtx); + memcpy(output_report, + ev.u.output.data, + min(ev.u.output.size, sizeof(output_report))); + pthread_cond_signal(&uhid_output_cond); + pthread_mutex_unlock(&uhid_output_mtx); break; case UHID_GET_REPORT: UHID_LOG("UHID_GET_REPORT from uhid-dev"); @@ -734,8 +750,100 @@ TEST_F(hid_bpf, test_hid_change_report) } /* - * Attach hid_user_raw_request to the given uhid device, - * call the bpf program from userspace + * Call hid_bpf_input_report against the given uhid device, + * check that the program is called and does the expected. + */ +TEST_F(hid_bpf, test_hid_user_input_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + __u8 buf[10] = {0}; + int err, prog_fd; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_input_report); + + /* check that there is no data to read from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, -1) TH_LOG("read_hidraw"); + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + + ASSERT_EQ(args.retval, 0); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 2); + ASSERT_EQ(buf[2], 42); +} + +/* + * Call hid_bpf_hw_output_report against the given uhid device, + * check that the program is called and does the expected. + */ +TEST_F(hid_bpf, test_hid_user_output_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + int err, cond_err, prog_fd; + struct timespec time_to_wait; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_output_report); + + pthread_mutex_lock(&uhid_output_mtx); + + memset(output_report, 0, sizeof(output_report)); + clock_gettime(CLOCK_REALTIME, &time_to_wait); + time_to_wait.tv_sec += 2; + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + cond_err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + ASSERT_OK(cond_err) TH_LOG("error while calling waiting for the condition"); + + ASSERT_EQ(args.retval, 3); + + ASSERT_EQ(output_report[0], 1); + ASSERT_EQ(output_report[1], 2); + ASSERT_EQ(output_report[2], 42); + + pthread_mutex_unlock(&uhid_output_mtx); +} + +/* + * Call hid_hw_raw_request against the given uhid device, * check that the program is called and does the expected. */ TEST_F(hid_bpf, test_hid_user_raw_request_call) diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index 1e558826b809..f67d35def142 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -101,6 +101,52 @@ int hid_user_raw_request(struct hid_hw_request_syscall_args *args) return 0; } +SEC("syscall") +int hid_user_output_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_hw_output_report(ctx, + args->data, + size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + +SEC("syscall") +int hid_user_input_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_input_report(ctx, HID_INPUT_REPORT, args->data, size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + static const __u8 rdesc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x32, /* USAGE (Z) */ diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 65e657ac1198..9cd56821d0f1 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -94,5 +94,11 @@ extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, size_t buf__sz, enum hid_report_type type, enum hid_class_request reqtype) __ksym; +extern int hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, + __u8 *buf, size_t buf__sz) __ksym; +extern int hid_bpf_input_report(struct hid_bpf_ctx *ctx, + enum hid_report_type type, + __u8 *data, + size_t buf__sz) __ksym; #endif /* __HID_BPF_HELPERS_H */ diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py index 51433063b227..3a465768e507 100644 --- a/tools/testing/selftests/hid/tests/base.py +++ b/tools/testing/selftests/hid/tests/base.py @@ -8,11 +8,13 @@ import libevdev import os import pytest +import shutil +import subprocess import time import logging -from hidtools.device.base_device import BaseDevice, EvdevMatch, SysfsFile +from .base_device import BaseDevice, EvdevMatch, SysfsFile from pathlib import Path from typing import Final, List, Tuple @@ -157,6 +159,17 @@ class BaseTestCase: # for example ("playstation", "hid-playstation") kernel_modules: List[Tuple[str, str]] = [] + # List of in kernel HID-BPF object files to load + # before starting the test + # Any existing pre-loaded HID-BPF module will be removed + # before the ones in this list will be manually loaded. + # Each Element is a tuple '(hid_bpf_object, rdesc_fixup_present)', + # for example '("xppen-ArtistPro16Gen2.bpf.o", True)' + # If 'rdesc_fixup_present' is True, the test needs to wait + # for one unbind and rebind before it can be sure the kernel is + # ready + hid_bpfs: List[Tuple[str, bool]] = [] + def assertInputEventsIn(self, expected_events, effective_events): effective_events = effective_events.copy() for ev in expected_events: @@ -211,8 +224,6 @@ class BaseTestCase: # we don't know beforehand the name of the module from modinfo sysfs_path = Path("/sys/module") / kernel_module.replace("-", "_") if not sysfs_path.exists(): - import subprocess - ret = subprocess.run(["/usr/sbin/modprobe", kernel_module]) if ret.returncode != 0: pytest.skip( @@ -225,6 +236,64 @@ class BaseTestCase: self._load_kernel_module(kernel_driver, kernel_module) yield + def load_hid_bpfs(self): + script_dir = Path(os.path.dirname(os.path.realpath(__file__))) + root_dir = (script_dir / "../../../../..").resolve() + bpf_dir = root_dir / "drivers/hid/bpf/progs" + + udev_hid_bpf = shutil.which("udev-hid-bpf") + if not udev_hid_bpf: + pytest.skip("udev-hid-bpf not found in $PATH, skipping") + + wait = False + for _, rdesc_fixup in self.hid_bpfs: + if rdesc_fixup: + wait = True + + for hid_bpf, _ in self.hid_bpfs: + # We need to start `udev-hid-bpf` in the background + # and dispatch uhid events in case the kernel needs + # to fetch features on the device + process = subprocess.Popen( + [ + "udev-hid-bpf", + "--verbose", + "add", + str(self.uhdev.sys_path), + str(bpf_dir / hid_bpf), + ], + ) + while process.poll() is None: + self.uhdev.dispatch(1) + + if process.poll() != 0: + pytest.fail( + f"Couldn't insert hid-bpf program '{hid_bpf}', marking the test as failed" + ) + + if wait: + # the HID-BPF program exports a rdesc fixup, so it needs to be + # unbound by the kernel and then rebound. + # Ensure we get the bound event exactly 2 times (one for the normal + # uhid loading, and then the reload from HID-BPF) + now = time.time() + while self.uhdev.kernel_ready_count < 2 and time.time() - now < 2: + self.uhdev.dispatch(1) + + if self.uhdev.kernel_ready_count < 2: + pytest.fail( + f"Couldn't insert hid-bpf programs, marking the test as failed" + ) + + def unload_hid_bpfs(self): + ret = subprocess.run( + ["udev-hid-bpf", "--verbose", "remove", str(self.uhdev.sys_path)], + ) + if ret.returncode != 0: + pytest.fail( + f"Couldn't unload hid-bpf programs, marking the test as failed" + ) + @pytest.fixture() def new_uhdev(self, load_kernel_module): return self.create_device() @@ -248,12 +317,18 @@ class BaseTestCase: now = time.time() while not self.uhdev.is_ready() and time.time() - now < 5: self.uhdev.dispatch(1) + + if self.hid_bpfs: + self.load_hid_bpfs() + if self.uhdev.get_evdev() is None: logger.warning( f"available list of input nodes: (default application is '{self.uhdev.application}')" ) logger.warning(self.uhdev.input_nodes) yield + if self.hid_bpfs: + self.unload_hid_bpfs() self.uhdev = None except PermissionError: pytest.skip("Insufficient permissions, run me as root") @@ -313,8 +388,6 @@ class HIDTestUdevRule(object): self.reload_udev_rules() def reload_udev_rules(self): - import subprocess - subprocess.run("udevadm control --reload-rules".split()) subprocess.run("systemd-hwdb update".split()) @@ -330,10 +403,11 @@ class HIDTestUdevRule(object): delete=False, ) as f: f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1"\n' - ) - f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1"\n' + """ +KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1" +KERNELS=="*hid*", ENV{HID_NAME}=="*uhid test *", ENV{HID_BPF_IGNORE_DEVICE}="1" +KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1" +""" ) self.rulesfile = f diff --git a/tools/testing/selftests/hid/tests/base_device.py b/tools/testing/selftests/hid/tests/base_device.py new file mode 100644 index 000000000000..e0515be97f83 --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_device.py @@ -0,0 +1,421 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2017 Benjamin Tissoires <benjamin.tissoires@gmail.com> +# Copyright (c) 2017 Red Hat, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import fcntl +import functools +import libevdev +import os + +try: + import pyudev +except ImportError: + raise ImportError("UHID is not supported due to missing pyudev dependency") + +import logging + +import hidtools.hid as hid +from hidtools.uhid import UHIDDevice +from hidtools.util import BusType + +from pathlib import Path +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union + +logger = logging.getLogger("hidtools.device.base_device") + + +class SysfsFile(object): + def __init__(self, path): + self.path = path + + def __set_value(self, value): + with open(self.path, "w") as f: + return f.write(f"{value}\n") + + def __get_value(self): + with open(self.path) as f: + return f.read().strip() + + @property + def int_value(self) -> int: + return int(self.__get_value()) + + @int_value.setter + def int_value(self, v: int) -> None: + self.__set_value(v) + + @property + def str_value(self) -> str: + return self.__get_value() + + @str_value.setter + def str_value(self, v: str) -> None: + self.__set_value(v) + + +class LED(object): + def __init__(self, sys_path): + self.max_brightness = SysfsFile(sys_path / "max_brightness").int_value + self.__brightness = SysfsFile(sys_path / "brightness") + + @property + def brightness(self) -> int: + return self.__brightness.int_value + + @brightness.setter + def brightness(self, value: int) -> None: + self.__brightness.int_value = value + + +class PowerSupply(object): + """Represents Linux power_supply_class sysfs nodes.""" + + def __init__(self, sys_path): + self._capacity = SysfsFile(sys_path / "capacity") + self._status = SysfsFile(sys_path / "status") + self._type = SysfsFile(sys_path / "type") + + @property + def capacity(self) -> int: + return self._capacity.int_value + + @property + def status(self) -> str: + return self._status.str_value + + @property + def type(self) -> str: + return self._type.str_value + + +class HIDIsReady(object): + """ + Companion class that binds to a kernel mechanism + and that allows to know when a uhid device is ready or not. + + See :meth:`is_ready` for details. + """ + + def __init__(self: "HIDIsReady", uhid: UHIDDevice) -> None: + self.uhid = uhid + + def is_ready(self: "HIDIsReady") -> bool: + """ + Overwrite in subclasses: should return True or False whether + the attached uhid device is ready or not. + """ + return False + + +class UdevHIDIsReady(HIDIsReady): + _pyudev_context: ClassVar[Optional[pyudev.Context]] = None + _pyudev_monitor: ClassVar[Optional[pyudev.Monitor]] = None + _uhid_devices: ClassVar[Dict[int, Tuple[bool, int]]] = {} + + def __init__(self: "UdevHIDIsReady", uhid: UHIDDevice) -> None: + super().__init__(uhid) + self._init_pyudev() + + @classmethod + def _init_pyudev(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_context is None: + cls._pyudev_context = pyudev.Context() + cls._pyudev_monitor = pyudev.Monitor.from_netlink(cls._pyudev_context) + cls._pyudev_monitor.filter_by("hid") + cls._pyudev_monitor.start() + + UHIDDevice._append_fd_to_poll( + cls._pyudev_monitor.fileno(), cls._cls_udev_event_callback + ) + + @classmethod + def _cls_udev_event_callback(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_monitor is None: + return + event: pyudev.Device + for event in iter(functools.partial(cls._pyudev_monitor.poll, 0.02), None): + if event.action not in ["bind", "remove", "unbind"]: + return + + logger.debug(f"udev event: {event.action} -> {event}") + + id = int(event.sys_path.strip().split(".")[-1], 16) + + device_ready, count = cls._uhid_devices.get(id, (False, 0)) + + ready = event.action == "bind" + if not device_ready and ready: + count += 1 + cls._uhid_devices[id] = (ready, count) + + def is_ready(self: "UdevHIDIsReady") -> Tuple[bool, int]: + try: + return self._uhid_devices[self.uhid.hid_id] + except KeyError: + return (False, 0) + + +class EvdevMatch(object): + def __init__( + self: "EvdevMatch", + *, + requires: List[Any] = [], + excludes: List[Any] = [], + req_properties: List[Any] = [], + excl_properties: List[Any] = [], + ) -> None: + self.requires = requires + self.excludes = excludes + self.req_properties = req_properties + self.excl_properties = excl_properties + + def is_a_match(self: "EvdevMatch", evdev: libevdev.Device) -> bool: + for m in self.requires: + if not evdev.has(m): + return False + for m in self.excludes: + if evdev.has(m): + return False + for p in self.req_properties: + if not evdev.has_property(p): + return False + for p in self.excl_properties: + if evdev.has_property(p): + return False + return True + + +class EvdevDevice(object): + """ + Represents an Evdev node and its properties. + This is a stub for the libevdev devices, as they are relying on + uevent to get the data, saving us some ioctls to fetch the names + and properties. + """ + + def __init__(self: "EvdevDevice", sysfs: Path) -> None: + self.sysfs = sysfs + self.event_node: Any = None + self.libevdev: Optional[libevdev.Device] = None + + self.uevents = {} + # all of the interesting properties are stored in the input uevent, so in the parent + # so convert the uevent file of the parent input node into a dict + with open(sysfs.parent / "uevent") as f: + for line in f.readlines(): + key, value = line.strip().split("=") + self.uevents[key] = value.strip('"') + + # we open all evdev nodes in order to not miss any event + self.open() + + @property + def name(self: "EvdevDevice") -> str: + assert "NAME" in self.uevents + + return self.uevents["NAME"] + + @property + def evdev(self: "EvdevDevice") -> Path: + return Path("/dev/input") / self.sysfs.name + + def matches_application( + self: "EvdevDevice", application: str, matches: Dict[str, EvdevMatch] + ) -> bool: + if self.libevdev is None: + return False + + if application in matches: + return matches[application].is_a_match(self.libevdev) + + logger.error( + f"application '{application}' is unknown, please update/fix hid-tools" + ) + assert False # hid-tools likely needs an update + + def open(self: "EvdevDevice") -> libevdev.Device: + self.event_node = open(self.evdev, "rb") + self.libevdev = libevdev.Device(self.event_node) + + assert self.libevdev.fd is not None + + fd = self.libevdev.fd.fileno() + flag = fcntl.fcntl(fd, fcntl.F_GETFD) + fcntl.fcntl(fd, fcntl.F_SETFL, flag | os.O_NONBLOCK) + + return self.libevdev + + def close(self: "EvdevDevice") -> None: + if self.libevdev is not None and self.libevdev.fd is not None: + self.libevdev.fd.close() + self.libevdev = None + if self.event_node is not None: + self.event_node.close() + self.event_node = None + + +class BaseDevice(UHIDDevice): + # default _application_matches that matches nothing. This needs + # to be set in the subclasses to have get_evdev() working + _application_matches: Dict[str, EvdevMatch] = {} + + def __init__( + self, + name, + application, + rdesc_str: Optional[str] = None, + rdesc: Optional[Union[hid.ReportDescriptor, str, bytes]] = None, + input_info=None, + ) -> None: + self._kernel_is_ready: HIDIsReady = UdevHIDIsReady(self) + if rdesc_str is None and rdesc is None: + raise Exception("Please provide at least a rdesc or rdesc_str") + super().__init__() + if name is None: + name = f"uhid gamepad test {self.__class__.__name__}" + if input_info is None: + input_info = (BusType.USB, 1, 2) + self.name = name + self.info = input_info + self.default_reportID = None + self.opened = False + self.started = False + self.application = application + self._input_nodes: Optional[list[EvdevDevice]] = None + if rdesc is None: + assert rdesc_str is not None + self.rdesc = hid.ReportDescriptor.from_human_descr(rdesc_str) # type: ignore + else: + self.rdesc = rdesc # type: ignore + + @property + def power_supply_class(self: "BaseDevice") -> Optional[PowerSupply]: + ps = self.walk_sysfs("power_supply", "power_supply/*") + if ps is None or len(ps) < 1: + return None + + return PowerSupply(ps[0]) + + @property + def led_classes(self: "BaseDevice") -> List[LED]: + leds = self.walk_sysfs("led", "**/max_brightness") + if leds is None: + return [] + + return [LED(led.parent) for led in leds] + + @property + def kernel_is_ready(self: "BaseDevice") -> bool: + return self._kernel_is_ready.is_ready()[0] and self.started + + @property + def kernel_ready_count(self: "BaseDevice") -> int: + return self._kernel_is_ready.is_ready()[1] + + @property + def input_nodes(self: "BaseDevice") -> List[EvdevDevice]: + if self._input_nodes is not None: + return self._input_nodes + + if not self.kernel_is_ready or not self.started: + return [] + + self._input_nodes = [ + EvdevDevice(path) + for path in self.walk_sysfs("input", "input/input*/event*") + ] + return self._input_nodes + + def match_evdev_rule(self, application, evdev): + """Replace this in subclasses if the device has multiple reports + of the same type and we need to filter based on the actual evdev + node. + + returning True will append the corresponding report to + `self.input_nodes[type]` + returning False will ignore this report / type combination + for the device. + """ + return True + + def open(self): + self.opened = True + + def _close_all_opened_evdev(self): + if self._input_nodes is not None: + for e in self._input_nodes: + e.close() + + def __del__(self): + self._close_all_opened_evdev() + + def close(self): + self.opened = False + + def start(self, flags): + self.started = True + + def stop(self): + self.started = False + self._close_all_opened_evdev() + + def next_sync_events(self, application=None): + evdev = self.get_evdev(application) + if evdev is not None: + return list(evdev.events()) + return [] + + @property + def application_matches(self: "BaseDevice") -> Dict[str, EvdevMatch]: + return self._application_matches + + @application_matches.setter + def application_matches(self: "BaseDevice", data: Dict[str, EvdevMatch]) -> None: + self._application_matches = data + + def get_evdev(self, application=None): + if application is None: + application = self.application + + if len(self.input_nodes) == 0: + return None + + assert self._input_nodes is not None + + if len(self._input_nodes) == 1: + evdev = self._input_nodes[0] + if self.match_evdev_rule(application, evdev.libevdev): + return evdev.libevdev + else: + for _evdev in self._input_nodes: + if _evdev.matches_application(application, self.application_matches): + if self.match_evdev_rule(application, _evdev.libevdev): + return _evdev.libevdev + + def is_ready(self): + """Returns whether a UHID device is ready. Can be overwritten in + subclasses to add extra conditions on when to consider a UHID + device ready. This can be: + + - we need to wait on different types of input devices to be ready + (Touch Screen and Pen for example) + - we need to have at least 4 LEDs present + (len(self.uhdev.leds_classes) == 4) + - or any other combinations""" + return self.kernel_is_ready diff --git a/tools/testing/selftests/hid/tests/base_gamepad.py b/tools/testing/selftests/hid/tests/base_gamepad.py new file mode 100644 index 000000000000..ec74d75767a2 --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_gamepad.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: GPL-2.0 +import libevdev + +from .base_device import BaseDevice +from hidtools.util import BusType + + +class InvalidHIDCommunication(Exception): + pass + + +class GamepadData(object): + pass + + +class AxisMapping(object): + """Represents a mapping between a HID type + and an evdev event""" + + def __init__(self, hid, evdev=None): + self.hid = hid.lower() + + if evdev is None: + evdev = f"ABS_{hid.upper()}" + + self.evdev = libevdev.evbit("EV_ABS", evdev) + + +class BaseGamepad(BaseDevice): + buttons_map = { + 1: "BTN_SOUTH", + 2: "BTN_EAST", + 3: "BTN_C", + 4: "BTN_NORTH", + 5: "BTN_WEST", + 6: "BTN_Z", + 7: "BTN_TL", + 8: "BTN_TR", + 9: "BTN_TL2", + 10: "BTN_TR2", + 11: "BTN_SELECT", + 12: "BTN_START", + 13: "BTN_MODE", + 14: "BTN_THUMBL", + 15: "BTN_THUMBR", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__(self, rdesc, application="Game Pad", name=None, input_info=None): + assert rdesc is not None + super().__init__(name, application, input_info=input_info, rdesc=rdesc) + self.buttons = (1, 2, 3) + self._buttons = {} + self.left = (127, 127) + self.right = (127, 127) + self.hat_switch = 15 + assert self.parsed_rdesc is not None + + self.fields = [] + for r in self.parsed_rdesc.input_reports.values(): + if r.application_name == self.application: + self.fields.extend([f.usage_name for f in r]) + + def store_axes(self, which, gamepad, data): + amap = self.axes_map[which] + x, y = data + setattr(gamepad, amap["x"].hid, x) + setattr(gamepad, amap["y"].hid, y) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application="Game Pad", + ): + """ + Return an input report for this device. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + (expressed in 1/8 of circle, 0 being North, 2 East) + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application used to report the values + """ + if buttons is not None: + for i, b in buttons.items(): + if i not in self.buttons: + raise InvalidHIDCommunication( + f"button {i} is not part of this {self.application}" + ) + if b is not None: + self._buttons[i] = b + + def replace_none_in_tuple(item, default): + if item is None: + item = (None, None) + + if None in item: + if item[0] is None: + item = (default[0], item[1]) + if item[1] is None: + item = (item[0], default[1]) + + return item + + right = replace_none_in_tuple(right, self.right) + self.right = right + left = replace_none_in_tuple(left, self.left) + self.left = left + + if hat_switch is None: + hat_switch = self.hat_switch + else: + self.hat_switch = hat_switch + + reportID = reportID or self.default_reportID + + gamepad = GamepadData() + for i, b in self._buttons.items(): + gamepad.__setattr__(f"b{i}", int(b) if b is not None else 0) + + self.store_axes("left_stick", gamepad, left) + self.store_axes("right_stick", gamepad, right) + gamepad.hatswitch = hat_switch # type: ignore ### gamepad is by default empty + return super().create_report( + gamepad, reportID=reportID, application=application + ) + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + """ + Send an input event on the default report ID. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + """ + r = self.create_report( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + self.call_input_event(r) + return [r] + + +class JoystickGamepad(BaseGamepad): + buttons_map = { + 1: "BTN_TRIGGER", + 2: "BTN_THUMB", + 3: "BTN_THUMB2", + 4: "BTN_TOP", + 5: "BTN_TOP2", + 6: "BTN_PINKIE", + 7: "BTN_BASE", + 8: "BTN_BASE2", + 9: "BTN_BASE3", + 10: "BTN_BASE4", + 11: "BTN_BASE5", + 12: "BTN_BASE6", + 13: "BTN_DEAD", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("rudder"), + "y": AxisMapping("throttle"), + }, + } + + def __init__(self, rdesc, application="Joystick", name=None, input_info=None): + super().__init__(rdesc, application, name, input_info) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application=None, + ): + """ + Return an input report for this device. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application for this report, if needed + """ + if application is None: + application = "Joystick" + return super().create_report( + left=left, + right=right, + hat_switch=hat_switch, + buttons=buttons, + reportID=reportID, + application=application, + ) + + def store_right_joystick(self, gamepad, data): + gamepad.rudder, gamepad.throttle = data diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py index 26c74040b796..8d5b5ffdae49 100644 --- a/tools/testing/selftests/hid/tests/test_gamepad.py +++ b/tools/testing/selftests/hid/tests/test_gamepad.py @@ -10,7 +10,8 @@ from . import base import libevdev import pytest -from hidtools.device.base_gamepad import AsusGamepad, SaitekGamepad +from .base_gamepad import BaseGamepad, JoystickGamepad, AxisMapping +from hidtools.util import BusType import logging @@ -199,6 +200,449 @@ class BaseTest: ) +class SaitekGamepad(JoystickGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x04, # Usage (Joystick) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x09, 0x01, # .Usage (Pointer) 6 + 0xa1, 0x00, # .Collection (Physical) 8 + 0x85, 0x01, # ..Report ID (1) 10 + 0x09, 0x30, # ..Usage (X) 12 + 0x15, 0x00, # ..Logical Minimum (0) 14 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 16 + 0x35, 0x00, # ..Physical Minimum (0) 19 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 21 + 0x75, 0x08, # ..Report Size (8) 24 + 0x95, 0x01, # ..Report Count (1) 26 + 0x81, 0x02, # ..Input (Data,Var,Abs) 28 + 0x09, 0x31, # ..Usage (Y) 30 + 0x81, 0x02, # ..Input (Data,Var,Abs) 32 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 34 + 0x09, 0xba, # ..Usage (Rudder) 36 + 0x81, 0x02, # ..Input (Data,Var,Abs) 38 + 0x09, 0xbb, # ..Usage (Throttle) 40 + 0x81, 0x02, # ..Input (Data,Var,Abs) 42 + 0x05, 0x09, # ..Usage Page (Button) 44 + 0x19, 0x01, # ..Usage Minimum (1) 46 + 0x29, 0x0c, # ..Usage Maximum (12) 48 + 0x25, 0x01, # ..Logical Maximum (1) 50 + 0x45, 0x01, # ..Physical Maximum (1) 52 + 0x75, 0x01, # ..Report Size (1) 54 + 0x95, 0x0c, # ..Report Count (12) 56 + 0x81, 0x02, # ..Input (Data,Var,Abs) 58 + 0x95, 0x01, # ..Report Count (1) 60 + 0x75, 0x00, # ..Report Size (0) 62 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 64 + 0x05, 0x01, # ..Usage Page (Generic Desktop) 66 + 0x09, 0x39, # ..Usage (Hat switch) 68 + 0x25, 0x07, # ..Logical Maximum (7) 70 + 0x46, 0x3b, 0x01, # ..Physical Maximum (315) 72 + 0x55, 0x00, # ..Unit Exponent (0) 75 + 0x65, 0x44, # ..Unit (Degrees^4,EngRotation) 77 + 0x75, 0x04, # ..Report Size (4) 79 + 0x81, 0x42, # ..Input (Data,Var,Abs,Null) 81 + 0x65, 0x00, # ..Unit (None) 83 + 0xc0, # .End Collection 85 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 86 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 88 + 0xa1, 0x02, # .Collection (Logical) 90 + 0x85, 0x02, # ..Report ID (2) 92 + 0x09, 0xa0, # ..Usage (Vendor Usage 0xa0) 94 + 0x09, 0x9f, # ..Usage (Vendor Usage 0x9f) 96 + 0x25, 0x01, # ..Logical Maximum (1) 98 + 0x45, 0x00, # ..Physical Maximum (0) 100 + 0x75, 0x01, # ..Report Size (1) 102 + 0x95, 0x02, # ..Report Count (2) 104 + 0x81, 0x02, # ..Input (Data,Var,Abs) 106 + 0x75, 0x06, # ..Report Size (6) 108 + 0x95, 0x01, # ..Report Count (1) 110 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 112 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 114 + 0x75, 0x07, # ..Report Size (7) 116 + 0x25, 0x7f, # ..Logical Maximum (127) 118 + 0x81, 0x02, # ..Input (Data,Var,Abs) 120 + 0x09, 0x94, # ..Usage (Vendor Usage 0x94) 122 + 0x75, 0x01, # ..Report Size (1) 124 + 0x25, 0x01, # ..Logical Maximum (1) 126 + 0x81, 0x02, # ..Input (Data,Var,Abs) 128 + 0xc0, # .End Collection 130 + 0x09, 0x21, # .Usage (Vendor Usage 0x21) 131 + 0xa1, 0x02, # .Collection (Logical) 133 + 0x85, 0x0b, # ..Report ID (11) 135 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 137 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 139 + 0x75, 0x08, # ..Report Size (8) 142 + 0x91, 0x02, # ..Output (Data,Var,Abs) 144 + 0x09, 0x53, # ..Usage (Vendor Usage 0x53) 146 + 0x25, 0x0a, # ..Logical Maximum (10) 148 + 0x91, 0x02, # ..Output (Data,Var,Abs) 150 + 0x09, 0x50, # ..Usage (Vendor Usage 0x50) 152 + 0x27, 0xfe, 0xff, 0x00, 0x00, # ..Logical Maximum (65534) 154 + 0x47, 0xfe, 0xff, 0x00, 0x00, # ..Physical Maximum (65534) 159 + 0x75, 0x10, # ..Report Size (16) 164 + 0x55, 0xfd, # ..Unit Exponent (237) 166 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 168 + 0x91, 0x02, # ..Output (Data,Var,Abs) 171 + 0x55, 0x00, # ..Unit Exponent (0) 173 + 0x65, 0x00, # ..Unit (None) 175 + 0x09, 0x54, # ..Usage (Vendor Usage 0x54) 177 + 0x55, 0xfd, # ..Unit Exponent (237) 179 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 181 + 0x91, 0x02, # ..Output (Data,Var,Abs) 184 + 0x55, 0x00, # ..Unit Exponent (0) 186 + 0x65, 0x00, # ..Unit (None) 188 + 0x09, 0xa7, # ..Usage (Vendor Usage 0xa7) 190 + 0x55, 0xfd, # ..Unit Exponent (237) 192 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 194 + 0x91, 0x02, # ..Output (Data,Var,Abs) 197 + 0x55, 0x00, # ..Unit Exponent (0) 199 + 0x65, 0x00, # ..Unit (None) 201 + 0xc0, # .End Collection 203 + 0x09, 0x5a, # .Usage (Vendor Usage 0x5a) 204 + 0xa1, 0x02, # .Collection (Logical) 206 + 0x85, 0x0c, # ..Report ID (12) 208 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 210 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 212 + 0x45, 0x00, # ..Physical Maximum (0) 215 + 0x75, 0x08, # ..Report Size (8) 217 + 0x91, 0x02, # ..Output (Data,Var,Abs) 219 + 0x09, 0x5c, # ..Usage (Vendor Usage 0x5c) 221 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 223 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 226 + 0x75, 0x10, # ..Report Size (16) 229 + 0x55, 0xfd, # ..Unit Exponent (237) 231 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 233 + 0x91, 0x02, # ..Output (Data,Var,Abs) 236 + 0x55, 0x00, # ..Unit Exponent (0) 238 + 0x65, 0x00, # ..Unit (None) 240 + 0x09, 0x5b, # ..Usage (Vendor Usage 0x5b) 242 + 0x25, 0x7f, # ..Logical Maximum (127) 244 + 0x75, 0x08, # ..Report Size (8) 246 + 0x91, 0x02, # ..Output (Data,Var,Abs) 248 + 0x09, 0x5e, # ..Usage (Vendor Usage 0x5e) 250 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 252 + 0x75, 0x10, # ..Report Size (16) 255 + 0x55, 0xfd, # ..Unit Exponent (237) 257 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 259 + 0x91, 0x02, # ..Output (Data,Var,Abs) 262 + 0x55, 0x00, # ..Unit Exponent (0) 264 + 0x65, 0x00, # ..Unit (None) 266 + 0x09, 0x5d, # ..Usage (Vendor Usage 0x5d) 268 + 0x25, 0x7f, # ..Logical Maximum (127) 270 + 0x75, 0x08, # ..Report Size (8) 272 + 0x91, 0x02, # ..Output (Data,Var,Abs) 274 + 0xc0, # .End Collection 276 + 0x09, 0x73, # .Usage (Vendor Usage 0x73) 277 + 0xa1, 0x02, # .Collection (Logical) 279 + 0x85, 0x0d, # ..Report ID (13) 281 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 283 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 285 + 0x45, 0x00, # ..Physical Maximum (0) 288 + 0x91, 0x02, # ..Output (Data,Var,Abs) 290 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 292 + 0x15, 0x81, # ..Logical Minimum (-127) 294 + 0x25, 0x7f, # ..Logical Maximum (127) 296 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 298 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 301 + 0x91, 0x02, # ..Output (Data,Var,Abs) 304 + 0xc0, # .End Collection 306 + 0x09, 0x6e, # .Usage (Vendor Usage 0x6e) 307 + 0xa1, 0x02, # .Collection (Logical) 309 + 0x85, 0x0e, # ..Report ID (14) 311 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 313 + 0x15, 0x00, # ..Logical Minimum (0) 315 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 317 + 0x35, 0x00, # ..Physical Minimum (0) 320 + 0x45, 0x00, # ..Physical Maximum (0) 322 + 0x91, 0x02, # ..Output (Data,Var,Abs) 324 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 326 + 0x25, 0x7f, # ..Logical Maximum (127) 328 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 330 + 0x91, 0x02, # ..Output (Data,Var,Abs) 333 + 0x09, 0x6f, # ..Usage (Vendor Usage 0x6f) 335 + 0x15, 0x81, # ..Logical Minimum (-127) 337 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 339 + 0x91, 0x02, # ..Output (Data,Var,Abs) 342 + 0x09, 0x71, # ..Usage (Vendor Usage 0x71) 344 + 0x15, 0x00, # ..Logical Minimum (0) 346 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 348 + 0x35, 0x00, # ..Physical Minimum (0) 351 + 0x46, 0x68, 0x01, # ..Physical Maximum (360) 353 + 0x91, 0x02, # ..Output (Data,Var,Abs) 356 + 0x09, 0x72, # ..Usage (Vendor Usage 0x72) 358 + 0x75, 0x10, # ..Report Size (16) 360 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 362 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 365 + 0x55, 0xfd, # ..Unit Exponent (237) 368 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 370 + 0x91, 0x02, # ..Output (Data,Var,Abs) 373 + 0x55, 0x00, # ..Unit Exponent (0) 375 + 0x65, 0x00, # ..Unit (None) 377 + 0xc0, # .End Collection 379 + 0x09, 0x77, # .Usage (Vendor Usage 0x77) 380 + 0xa1, 0x02, # .Collection (Logical) 382 + 0x85, 0x51, # ..Report ID (81) 384 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 386 + 0x25, 0x7f, # ..Logical Maximum (127) 388 + 0x45, 0x00, # ..Physical Maximum (0) 390 + 0x75, 0x08, # ..Report Size (8) 392 + 0x91, 0x02, # ..Output (Data,Var,Abs) 394 + 0x09, 0x78, # ..Usage (Vendor Usage 0x78) 396 + 0xa1, 0x02, # ..Collection (Logical) 398 + 0x09, 0x7b, # ...Usage (Vendor Usage 0x7b) 400 + 0x09, 0x79, # ...Usage (Vendor Usage 0x79) 402 + 0x09, 0x7a, # ...Usage (Vendor Usage 0x7a) 404 + 0x15, 0x01, # ...Logical Minimum (1) 406 + 0x25, 0x03, # ...Logical Maximum (3) 408 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 410 + 0xc0, # ..End Collection 412 + 0x09, 0x7c, # ..Usage (Vendor Usage 0x7c) 413 + 0x15, 0x00, # ..Logical Minimum (0) 415 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 417 + 0x91, 0x02, # ..Output (Data,Var,Abs) 420 + 0xc0, # .End Collection 422 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 423 + 0xa1, 0x02, # .Collection (Logical) 425 + 0x85, 0x52, # ..Report ID (82) 427 + 0x09, 0x96, # ..Usage (Vendor Usage 0x96) 429 + 0xa1, 0x02, # ..Collection (Logical) 431 + 0x09, 0x9a, # ...Usage (Vendor Usage 0x9a) 433 + 0x09, 0x99, # ...Usage (Vendor Usage 0x99) 435 + 0x09, 0x97, # ...Usage (Vendor Usage 0x97) 437 + 0x09, 0x98, # ...Usage (Vendor Usage 0x98) 439 + 0x09, 0x9b, # ...Usage (Vendor Usage 0x9b) 441 + 0x09, 0x9c, # ...Usage (Vendor Usage 0x9c) 443 + 0x15, 0x01, # ...Logical Minimum (1) 445 + 0x25, 0x06, # ...Logical Maximum (6) 447 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 449 + 0xc0, # ..End Collection 451 + 0xc0, # .End Collection 452 + 0x05, 0xff, # .Usage Page (Vendor Usage Page 0xff) 453 + 0x0a, 0x01, 0x03, # .Usage (Vendor Usage 0x301) 455 + 0xa1, 0x02, # .Collection (Logical) 458 + 0x85, 0x40, # ..Report ID (64) 460 + 0x0a, 0x02, 0x03, # ..Usage (Vendor Usage 0x302) 462 + 0xa1, 0x02, # ..Collection (Logical) 465 + 0x1a, 0x11, 0x03, # ...Usage Minimum (785) 467 + 0x2a, 0x20, 0x03, # ...Usage Maximum (800) 470 + 0x25, 0x10, # ...Logical Maximum (16) 473 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 475 + 0xc0, # ..End Collection 477 + 0x0a, 0x03, 0x03, # ..Usage (Vendor Usage 0x303) 478 + 0x15, 0x00, # ..Logical Minimum (0) 481 + 0x27, 0xff, 0xff, 0x00, 0x00, # ..Logical Maximum (65535) 483 + 0x75, 0x10, # ..Report Size (16) 488 + 0x91, 0x02, # ..Output (Data,Var,Abs) 490 + 0xc0, # .End Collection 492 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 493 + 0x09, 0x7d, # .Usage (Vendor Usage 0x7d) 495 + 0xa1, 0x02, # .Collection (Logical) 497 + 0x85, 0x43, # ..Report ID (67) 499 + 0x09, 0x7e, # ..Usage (Vendor Usage 0x7e) 501 + 0x26, 0x80, 0x00, # ..Logical Maximum (128) 503 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 506 + 0x75, 0x08, # ..Report Size (8) 509 + 0x91, 0x02, # ..Output (Data,Var,Abs) 511 + 0xc0, # .End Collection 513 + 0x09, 0x7f, # .Usage (Vendor Usage 0x7f) 514 + 0xa1, 0x02, # .Collection (Logical) 516 + 0x85, 0x0b, # ..Report ID (11) 518 + 0x09, 0x80, # ..Usage (Vendor Usage 0x80) 520 + 0x26, 0xff, 0x7f, # ..Logical Maximum (32767) 522 + 0x45, 0x00, # ..Physical Maximum (0) 525 + 0x75, 0x0f, # ..Report Size (15) 527 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 529 + 0x09, 0xa9, # ..Usage (Vendor Usage 0xa9) 531 + 0x25, 0x01, # ..Logical Maximum (1) 533 + 0x75, 0x01, # ..Report Size (1) 535 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 537 + 0x09, 0x83, # ..Usage (Vendor Usage 0x83) 539 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 541 + 0x75, 0x08, # ..Report Size (8) 544 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 546 + 0xc0, # .End Collection 548 + 0x09, 0xab, # .Usage (Vendor Usage 0xab) 549 + 0xa1, 0x03, # .Collection (Report) 551 + 0x85, 0x15, # ..Report ID (21) 553 + 0x09, 0x25, # ..Usage (Vendor Usage 0x25) 555 + 0xa1, 0x02, # ..Collection (Logical) 557 + 0x09, 0x26, # ...Usage (Vendor Usage 0x26) 559 + 0x09, 0x30, # ...Usage (Vendor Usage 0x30) 561 + 0x09, 0x32, # ...Usage (Vendor Usage 0x32) 563 + 0x09, 0x31, # ...Usage (Vendor Usage 0x31) 565 + 0x09, 0x33, # ...Usage (Vendor Usage 0x33) 567 + 0x09, 0x34, # ...Usage (Vendor Usage 0x34) 569 + 0x15, 0x01, # ...Logical Minimum (1) 571 + 0x25, 0x06, # ...Logical Maximum (6) 573 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 575 + 0xc0, # ..End Collection 577 + 0xc0, # .End Collection 578 + 0x09, 0x89, # .Usage (Vendor Usage 0x89) 579 + 0xa1, 0x03, # .Collection (Report) 581 + 0x85, 0x16, # ..Report ID (22) 583 + 0x09, 0x8b, # ..Usage (Vendor Usage 0x8b) 585 + 0xa1, 0x02, # ..Collection (Logical) 587 + 0x09, 0x8c, # ...Usage (Vendor Usage 0x8c) 589 + 0x09, 0x8d, # ...Usage (Vendor Usage 0x8d) 591 + 0x09, 0x8e, # ...Usage (Vendor Usage 0x8e) 593 + 0x25, 0x03, # ...Logical Maximum (3) 595 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 597 + 0xc0, # ..End Collection 599 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 600 + 0x15, 0x00, # ..Logical Minimum (0) 602 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 604 + 0xb1, 0x02, # ..Feature (Data,Var,Abs) 607 + 0xc0, # .End Collection 609 + 0x09, 0x90, # .Usage (Vendor Usage 0x90) 610 + 0xa1, 0x03, # .Collection (Report) 612 + 0x85, 0x50, # ..Report ID (80) 614 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 616 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 618 + 0x91, 0x02, # ..Output (Data,Var,Abs) 621 + 0xc0, # .End Collection 623 + 0xc0, # End Collection 624 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x06A3, 0xFF0D)) + self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + + +class AsusGamepad(BaseGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x05, # Usage (Game Pad) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x85, 0x01, # .Report ID (1) 6 + 0x05, 0x09, # .Usage Page (Button) 8 + 0x0a, 0x01, 0x00, # .Usage (Vendor Usage 0x01) 10 + 0x0a, 0x02, 0x00, # .Usage (Vendor Usage 0x02) 13 + 0x0a, 0x04, 0x00, # .Usage (Vendor Usage 0x04) 16 + 0x0a, 0x05, 0x00, # .Usage (Vendor Usage 0x05) 19 + 0x0a, 0x07, 0x00, # .Usage (Vendor Usage 0x07) 22 + 0x0a, 0x08, 0x00, # .Usage (Vendor Usage 0x08) 25 + 0x0a, 0x0e, 0x00, # .Usage (Vendor Usage 0x0e) 28 + 0x0a, 0x0f, 0x00, # .Usage (Vendor Usage 0x0f) 31 + 0x0a, 0x0d, 0x00, # .Usage (Vendor Usage 0x0d) 34 + 0x05, 0x0c, # .Usage Page (Consumer Devices) 37 + 0x0a, 0x24, 0x02, # .Usage (AC Back) 39 + 0x0a, 0x23, 0x02, # .Usage (AC Home) 42 + 0x15, 0x00, # .Logical Minimum (0) 45 + 0x25, 0x01, # .Logical Maximum (1) 47 + 0x75, 0x01, # .Report Size (1) 49 + 0x95, 0x0b, # .Report Count (11) 51 + 0x81, 0x02, # .Input (Data,Var,Abs) 53 + 0x75, 0x01, # .Report Size (1) 55 + 0x95, 0x01, # .Report Count (1) 57 + 0x81, 0x03, # .Input (Cnst,Var,Abs) 59 + 0x05, 0x01, # .Usage Page (Generic Desktop) 61 + 0x75, 0x04, # .Report Size (4) 63 + 0x95, 0x01, # .Report Count (1) 65 + 0x25, 0x07, # .Logical Maximum (7) 67 + 0x46, 0x3b, 0x01, # .Physical Maximum (315) 69 + 0x66, 0x14, 0x00, # .Unit (Degrees,EngRotation) 72 + 0x09, 0x39, # .Usage (Hat switch) 75 + 0x81, 0x42, # .Input (Data,Var,Abs,Null) 77 + 0x66, 0x00, 0x00, # .Unit (None) 79 + 0x09, 0x01, # .Usage (Pointer) 82 + 0xa1, 0x00, # .Collection (Physical) 84 + 0x09, 0x30, # ..Usage (X) 86 + 0x09, 0x31, # ..Usage (Y) 88 + 0x09, 0x32, # ..Usage (Z) 90 + 0x09, 0x35, # ..Usage (Rz) 92 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 94 + 0x09, 0xc5, # ..Usage (Brake) 96 + 0x09, 0xc4, # ..Usage (Accelerator) 98 + 0x15, 0x00, # ..Logical Minimum (0) 100 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 102 + 0x35, 0x00, # ..Physical Minimum (0) 105 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 107 + 0x75, 0x08, # ..Report Size (8) 110 + 0x95, 0x06, # ..Report Count (6) 112 + 0x81, 0x02, # ..Input (Data,Var,Abs) 114 + 0xc0, # .End Collection 116 + 0x85, 0x02, # .Report ID (2) 117 + 0x05, 0x08, # .Usage Page (LEDs) 119 + 0x0a, 0x01, 0x00, # .Usage (Num Lock) 121 + 0x0a, 0x02, 0x00, # .Usage (Caps Lock) 124 + 0x0a, 0x03, 0x00, # .Usage (Scroll Lock) 127 + 0x0a, 0x04, 0x00, # .Usage (Compose) 130 + 0x15, 0x00, # .Logical Minimum (0) 133 + 0x25, 0x01, # .Logical Maximum (1) 135 + 0x75, 0x01, # .Report Size (1) 137 + 0x95, 0x04, # .Report Count (4) 139 + 0x91, 0x02, # .Output (Data,Var,Abs) 141 + 0x75, 0x04, # .Report Size (4) 143 + 0x95, 0x01, # .Report Count (1) 145 + 0x91, 0x03, # .Output (Cnst,Var,Abs) 147 + 0xc0, # End Collection 149 + 0x05, 0x0c, # Usage Page (Consumer Devices) 150 + 0x09, 0x01, # Usage (Consumer Control) 152 + 0xa1, 0x01, # Collection (Application) 154 + 0x85, 0x03, # .Report ID (3) 156 + 0x05, 0x01, # .Usage Page (Generic Desktop) 158 + 0x09, 0x06, # .Usage (Keyboard) 160 + 0xa1, 0x02, # .Collection (Logical) 162 + 0x05, 0x06, # ..Usage Page (Generic Device Controls) 164 + 0x09, 0x20, # ..Usage (Battery Strength) 166 + 0x15, 0x00, # ..Logical Minimum (0) 168 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 170 + 0x75, 0x08, # ..Report Size (8) 173 + 0x95, 0x01, # ..Report Count (1) 175 + 0x81, 0x02, # ..Input (Data,Var,Abs) 177 + 0x06, 0xbc, 0xff, # ..Usage Page (Vendor Usage Page 0xffbc) 179 + 0x0a, 0xad, 0xbd, # ..Usage (Vendor Usage 0xbdad) 182 + 0x75, 0x08, # ..Report Size (8) 185 + 0x95, 0x06, # ..Report Count (6) 187 + 0x81, 0x02, # ..Input (Data,Var,Abs) 189 + 0xc0, # .End Collection 191 + 0xc0, # End Collection 192 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x18D1, 0x2C40)) + self.buttons = (1, 2, 4, 5, 7, 8, 14, 15, 13) + + +class RaptorMach2Joystick(JoystickGamepad): + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__( + self, + name, + rdesc=None, + application="Joystick", + input_info=(BusType.USB, 0x11C0, 0x5606), + ): + super().__init__(rdesc, application, name, input_info) + self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + self.hat_switch = 240 # null value is 240 as max is 239 + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + if hat_switch is not None: + hat_switch *= 30 + + return super().event( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + + class TestSaitekGamepad(BaseTest.TestGamepad): def create_device(self): return SaitekGamepad() @@ -207,3 +651,14 @@ class TestSaitekGamepad(BaseTest.TestGamepad): class TestAsusGamepad(BaseTest.TestGamepad): def create_device(self): return AsusGamepad() + + +class TestRaptorMach2Joystick(BaseTest.TestGamepad): + hid_bpfs = [("FR-TEC__Raptor-Mach-2.bpf.o", True)] + + def create_device(self): + return RaptorMach2Joystick( + "uhid test Sanmos Group FR-TEC Raptor MACH 2", + rdesc="05 01 09 04 a1 01 05 01 85 01 05 01 09 30 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 31 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 33 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 00 09 00 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 32 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 35 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 34 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 36 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 09 19 01 2a 1d 00 15 00 25 01 75 01 96 80 00 81 02 05 01 09 39 26 ef 00 46 68 01 65 14 75 10 95 01 81 42 05 01 09 00 75 08 95 1d 81 01 15 00 26 ef 00 85 58 26 ff 00 46 ff 00 75 08 95 3f 09 00 91 02 85 59 75 08 95 80 09 00 b1 02 c0", + input_info=(BusType.USB, 0x11C0, 0x5606), + ) diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index 903f19f7cbe9..a9e2de1e8861 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -35,6 +35,7 @@ class BtnPressed(Enum): PRIMARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS SECONDARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS2 + THIRD_PRESSED = libevdev.EV_KEY.BTN_STYLUS3 class PenState(Enum): @@ -44,58 +45,28 @@ class PenState(Enum): We extend it with the various buttons when we need to check them. """ - PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, None - PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, None - PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, BtnPressed.PRIMARY_PRESSED - PEN_IS_IN_RANGE_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, None - PEN_IS_IN_CONTACT_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, None - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, None - PEN_IS_ERASING_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_ERASING_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - - def __init__(self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[BtnPressed]): + PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, False + PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, False + PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, True + PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, False + PEN_IS_IN_CONTACT_WITH_BUTTON = BtnTouch.DOWN, ToolType.PEN, True + PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, False + PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = BtnTouch.UP, ToolType.RUBBER, True + PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, False + PEN_IS_ERASING_WITH_BUTTON = BtnTouch.DOWN, ToolType.RUBBER, True + + def __init__( + self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[bool] + ): self.touch = touch # type: ignore self.tool = tool # type: ignore self.button = button # type: ignore @classmethod - def from_evdev(cls, evdev) -> "PenState": + def from_evdev(cls, evdev, test_button) -> "PenState": touch = BtnTouch(evdev.value[libevdev.EV_KEY.BTN_TOUCH]) tool = None - button = None + button = False if ( evdev.value[libevdev.EV_KEY.BTN_TOOL_RUBBER] and not evdev.value[libevdev.EV_KEY.BTN_TOOL_PEN] @@ -112,19 +83,20 @@ class PenState(Enum): ): raise ValueError("2 tools are not allowed") - # we take only the highest button in account - for b in [libevdev.EV_KEY.BTN_STYLUS, libevdev.EV_KEY.BTN_STYLUS2]: - if bool(evdev.value[b]): - button = BtnPressed(b) + # we take only the provided button into account + if test_button is not None: + button = bool(evdev.value[test_button.value]) # the kernel tends to insert an EV_SYN once removing the tool, so # the button will be released after if tool is None: - button = None + button = False return cls((touch, tool, button)) # type: ignore - def apply(self, events: List[libevdev.InputEvent], strict: bool) -> "PenState": + def apply( + self, events: List[libevdev.InputEvent], strict: bool, test_button: BtnPressed + ) -> "PenState": if libevdev.EV_SYN.SYN_REPORT in events: raise ValueError("EV_SYN is in the event sequence") touch = self.touch @@ -148,19 +120,16 @@ class PenState(Enum): raise ValueError(f"duplicated BTN_TOOL_* in {events}") tool_found = True tool = ToolType(ev.code) if ev.value else None - elif ev in ( - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS), - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS2), - ): + elif test_button is not None and ev in (test_button.value,): if button_found: raise ValueError(f"duplicated BTN_STYLUS* in {events}") button_found = True - button = BtnPressed(ev.code) if ev.value else None + button = bool(ev.value) # the kernel tends to insert an EV_SYN once removing the tool, so # the button will be released after if tool is None: - button = None + button = False new_state = PenState((touch, tool, button)) # type: ignore if strict: @@ -183,11 +152,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -195,7 +162,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -204,7 +170,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, ) @@ -236,21 +201,6 @@ class PenState(Enum): PenState.PEN_IS_IN_RANGE_WITH_BUTTON, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ) - return tuple() def historically_tolerated_transitions(self) -> Tuple["PenState", ...]: @@ -263,11 +213,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -275,7 +223,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -284,7 +231,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_OUT_OF_RANGE, ) @@ -319,22 +265,6 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ) - return tuple() @staticmethod @@ -402,9 +332,9 @@ class PenState(Enum): } @staticmethod - def legal_transitions_with_primary_button() -> Dict[str, Tuple["PenState", ...]]: + def legal_transitions_with_button() -> Dict[str, Tuple["PenState", ...]]: """We revisit the Windows Pen Implementation state machine: - we now have a primary button. + we now have a button. """ return { "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_BUTTON,), @@ -451,56 +381,6 @@ class PenState(Enum): } @staticmethod - def legal_transitions_with_secondary_button() -> Dict[str, Tuple["PenState", ...]]: - """We revisit the Windows Pen Implementation state machine: - we now have a secondary button. - Note: we don't looks for 2 buttons interactions. - """ - return { - "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,), - "hover-button -> out-of-range": ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ), - "in-range -> button-press": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ), - "in-range -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> touch -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - ), - "in-range -> touch -> button-press -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> button-release -> release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE, - ), - } - - @staticmethod def tolerated_transitions() -> Dict[str, Tuple["PenState", ...]]: """This is not adhering to the Windows Pen Implementation state machine but we should expect the kernel to behave properly, mostly for historical @@ -616,10 +496,22 @@ class Pen(object): evdev.value[axis] == value ), f"assert evdev.value[{axis}] ({evdev.value[axis]}) != {value}" - def assert_expected_input_events(self, evdev): + def assert_expected_input_events(self, evdev, button): assert evdev.value[libevdev.EV_ABS.ABS_X] == self.x assert evdev.value[libevdev.EV_ABS.ABS_Y] == self.y - assert self.current_state == PenState.from_evdev(evdev) + + # assert no other buttons than the tested ones are set + buttons = [ + BtnPressed.PRIMARY_PRESSED, + BtnPressed.SECONDARY_PRESSED, + BtnPressed.THIRD_PRESSED, + ] + if button is not None: + buttons.remove(button) + for b in buttons: + assert evdev.value[b.value] is None or evdev.value[b.value] == False + + assert self.current_state == PenState.from_evdev(evdev, button) class PenDigitizer(base.UHIDTestDevice): @@ -647,7 +539,7 @@ class PenDigitizer(base.UHIDTestDevice): continue self.fields = [f.usage_name for f in r] - def move_to(self, pen, state): + def move_to(self, pen, state, button): # fill in the previous values if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: pen.restore() @@ -690,29 +582,17 @@ class PenDigitizer(base.UHIDTestDevice): pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: pen.tipswitch = True pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False - elif state == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - pen.tipswitch = False - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True - elif state == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - pen.tipswitch = True - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: pen.tipswitch = False pen.inrange = True @@ -730,7 +610,7 @@ class PenDigitizer(base.UHIDTestDevice): pen.current_state = state - def event(self, pen): + def event(self, pen, button): rs = [] r = self.create_report(application=self.cur_application, data=pen) self.call_input_event(r) @@ -771,17 +651,17 @@ class BaseTest: def create_device(self): raise Exception("please reimplement me in subclasses") - def post(self, uhdev, pen): - r = uhdev.event(pen) + def post(self, uhdev, pen, test_button): + r = uhdev.event(pen, test_button) events = uhdev.next_sync_events() self.debug_reports(r, uhdev, events) return events def validate_transitions( - self, from_state, pen, evdev, events, allow_intermediate_states + self, from_state, pen, evdev, events, allow_intermediate_states, button ): # check that the final state is correct - pen.assert_expected_input_events(evdev) + pen.assert_expected_input_events(evdev, button) state = from_state @@ -794,12 +674,14 @@ class BaseTest: events = events[idx + 1 :] # now check for a valid transition - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) if events: - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) - def _test_states(self, state_list, scribble, allow_intermediate_states): + def _test_states( + self, state_list, scribble, allow_intermediate_states, button=None + ): """Internal method to test against a list of transition between states. state_list is a list of PenState objects @@ -812,10 +694,10 @@ class BaseTest: cur_state = PenState.PEN_IS_OUT_OF_RANGE p = Pen(50, 60) - uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE) - events = self.post(uhdev, p) + uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, button) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -824,18 +706,18 @@ class BaseTest: if scribble and cur_state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) assert len(events) >= 3 # X, Y, SYN - uhdev.move_to(p, state) + uhdev.move_to(p, state, button) if scribble and state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -874,12 +756,17 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in PenState.legal_transitions_with_primary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_primary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the primary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.PRIMARY_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Secondary Barrel Switch" not in uhdev.fields, @@ -890,12 +777,38 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in PenState.legal_transitions_with_secondary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_secondary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the secondary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.SECONDARY_PRESSED, + ) + + @pytest.mark.skip_if_uhdev( + lambda uhdev: "Third Barrel Switch" not in uhdev.fields, + "Device not compatible, missing Third Barrel Switch usage", + ) + @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"]) + @pytest.mark.parametrize( + "state_list", + [ + pytest.param(v, id=k) + for k, v in PenState.legal_transitions_with_button().items() + ], + ) + def test_valid_third_button_pen_states(self, state_list, scribble): + """Rework the transition state machine by adding the secondary button.""" + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.THIRD_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Invert" not in uhdev.fields, @@ -956,7 +869,7 @@ class BaseTest: class GXTP_pen(PenDigitizer): - def event(self, pen): + def event(self, pen, test_button): if not hasattr(self, "prev_tip_state"): self.prev_tip_state = False @@ -977,13 +890,407 @@ class GXTP_pen(PenDigitizer): if pen.eraser: internal_pen.invert = False - return super().event(internal_pen) + return super().event(internal_pen, test_button) class USIPen(PenDigitizer): pass +class XPPen_ArtistPro16Gen2_28bd_095b(PenDigitizer): + """ + Pen with two buttons and a rubber end, but which reports + the second button as an eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x095B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + + def move_to(self, pen, state, button): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.x = 0 + pen.y = 0 + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = True + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + + pen.current_state = state + + def event(self, pen, test_button): + import math + + pen_copy = copy.copy(pen) + width = 13.567 + height = 8.480 + tip_height = 0.055677699 + hx = tip_height * (32767 / width) + hy = tip_height * (32767 / height) + if pen_copy.xtilt != 0: + pen_copy.x += round(hx * math.sin(math.radians(pen_copy.xtilt))) + if pen_copy.ytilt != 0: + pen_copy.y += round(hy * math.sin(math.radians(pen_copy.ytilt))) + + return super().event(pen_copy, test_button) + + +class XPPen_Artist24_28bd_093a(PenDigitizer): + """ + Pen that reports secondary barrel switch through eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x093A), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + + pen.current_state = state + + def send_intermediate_state(self, pen, state, button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, button, debug=False) + return super().event(intermediate_pen, button) + + def event(self, pen, button): + rs = [] + + # the pen reliably sends in-range events in a normal case (non emulation of eraser mode) + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + if button == BtnPressed.SECONDARY_PRESSED: + if self.previous_state == PenState.PEN_IS_IN_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_RANGE: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_CONTACT: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE, button + ) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + +class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer): + """ + Pen that reports secondary barrel switch through secondary TipSwtich + and 3rd button through Invert + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Stylus", + physical=None, + input_info=(BusType.USB, 0x256C, 0x006B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.fields.append("Third Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = True + pen.barrelswitch = False + pen.secondarytipswitch = False + + pen.current_state = state + + def call_input_event(self, report): + if report[0] == 0x0a: + # ensures the original second Eraser usage is null + report[1] &= 0xdf + + # ensures the original last bit is equal to bit 6 (In Range) + if report[1] & 0x40: + report[1] |= 0x80 + + super().call_input_event(report) + + def send_intermediate_state(self, pen, state, test_button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, test_button, debug=False) + return super().event(intermediate_pen, test_button) + + def event(self, pen, button): + rs = [] + + # it's not possible to go between eraser mode or not without + # going out-of-prox: the eraser mode is activated by presenting + # the tail of the pen + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ) and pen.current_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + # same than above except from eraser to normal + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ) and pen.current_state in ( + PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + if self.previous_state == PenState.PEN_IS_OUT_OF_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + ################################################################################ # # Windows 7 compatible devices @@ -1162,3 +1469,37 @@ class TestGoodix_27c6_0e00(BaseTest.TestTablet): rdesc="05 0d 09 04 a1 01 85 01 09 22 a1 02 55 0e 65 11 35 00 15 00 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 54 15 00 25 7f 75 08 95 01 81 02 85 02 09 55 95 01 25 0a b1 02 85 03 06 00 ff 09 c5 15 00 26 ff 00 75 08 96 00 01 b1 02 c0 05 0d 09 02 a1 01 09 20 a1 00 85 08 05 01 a4 09 30 35 00 46 e6 09 15 00 26 04 20 55 0d 65 13 75 10 95 01 81 02 09 31 46 9a 06 26 60 15 81 02 b4 05 0d 09 38 95 01 75 08 15 00 25 01 81 02 09 30 75 10 26 ff 0f 81 02 09 31 81 02 09 42 09 44 09 5a 09 3c 09 45 09 32 75 01 95 06 25 01 81 02 95 02 81 03 09 3d 55 0e 65 14 36 d8 dc 46 28 23 16 d8 dc 26 28 23 95 01 75 10 81 02 09 3e 81 02 09 41 15 00 27 a0 8c 00 00 35 00 47 a0 8c 00 00 81 02 05 20 0a 53 04 65 00 16 01 f8 26 ff 07 75 10 95 01 81 02 0a 54 04 81 02 0a 55 04 81 02 0a 57 04 81 02 0a 58 04 81 02 0a 59 04 81 02 0a 72 04 81 02 0a 73 04 81 02 0a 74 04 81 02 05 0d 09 3b 15 00 25 64 75 08 81 02 09 5b 25 ff 75 40 81 02 06 00 ff 09 5b 75 20 81 02 05 0d 09 5c 26 ff 00 75 08 81 02 09 5e 81 02 09 70 a1 02 15 01 25 06 09 72 09 73 09 74 09 75 09 76 09 77 81 20 c0 06 00 ff 09 01 15 00 27 ff ff 00 00 75 10 95 01 81 02 85 09 09 81 a1 02 09 81 15 01 25 04 09 82 09 83 09 84 09 85 81 20 c0 85 10 09 5c a1 02 15 00 25 01 75 08 95 01 09 38 b1 02 09 5c 26 ff 00 b1 02 09 5d 75 01 95 01 25 01 b1 02 95 07 b1 03 c0 85 11 09 5e a1 02 09 38 15 00 25 01 75 08 95 01 b1 02 09 5e 26 ff 00 b1 02 09 5f 75 01 25 01 b1 02 75 07 b1 03 c0 85 12 09 70 a1 02 75 08 95 01 15 00 25 01 09 38 b1 02 09 70 a1 02 25 06 09 72 09 73 09 74 09 75 09 76 09 77 b1 20 c0 09 71 75 01 25 01 b1 02 75 07 b1 03 c0 85 13 09 80 15 00 25 ff 75 40 95 01 b1 02 85 14 09 44 a1 02 09 38 75 08 95 01 25 01 b1 02 15 01 25 03 09 44 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 5a a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 45 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 c0 85 15 75 08 95 01 05 0d 09 90 a1 02 09 38 25 01 b1 02 09 91 75 10 26 ff 0f b1 02 09 92 75 40 25 ff b1 02 05 06 09 2a 75 08 26 ff 00 a1 02 09 2d b1 02 09 2e b1 02 c0 c0 85 16 05 06 09 2b a1 02 05 0d 25 01 09 38 b1 02 05 06 09 2b a1 02 09 2d 26 ff 00 b1 02 09 2e b1 02 c0 c0 85 17 06 00 ff 09 01 a1 02 05 0d 09 38 75 08 95 01 25 01 b1 02 06 00 ff 09 01 75 10 27 ff ff 00 00 b1 02 c0 85 18 05 0d 09 38 75 08 95 01 15 00 25 01 b1 02 c0 c0 06 f0 ff 09 01 a1 01 85 0e 09 01 15 00 25 ff 75 08 95 40 91 02 09 01 15 00 25 ff 75 08 95 40 81 02 c0", input_info=(BusType.I2C, 0x27C6, 0x0E00), ) + + +class TestXPPen_ArtistPro16Gen2_28bd_095b(BaseTest.TestTablet): + hid_bpfs = [("XPPen__ArtistPro16Gen2.bpf.o", True)] + + def create_device(self): + dev = XPPen_ArtistPro16Gen2_28bd_095b( + "uhid test XPPen Artist Pro 16 Gen2 28bd 095b", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 09 3c 15 00 25 01 75 01 95 04 81 02 95 01 81 03 09 32 15 00 25 01 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 ff 34 26 ff 7f 81 02 09 31 46 20 21 26 ff 7f 81 02 b4 09 30 45 00 26 ff 3f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x095B), + ) + return dev + + +class TestXPPen_Artist24_28bd_093a(BaseTest.TestTablet): + hid_bpfs = [("XPPen__Artist24.bpf.o", True)] + + def create_device(self): + return XPPen_Artist24_28bd_093a( + "uhid test XPPen Artist 24 28bd 093a", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 15 00 25 01 75 01 95 03 81 02 95 02 81 03 09 32 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 f0 50 26 ff 7f 81 02 09 31 46 91 2d 26 ff 7f 81 02 b4 09 30 45 00 26 ff 1f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x093A), + ) + + +class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet): + hid_bpfs = [("Huion__Kamvas-Pro-19.bpf.o", True)] + + def create_device(self): + return Huion_Kamvas_Pro_19_256c_006b( + "uhid test HUION Huion Tablet_GT1902", + rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0", + input_info=(BusType.USB, 0x256C, 0x006B), + ) diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index 656c43c24044..c75ea4094870 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -198,13 +198,12 @@ int main(int argc, char **argv) struct msgque_data msgque; if (getuid() != 0) - return ksft_exit_skip( - "Please run the test as root - Exiting.\n"); + ksft_exit_skip("Please run the test as root - Exiting.\n"); msgque.key = ftok(argv[0], 822155650); if (msgque.key == -1) { printf("Can't make key: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666); @@ -243,13 +242,13 @@ int main(int argc, char **argv) printf("Failed to test queue: %d\n", err); goto err_out; } - return ksft_exit_pass(); + ksft_exit_pass(); err_destroy: if (msgctl(msgque.msq_id, IPC_RMID, NULL)) { printf("Failed to destroy queue: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } err_out: - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index 25110c7c0b3e..d7a8e321bb16 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(3); - fd2 = open(kpath, O_RDWR, 0644); + fd2 = open(kpath, O_RDWR); if (fd2 < 0) { perror("Can't open file"); ksft_exit_fail(); diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 14bbab0cce13..76c2a6945d3e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -16,10 +16,12 @@ * For each test, report any progress, debugging, etc with: * * ksft_print_msg(fmt, ...); + * ksft_perror(msg); * * and finally report the pass/fail/skip/xfail state of the test with one of: * * ksft_test_result(condition, fmt, ...); + * ksft_test_result_report(result, fmt, ...); * ksft_test_result_pass(fmt, ...); * ksft_test_result_fail(fmt, ...); * ksft_test_result_skip(fmt, ...); @@ -39,6 +41,7 @@ * the program is aborting before finishing all tests): * * ksft_exit_fail_msg(fmt, ...); + * ksft_exit_fail_perror(msg); * */ #ifndef __KSELFTEST_H @@ -305,13 +308,34 @@ void ksft_test_result_code(int exit_code, const char *test_name, printf("\n"); } -static inline __noreturn int ksft_exit_pass(void) +/** + * ksft_test_result() - Report test success based on truth of condition + * + * @condition: if true, report test success, otherwise failure. + */ +#define ksft_test_result_report(result, fmt, ...) do { \ + switch (result) { \ + case KSFT_PASS: \ + ksft_test_result_pass(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_FAIL: \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_XFAIL: \ + ksft_test_result_xfail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_SKIP: \ + ksft_test_result_skip(fmt, ##__VA_ARGS__); \ + break; \ + } } while (0) + +static inline __noreturn void ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline __noreturn int ksft_exit_fail(void) +static inline __noreturn void ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -338,7 +362,7 @@ static inline __noreturn int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -353,19 +377,32 @@ static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, exit(KSFT_FAIL); } -static inline __noreturn int ksft_exit_xfail(void) +static inline __noreturn void ksft_exit_fail_perror(const char *msg) +{ +#ifndef NOLIBC + ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno); +#else + /* + * nolibc doesn't provide strerror() and it seems + * inappropriate to add one, just print the errno. + */ + ksft_exit_fail_msg("%s: %d)\n", msg, errno); +#endif +} + +static inline __noreturn void ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline __noreturn int ksft_exit_xpass(void) +static inline __noreturn void ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index f2fbb914e058..79a125eb24c2 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -43,7 +43,7 @@ __ktap_test() { directive="$3" # optional local directive_str= - [[ ! -z "$directive" ]] && directive_str="# $directive" + [ ! -z "$directive" ] && directive_str="# $directive" echo $result $KTAP_TESTNO $description $directive_str @@ -99,7 +99,7 @@ ktap_exit_fail_msg() { ktap_finished() { ktap_print_totals - if [ $(("$KTAP_CNT_PASS" + "$KTAP_CNT_SKIP")) -eq "$KSFT_NUM_TESTS" ]; then + if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then exit "$KSFT_PASS" else exit "$KSFT_FAIL" diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index de59cc8f03c3..487e49fdf2a6 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -244,6 +244,7 @@ l4_test() l5_test() { tests=$(find $(dirname "$test") -type f -name "*.mk") + [[ -z "${tests// }" ]] && return test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \ cut -d "=" -f 2) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d98702b6955d..3c8f2965c285 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -66,6 +66,8 @@ #include <sys/wait.h> #include <unistd.h> #include <setjmp.h> +#include <syscall.h> +#include <linux/sched.h> #include "kselftest.h" @@ -80,6 +82,17 @@ # define TH_LOG_ENABLED 1 #endif +/* Wait for the child process to end but without sharing memory mapping. */ +static inline pid_t clone3_vfork(void) +{ + struct clone_args args = { + .flags = CLONE_VFORK, + .exit_signal = SIGCHLD, + }; + + return syscall(__NR_clone3, &args, sizeof(args)); +} + /** * TH_LOG() * @@ -281,6 +294,32 @@ * A bare "return;" statement may be used to return early. */ #define FIXTURE_TEARDOWN(fixture_name) \ + static const bool fixture_name##_teardown_parent; \ + __FIXTURE_TEARDOWN(fixture_name) + +/** + * FIXTURE_TEARDOWN_PARENT() + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_TEARDOWN_PARENT(fixture_name) { implementation } + * + * Same as FIXTURE_TEARDOWN() but run this code in a parent process. This + * enables the test process to drop its privileges without impacting the + * related FIXTURE_TEARDOWN_PARENT() (e.g. to remove files from a directory + * where write access was dropped). + * + * To make it possible for the parent process to use *self*, share (MAP_SHARED) + * the fixture data between all forked processes. + */ +#define FIXTURE_TEARDOWN_PARENT(fixture_name) \ + static const bool fixture_name##_teardown_parent = true; \ + __FIXTURE_TEARDOWN(fixture_name) + +#define __FIXTURE_TEARDOWN(fixture_name) \ void fixture_name##_teardown( \ struct __test_metadata __attribute__((unused)) *_metadata, \ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ @@ -325,7 +364,7 @@ * variant. */ #define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ - extern FIXTURE_VARIANT(fixture_name) \ + extern const FIXTURE_VARIANT(fixture_name) \ _##fixture_name##_##variant_name##_variant; \ static struct __fixture_variant_metadata \ _##fixture_name##_##variant_name##_object = \ @@ -337,7 +376,7 @@ __register_fixture_variant(&_##fixture_name##_fixture_object, \ &_##fixture_name##_##variant_name##_object); \ } \ - FIXTURE_VARIANT(fixture_name) \ + const FIXTURE_VARIANT(fixture_name) \ _##fixture_name##_##variant_name##_variant = /** @@ -355,10 +394,11 @@ * Very similar to TEST() except that *self* is the setup instance of fixture's * datatype exposed for use by the implementation. * - * The @test_name code is run in a separate process sharing the same memory - * (i.e. vfork), which means that the test process can update its privileges - * without impacting the related FIXTURE_TEARDOWN() (e.g. to remove files from - * a directory where write access was dropped). + * The _metadata object is shared (MAP_SHARED) with all the potential forked + * processes, which enables them to use EXCEPT_*() and ASSERT_*(). + * + * The *self* object is only shared with the potential forked processes if + * FIXTURE_TEARDOWN_PARENT() is used instead of FIXTURE_TEARDOWN(). */ #define TEST_F(fixture_name, test_name) \ __TEST_F_IMPL(fixture_name, test_name, -1, TEST_TIMEOUT_DEFAULT) @@ -379,53 +419,71 @@ struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ - FIXTURE_DATA(fixture_name) self; \ + FIXTURE_DATA(fixture_name) self_private, *self = NULL; \ pid_t child = 1; \ int status = 0; \ - bool jmp = false; \ - memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ + /* Makes sure there is only one teardown, even when child forks again. */ \ + bool *teardown = mmap(NULL, sizeof(*teardown), \ + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + *teardown = false; \ + if (sizeof(*self) > 0) { \ + if (fixture_name##_teardown_parent) { \ + self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \ + MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + } else { \ + memset(&self_private, 0, sizeof(self_private)); \ + self = &self_private; \ + } \ + } \ if (setjmp(_metadata->env) == 0) { \ - /* Use the same _metadata. */ \ - child = vfork(); \ + /* _metadata and potentially self are shared with all forks. */ \ + child = clone3_vfork(); \ if (child == 0) { \ - fixture_name##_setup(_metadata, &self, variant->data); \ + fixture_name##_setup(_metadata, self, variant->data); \ /* Let setup failure terminate early. */ \ if (_metadata->exit_code) \ _exit(0); \ _metadata->setup_completed = true; \ - fixture_name##_##test_name(_metadata, &self, variant->data); \ + fixture_name##_##test_name(_metadata, self, variant->data); \ } else if (child < 0 || child != waitpid(child, &status, 0)) { \ ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \ _metadata->exit_code = KSFT_FAIL; \ } \ } \ - else \ - jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ - fixture_name##_teardown(_metadata, &self, variant->data); \ + if (_metadata->setup_completed && !fixture_name##_teardown_parent && \ + __sync_bool_compare_and_swap(teardown, false, true)) \ + fixture_name##_teardown(_metadata, self, variant->data); \ _exit(0); \ } \ - if (_metadata->setup_completed && _metadata->teardown_parent) \ - fixture_name##_teardown(_metadata, &self, variant->data); \ - if (!WIFEXITED(status) && WIFSIGNALED(status)) \ + if (_metadata->setup_completed && fixture_name##_teardown_parent && \ + __sync_bool_compare_and_swap(teardown, false, true)) \ + fixture_name##_teardown(_metadata, self, variant->data); \ + munmap(teardown, sizeof(*teardown)); \ + if (self && fixture_name##_teardown_parent) \ + munmap(self, sizeof(*self)); \ + if (WIFEXITED(status)) { \ + if (WEXITSTATUS(status)) \ + _metadata->exit_code = WEXITSTATUS(status); \ + } else if (WIFSIGNALED(status)) { \ /* Forward signal to __wait_for_test(). */ \ kill(getpid(), WTERMSIG(status)); \ + } \ __test_check_assert(_metadata); \ } \ - static struct __test_metadata \ - _##fixture_name##_##test_name##_object = { \ - .name = #test_name, \ - .fn = &wrapper_##fixture_name##_##test_name, \ - .fixture = &_##fixture_name##_fixture_object, \ - .termsig = signal, \ - .timeout = tmout, \ - .teardown_parent = false, \ - }; \ + static struct __test_metadata *_##fixture_name##_##test_name##_object; \ static void __attribute__((constructor)) \ _register_##fixture_name##_##test_name(void) \ { \ - __register_test(&_##fixture_name##_##test_name##_object); \ + struct __test_metadata *object = mmap(NULL, sizeof(*object), \ + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + object->name = #test_name; \ + object->fn = &wrapper_##fixture_name##_##test_name; \ + object->fixture = &_##fixture_name##_fixture_object; \ + object->termsig = signal; \ + object->timeout = tmout; \ + _##fixture_name##_##test_name##_object = object; \ + __register_test(object); \ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ @@ -833,11 +891,12 @@ struct __test_xfail { { \ .fixture = &_##fixture_name##_fixture_object, \ .variant = &_##fixture_name##_##variant_name##_object, \ - .test = &_##fixture_name##_##test_name##_object, \ }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_##variant_name##_##test_name##_xfail(void) \ { \ + _##fixture_name##_##variant_name##_##test_name##_xfail.test = \ + _##fixture_name##_##test_name##_object; \ __register_xfail(&_##fixture_name##_##variant_name##_##test_name##_xfail); \ } @@ -880,7 +939,6 @@ struct __test_metadata { bool timed_out; /* did this test timeout instead of exiting? */ bool aborted; /* stopped test due to failed ASSERT */ bool setup_completed; /* did setup finish? */ - bool teardown_parent; /* run teardown in a parent process */ jmp_buf env; /* for exiting out of test early */ struct __test_results *results; struct __test_metadata *prev, *next; @@ -1164,6 +1222,9 @@ void __run_test(struct __fixture_metadata *f, /* reset test struct */ t->exit_code = KSFT_PASS; t->trigger = 0; + t->aborted = false; + t->setup_completed = false; + memset(t->env, 0, sizeof(t->env)); memset(t->results->reason, 0, sizeof(t->results->reason)); if (asprintf(&test_name, "%s%s%s.%s", f->name, @@ -1179,7 +1240,7 @@ void __run_test(struct __fixture_metadata *f, fflush(stdout); fflush(stderr); - t->pid = fork(); + t->pid = clone3_vfork(); if (t->pid < 0) { ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->exit_code = KSFT_FAIL; diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16afc..ce8ff8e8ce3a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -45,6 +45,7 @@ LIBKVM_x86_64 += lib/x86_64/vmx.c LIBKVM_aarch64 += lib/aarch64/gic.c LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c LIBKVM_aarch64 += lib/aarch64/handlers.S LIBKVM_aarch64 += lib/aarch64/processor.c LIBKVM_aarch64 += lib/aarch64/spinlock.c @@ -120,6 +121,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test TEST_GEN_PROGS_x86_64 += x86_64/amx_test @@ -157,6 +159,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer @@ -189,6 +192,8 @@ TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test @@ -225,8 +230,8 @@ LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ - -fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \ - -fno-builtin-strnlen \ + -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ -I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \ diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 4eaba83cdcf3..eeba1cc87ff8 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -5,18 +5,14 @@ * * Copyright (c) 2021, Google LLC. */ -#define _GNU_SOURCE - #include "arch_timer.h" #include "delay.h" #include "gic.h" #include "processor.h" #include "timer_test.h" +#include "ucall_common.h" #include "vgic.h" -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - enum guest_stage { GUEST_STAGE_VTIMER_CVAL = 1, GUEST_STAGE_VTIMER_TVAL, @@ -149,8 +145,7 @@ static void guest_code(void) local_irq_disable(); - gic_init(GIC_V3, test_args.nr_vcpus, - (void *)GICD_BASE_GPA, (void *)GICR_BASE_GPA); + gic_init(GIC_V3, test_args.nr_vcpus); timer_set_ctl(VIRTUAL, CTL_IMASK); timer_set_ctl(PHYSICAL, CTL_IMASK); @@ -209,7 +204,7 @@ struct kvm_vm *test_vm_create(void) vcpu_init_descriptor_tables(vcpus[i]); test_init_timer_irq(vm); - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); /* Make all the test's cmdline args visible to the guest */ diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 5972905275cf..d29b08198b42 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -7,7 +7,6 @@ * hugetlbfs with a hole). It checks that the expected handling method is * called (e.g., uffd faults with the right address and write/read flag). */ -#define _GNU_SOURCE #include <linux/bitmap.h> #include <fcntl.h> #include <test_util.h> @@ -375,14 +374,14 @@ static void setup_uffd(struct kvm_vm *vm, struct test_params *p, *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, pt_args.hva, pt_args.paging_size, - test->uffd_pt_handler); + 1, test->uffd_pt_handler); *data_uffd = NULL; if (test->uffd_data_handler) *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, data_args.hva, data_args.paging_size, - test->uffd_data_handler); + 1, test->uffd_data_handler); } static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 9b004905d1d3..61731a950def 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -11,9 +11,9 @@ * KVM_SYSTEM_EVENT_SUSPEND UAPI. */ -#define _GNU_SOURCE - +#include <linux/kernel.h> #include <linux/psci.h> +#include <asm/cputype.h> #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 16e2338686c1..a7de39fa2a0a 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -327,8 +327,8 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) return ftr; } -static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) { uint8_t shift = ftr_bits->shift; uint64_t mask = ftr_bits->mask; @@ -346,6 +346,8 @@ static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, vcpu_set_reg(vcpu, reg, val); vcpu_get_reg(vcpu, reg, &new_val); TEST_ASSERT_EQ(new_val, val); + + return new_val; } static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, @@ -374,7 +376,15 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, TEST_ASSERT_EQ(val, old_val); } -static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) { uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; struct reg_mask_range range = { @@ -398,9 +408,7 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) int idx; /* Get the index to masks array for the idreg */ - idx = KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(reg_id), sys_reg_Op1(reg_id), - sys_reg_CRn(reg_id), sys_reg_CRm(reg_id), - sys_reg_Op2(reg_id)); + idx = encoding_to_range_idx(reg_id); for (int j = 0; ftr_bits[j].type != FTR_END; j++) { /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ @@ -414,7 +422,9 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); test_reg_set_fail(vcpu, reg, &ftr_bits[j]); - test_reg_set_success(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); ksft_test_result_pass("%s\n", ftr_bits[j].name); } @@ -425,7 +435,6 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; struct ucall uc; - uint64_t val; while (!done) { vcpu_run(vcpu); @@ -436,8 +445,8 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) break; case UCALL_SYNC: /* Make sure the written values are seen by guest */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(uc.args[2]), &val); - TEST_ASSERT_EQ(val, uc.args[3]); + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); break; case UCALL_DONE: done = true; @@ -448,13 +457,85 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) } } +/* Politely lifted from arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. + */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. + */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + + ksft_test_result_pass("%s\n", __func__); +} + int main(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; bool aarch64_only; uint64_t val, el0; - int ftr_cnt; + int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); @@ -467,18 +548,22 @@ int main(void) ksft_print_header(); - ftr_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + - ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + - ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs); + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - + ARRAY_SIZE(test_regs) + 2; - ksft_set_plan(ftr_cnt); + ksft_set_plan(test_cnt); + + test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); - test_user_set_reg(vcpu, aarch64_only); test_guest_reg_read(vcpu); + test_reset_preserves_id_regs(vcpu); + kvm_vm_free(vm); ksft_finished(); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index eef816b80993..b3b5fb0ff0a9 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE #include <linux/kernel.h> #include <sys/syscall.h> #include <asm/kvm.h> @@ -84,6 +83,18 @@ static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, return v; } +static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_barebones(); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + + static void vm_gic_destroy(struct vm_gic *v) { close(v->gic_fd); @@ -357,6 +368,40 @@ static void test_vcpus_then_vgic(uint32_t gic_dev_type) vm_gic_destroy(&v); } +#define KVM_VGIC_V2_ATTR(offset, cpu) \ + (FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \ + FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu)) + +#define GIC_CPU_CTRL 0x00 + +static void test_v2_uaccess_cpuif_no_vcpus(void) +{ + struct vm_gic v; + u64 val = 0; + int ret; + + v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2); + subtest_dist_rdist(&v); + + ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0)); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + + vm_gic_destroy(&v); +} + static void test_v3_new_redist_regions(void) { struct kvm_vcpu *vcpus[NR_VCPUS]; @@ -675,6 +720,9 @@ void run_tests(uint32_t gic_dev_type) test_vcpus_then_vgic(gic_dev_type); test_vgic_then_vcpus(gic_dev_type); + if (VGIC_DEV_IS_V2(gic_dev_type)) + test_v2_uaccess_cpuif_no_vcpus(); + if (VGIC_DEV_IS_V3(gic_dev_type)) { test_v3_new_redist_regions(); test_v3_typer_accesses(); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 2e64b4856e38..a51dbd2a5f84 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -19,9 +19,6 @@ #include "gic_v3.h" #include "vgic.h" -#define GICD_BASE_GPA 0x08000000ULL -#define GICR_BASE_GPA 0x080A0000ULL - /* * Stores the user specified args; it's passed to the guest and to every test * function. @@ -49,9 +46,6 @@ struct test_args { #define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) #define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ -static void *dist = (void *)GICD_BASE_GPA; -static void *redist = (void *)GICR_BASE_GPA; - /* * The kvm_inject_* utilities are used by the guest to ask the host to inject * interrupts (e.g., using the KVM_IRQ_LINE ioctl). @@ -152,7 +146,7 @@ static void reset_stats(void) static uint64_t gic_read_ap1r0(void) { - uint64_t reg = read_sysreg_s(SYS_ICV_AP1R0_EL1); + uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); dsb(sy); return reg; @@ -160,7 +154,7 @@ static uint64_t gic_read_ap1r0(void) static void gic_write_ap1r0(uint64_t val) { - write_sysreg_s(val, SYS_ICV_AP1R0_EL1); + write_sysreg_s(val, SYS_ICC_AP1R0_EL1); isb(); } @@ -478,7 +472,7 @@ static void guest_code(struct test_args *args) bool level_sensitive = args->level_sensitive; struct kvm_inject_desc *f, *inject_fns; - gic_init(GIC_V3, 1, dist, redist); + gic_init(GIC_V3, 1); for (i = 0; i < nr_irqs; i++) gic_irq_enable(i); @@ -764,8 +758,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); vcpu_args_set(vcpu, 1, args_gva); - gic_fd = vgic_v3_setup(vm, 1, nr_irqs, - GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, 1, nr_irqs); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, diff --git a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c new file mode 100644 index 000000000000..fc4fe52fb6f8 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_lpi_stress - Stress test for KVM's ITS emulation + * + * Copyright (c) 2024 Google LLC + */ + +#include <linux/sizes.h> +#include <pthread.h> +#include <stdatomic.h> +#include <sys/sysinfo.h> + +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "gic_v3_its.h" +#include "processor.h" +#include "ucall.h" +#include "vgic.h" + +#define TEST_MEMSLOT_INDEX 1 + +#define GIC_LPI_OFFSET 8192 + +static size_t nr_iterations = 1000; +static vm_paddr_t gpa_base; + +static struct kvm_vm *vm; +static struct kvm_vcpu **vcpus; +static int gic_fd, its_fd; + +static struct test_data { + bool request_vcpus_stop; + u32 nr_cpus; + u32 nr_devices; + u32 nr_event_ids; + + vm_paddr_t device_table; + vm_paddr_t collection_table; + vm_paddr_t cmdq_base; + void *cmdq_base_va; + vm_paddr_t itt_tables; + + vm_paddr_t lpi_prop_table; + vm_paddr_t lpi_pend_tables; +} test_data = { + .nr_cpus = 1, + .nr_devices = 1, + .nr_event_ids = 16, +}; + +static void guest_irq_handler(struct ex_regs *regs) +{ + u32 intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(intid >= GIC_LPI_OFFSET); + gic_set_eoi(intid); +} + +static void guest_setup_its_mappings(void) +{ + u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; + u32 nr_events = test_data.nr_event_ids; + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + + for (coll_id = 0; coll_id < nr_cpus; coll_id++) + its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); + + /* Round-robin the LPIs to all of the vCPUs in the VM */ + coll_id = 0; + for (device_id = 0; device_id < nr_devices; device_id++) { + vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); + + its_send_mapd_cmd(test_data.cmdq_base_va, device_id, + itt_base, SZ_64K, true); + + for (event_id = 0; event_id < nr_events; event_id++) { + its_send_mapti_cmd(test_data.cmdq_base_va, device_id, + event_id, coll_id, intid++); + + coll_id = (coll_id + 1) % test_data.nr_cpus; + } + } +} + +static void guest_invalidate_all_rdists(void) +{ + int i; + + for (i = 0; i < test_data.nr_cpus; i++) + its_send_invall_cmd(test_data.cmdq_base_va, i); +} + +static void guest_setup_gic(void) +{ + static atomic_int nr_cpus_ready = 0; + u32 cpuid = guest_get_vcpuid(); + + gic_init(GIC_V3, test_data.nr_cpus); + gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, + test_data.lpi_pend_tables + (cpuid * SZ_64K)); + + atomic_fetch_add(&nr_cpus_ready, 1); + + if (cpuid > 0) + return; + + while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) + cpu_relax(); + + its_init(test_data.collection_table, SZ_64K, + test_data.device_table, SZ_64K, + test_data.cmdq_base, SZ_64K); + + guest_setup_its_mappings(); + guest_invalidate_all_rdists(); +} + +static void guest_code(size_t nr_lpis) +{ + guest_setup_gic(); + + GUEST_SYNC(0); + + /* + * Don't use WFI here to avoid blocking the vCPU thread indefinitely and + * never getting the stop signal. + */ + while (!READ_ONCE(test_data.request_vcpus_stop)) + cpu_relax(); + + GUEST_DONE(); +} + +static void setup_memslot(void) +{ + size_t pages; + size_t sz; + + /* + * For the ITS: + * - A single level device table + * - A single level collection table + * - The command queue + * - An ITT for each device + */ + sz = (3 + test_data.nr_devices) * SZ_64K; + + /* + * For the redistributors: + * - A shared LPI configuration table + * - An LPI pending table for each vCPU + */ + sz += (1 + test_data.nr_cpus) * SZ_64K; + + pages = sz / vm->page_size; + gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, + TEST_MEMSLOT_INDEX, pages, 0); +} + +#define LPI_PROP_DEFAULT_PRIO 0xa0 + +static void configure_lpis(void) +{ + size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; + u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); + size_t i; + + for (i = 0; i < nr_lpis; i++) { + tbl[i] = LPI_PROP_DEFAULT_PRIO | + LPI_PROP_GROUP1 | + LPI_PROP_ENABLED; + } +} + +static void setup_test_data(void) +{ + size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + vm_paddr_t cmdq_base; + + test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, + TEST_MEMSLOT_INDEX); + virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); + test_data.cmdq_base = cmdq_base; + test_data.cmdq_base_va = (void *)cmdq_base; + + test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, + gpa_base, TEST_MEMSLOT_INDEX); + + test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, TEST_MEMSLOT_INDEX); + configure_lpis(); + + test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, + gpa_base, TEST_MEMSLOT_INDEX); + + sync_global_to_guest(vm, test_data); +} + +static void setup_gic(void) +{ + gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); + + its_fd = vgic_its_setup(vm); +} + +static void signal_lpi(u32 device_id, u32 event_id) +{ + vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; + + struct kvm_msi msi = { + .address_lo = db_addr, + .address_hi = db_addr >> 32, + .data = event_id, + .devid = device_id, + .flags = KVM_MSI_VALID_DEVID, + }; + + /* + * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, + * which for arm64 implies having a valid translation in the ITS. + */ + TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, + "KVM_SIGNAL_MSI ioctl failed"); +} + +static pthread_barrier_t test_setup_barrier; + +static void *lpi_worker_thread(void *data) +{ + u32 device_id = (size_t)data; + u32 event_id; + size_t i; + + pthread_barrier_wait(&test_setup_barrier); + + for (i = 0; i < nr_iterations; i++) + for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) + signal_lpi(device_id, event_id); + + return NULL; +} + +static void *vcpu_worker_thread(void *data) +{ + struct kvm_vcpu *vcpu = data; + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pthread_barrier_wait(&test_setup_barrier); + continue; + case UCALL_DONE: + return NULL; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall: %lu", uc.cmd); + } + } + + return NULL; +} + +static void report_stats(struct timespec delta) +{ + double nr_lpis; + double time; + + nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; + + time = delta.tv_sec; + time += ((double)delta.tv_nsec) / NSEC_PER_SEC; + + pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); +} + +static void run_test(void) +{ + u32 nr_devices = test_data.nr_devices; + u32 nr_vcpus = test_data.nr_cpus; + pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); + pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); + struct timespec start, delta; + size_t i; + + TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); + + pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); + + for (i = 0; i < nr_devices; i++) + pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); + + pthread_barrier_wait(&test_setup_barrier); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (i = 0; i < nr_devices; i++) + pthread_join(lpi_threads[i], NULL); + + delta = timespec_elapsed(start); + write_guest_global(vm, test_data.request_vcpus_stop, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + report_stats(delta); +} + +static void setup_vm(void) +{ + int i; + + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + for (i = 0; i < test_data.nr_cpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + setup_memslot(); + + setup_gic(); + + setup_test_data(); +} + +static void destroy_vm(void) +{ + close(its_fd); + close(gic_fd); + kvm_vm_free(vm); + free(vcpus); +} + +static void pr_usage(const char *name) +{ + pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); + pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); + pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); + pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); + pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); +} + +int main(int argc, char **argv) +{ + u32 nr_threads; + int c; + + while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { + switch (c) { + case 'v': + test_data.nr_cpus = atoi(optarg); + break; + case 'd': + test_data.nr_devices = atoi(optarg); + break; + case 'e': + test_data.nr_event_ids = atoi(optarg); + break; + case 'i': + nr_iterations = strtoul(optarg, NULL, 0); + break; + case 'h': + default: + pr_usage(argv[0]); + return 1; + } + } + + nr_threads = test_data.nr_cpus + test_data.nr_devices; + if (nr_threads > get_nprocs()) + pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", + nr_threads, get_nprocs()); + + setup_vm(); + + run_test(); + + destroy_vm(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index f2fb0e3f14bc..d31b9f64ba14 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -404,9 +404,6 @@ static void guest_code(uint64_t expected_pmcr_n) GUEST_DONE(); } -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - /* Create a VM that has one vCPU with PMUv3 configured. */ static void create_vpmu_vm(void *guest_code) { @@ -438,8 +435,7 @@ static void create_vpmu_vm(void *guest_code) init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); vcpu_init_descriptor_tables(vpmu_vm.vcpu); - vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64, - GICD_BASE_GPA, GICR_BASE_GPA); + vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, "Failed to create vgic-v3, skipping"); diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c index ae1f1a6d8312..acb2cb596332 100644 --- a/tools/testing/selftests/kvm/arch_timer.c +++ b/tools/testing/selftests/kvm/arch_timer.c @@ -19,9 +19,6 @@ * * Copyright (c) 2021, Google LLC. */ - -#define _GNU_SOURCE - #include <stdlib.h> #include <pthread.h> #include <linux/sizes.h> @@ -29,6 +26,7 @@ #include <sys/sysinfo.h> #include "timer_test.h" +#include "ucall_common.h" struct test_args test_args = { .nr_vcpus = NR_VCPUS_DEF, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index bf3609f71854..0202b78f8680 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -6,14 +6,10 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019, Google, Inc. */ - -#define _GNU_SOURCE /* for pipe2 */ - #include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <time.h> -#include <poll.h> #include <pthread.h> #include <linux/userfaultfd.h> #include <sys/syscall.h> @@ -22,6 +18,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #include "userfaultfd_util.h" #ifdef __NR_userfaultfd @@ -77,8 +74,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, copy.mode = 0; r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -89,8 +98,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, cont.range.len = demand_paging_size; r = ioctl(uffd, UFFDIO_CONTINUE, &cont); - if (r == -1) { - pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx, thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -110,7 +131,9 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, struct test_params { int uffd_mode; + bool single_uffd; useconds_t uffd_delay; + int readers_per_uffd; enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -131,10 +154,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct memstress_vcpu_args *vcpu_args; struct test_params *p = arg; struct uffd_desc **uffd_descs = NULL; + uint64_t uffd_region_size; struct timespec start; struct timespec ts_diff; + double vcpu_paging_rate; struct kvm_vm *vm; - int i; + int i, num_uffds = 0; vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -147,7 +172,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { - for (i = 0; i < nr_vcpus; i++) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + for (i = 0; i < num_uffds; i++) { vcpu_args = &memstress_args.vcpu_args[i]; prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), vcpu_args->pages * memstress_args.guest_page_size); @@ -155,9 +181,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) } if (p->uffd_mode) { - uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + num_uffds = p->single_uffd ? 1 : nr_vcpus; + uffd_region_size = nr_vcpus * guest_percpu_mem_size / num_uffds; + + uffd_descs = malloc(num_uffds * sizeof(struct uffd_desc *)); TEST_ASSERT(uffd_descs, "Memory allocation failed"); - for (i = 0; i < nr_vcpus; i++) { + for (i = 0; i < num_uffds; i++) { + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; vcpu_args = &memstress_args.vcpu_args[i]; @@ -170,7 +200,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) */ uffd_descs[i] = uffd_setup_demand_paging( p->uffd_mode, p->uffd_delay, vcpu_hva, - vcpu_args->pages * memstress_args.guest_page_size, + uffd_region_size, + p->readers_per_uffd, &handle_uffd_page_request); } } @@ -187,15 +218,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (p->uffd_mode) { /* Tell the user fault fd handler threads to quit */ - for (i = 0; i < nr_vcpus; i++) + for (i = 0; i < num_uffds; i++) uffd_stop_demand_paging(uffd_descs[i]); } - pr_info("Total guest execution time: %ld.%.9lds\n", + pr_info("Total guest execution time:\t%ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - pr_info("Overall demand paging rate: %f pgs/sec\n", - memstress_args.vcpu_args[0].pages * nr_vcpus / - ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); + + vcpu_paging_rate = memstress_args.vcpu_args[0].pages / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC); + pr_info("Per-vcpu demand paging rate:\t%f pgs/sec/vcpu\n", + vcpu_paging_rate); + pr_info("Overall demand paging rate:\t%f pgs/sec\n", + vcpu_paging_rate * nr_vcpus); memstress_destroy_vm(vm); @@ -207,15 +242,20 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-a]\n" + " [-d uffd_delay_usec] [-r readers_per_uffd] [-b memory]\n" + " [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); kvm_print_vcpu_pinning_help(); + printf(" -a: Use a single userfaultfd for all of guest memory, instead of\n" + " creating one for each region paged by a unique vCPU\n" + " Set implicitly with -o, and no effect without -u.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); + printf(" -r: Set the number of reader threads per uffd.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); @@ -234,12 +274,14 @@ int main(int argc, char *argv[]) struct test_params p = { .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, + .readers_per_uffd = 1, + .single_uffd = false, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) { + while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -251,6 +293,9 @@ int main(int argc, char *argv[]) p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; + case 'a': + p.single_uffd = true; + break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); @@ -271,6 +316,13 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + p.single_uffd = true; + break; + case 'r': + p.readers_per_uffd = atoi(optarg); + TEST_ASSERT(p.readers_per_uffd >= 1, + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 504f6fe980e8..9f24303acb8c 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -18,13 +18,11 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #ifdef __aarch64__ #include "aarch64/vgic.h" -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - static int gic_fd; static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) @@ -33,7 +31,7 @@ static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) * The test can still run even if hardware does not support GICv3, as it * is only an optimization to reduce guest exits. */ - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); } static void arch_cleanup_vm(struct kvm_vm *vm) @@ -132,7 +130,6 @@ struct test_params { enum vm_mem_backing_src_type backing_src; int slots; uint32_t write_percent; - uint32_t random_seed; bool random_access; }; @@ -156,8 +153,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) p->slots, p->backing_src, p->partition_vcpu_memory_access); - pr_info("Random seed: %u\n", p->random_seed); - memstress_set_random_seed(vm, p->random_seed); memstress_set_write_percent(vm, p->write_percent); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; @@ -346,11 +341,13 @@ int main(int argc, char *argv[]) .partition_vcpu_memory_access = true, .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, - .random_seed = 1, .write_percent = 100, }; int opt; + /* Override the seed to be deterministic by default. */ + guest_random_seed = 1; + dirty_log_manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | @@ -395,7 +392,7 @@ int main(int argc, char *argv[]) p.phys_offset = strtoull(optarg, NULL, 0); break; case 'r': - p.random_seed = atoi_positive("Random seed", optarg); + guest_random_seed = atoi_positive("Random seed", optarg); break; case 's': p.backing_src = parse_backing_src_type(optarg); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index eaad5b20854c..aacf80f57439 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> #include <pthread.h> @@ -23,6 +20,7 @@ #include "test_util.h" #include "guest_modes.h" #include "processor.h" +#include "ucall_common.h" #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 @@ -76,7 +74,6 @@ static uint64_t host_page_size; static uint64_t guest_page_size; static uint64_t guest_num_pages; -static uint64_t random_array[TEST_PAGES_PER_LOOP]; static uint64_t iteration; /* @@ -109,19 +106,19 @@ static void guest_code(void) */ for (i = 0; i < guest_num_pages; i++) { addr = guest_test_virt_mem + i * guest_page_size; - *(uint64_t *)addr = READ_ONCE(iteration); + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } while (true) { for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { addr = guest_test_virt_mem; - addr += (READ_ONCE(random_array[i]) % guest_num_pages) + addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; addr = align_down(addr, host_page_size); - *(uint64_t *)addr = READ_ONCE(iteration); + + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } - /* Tell the host that we need more random numbers */ GUEST_SYNC(1); } } @@ -508,20 +505,10 @@ static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) mode->after_vcpu_run(vcpu, ret, err); } -static void generate_random_array(uint64_t *guest_array, uint64_t size) -{ - uint64_t i; - - for (i = 0; i < size; i++) - guest_array[i] = random(); -} - static void *vcpu_worker(void *data) { int ret; struct kvm_vcpu *vcpu = data; - struct kvm_vm *vm = vcpu->vm; - uint64_t *guest_array; uint64_t pages_count = 0; struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) + sizeof(sigset_t)); @@ -540,11 +527,8 @@ static void *vcpu_worker(void *data) sigemptyset(sigset); sigaddset(sigset, SIG_IPI); - guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); - while (!READ_ONCE(host_quit)) { /* Clear any existing kick signals */ - generate_random_array(guest_array, TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ ret = __vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 92eae206baa6..ba0c8e996035 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -4,8 +4,6 @@ * * Author: Chao Peng <chao.p.peng@linux.intel.com> */ - -#define _GNU_SOURCE #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -19,8 +17,8 @@ #include <sys/types.h> #include <sys/stat.h> +#include "kvm_util.h" #include "test_util.h" -#include "kvm_util_base.h" static void test_file_read_write(int fd) { diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 3502caa3590c..8092c2d0f5d6 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -13,6 +13,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" struct guest_vals { uint64_t a; diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index decc521fc760..bce73bcb973c 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -4,9 +4,6 @@ * kvm_arch_hardware_disable is called and it attempts to unregister the user * return notifiers. */ - -#define _GNU_SOURCE - #include <fcntl.h> #include <pthread.h> #include <semaphore.h> diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h index b217ea17cac5..baeb3c859389 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -6,11 +6,26 @@ #ifndef SELFTEST_KVM_GIC_H #define SELFTEST_KVM_GIC_H +#include <asm/kvm.h> + enum gic_type { GIC_V3, GIC_TYPE_MAX, }; +/* + * Note that the redistributor frames are at the end, as the range scales + * with the number of vCPUs in the VM. + */ +#define GITS_BASE_GPA 0x8000000ULL +#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) +#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) + +/* The GIC is identity-mapped into the guest at the time of setup. */ +#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) +#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) +#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) + #define MIN_SGI 0 #define MIN_PPI 16 #define MIN_SPI 32 @@ -21,8 +36,7 @@ enum gic_type { #define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) #define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base); +void gic_init(enum gic_type type, unsigned int nr_cpus); void gic_irq_enable(unsigned int intid); void gic_irq_disable(unsigned int intid); unsigned int gic_get_and_ack_irq(void); @@ -44,4 +58,7 @@ void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table); + #endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h index ba0886e8a2bb..a76615fa39a1 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h @@ -1,82 +1,604 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * ARM Generic Interrupt Controller (GIC) v3 specific defines + * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier <marc.zyngier@arm.com> */ - -#ifndef SELFTEST_KVM_GICV3_H -#define SELFTEST_KVM_GICV3_H - -#include <asm/sysreg.h> +#ifndef __SELFTESTS_GIC_V3_H +#define __SELFTESTS_GIC_V3_H /* - * Distributor registers + * Distributor registers. We assume we're running non-secure, with ARE + * being set. Secure-only and non-ARE registers are not described. */ #define GICD_CTLR 0x0000 #define GICD_TYPER 0x0004 +#define GICD_IIDR 0x0008 +#define GICD_TYPER2 0x000C +#define GICD_STATUSR 0x0010 +#define GICD_SETSPI_NSR 0x0040 +#define GICD_CLRSPI_NSR 0x0048 +#define GICD_SETSPI_SR 0x0050 +#define GICD_CLRSPI_SR 0x0058 #define GICD_IGROUPR 0x0080 #define GICD_ISENABLER 0x0100 #define GICD_ICENABLER 0x0180 #define GICD_ISPENDR 0x0200 #define GICD_ICPENDR 0x0280 -#define GICD_ICACTIVER 0x0380 #define GICD_ISACTIVER 0x0300 +#define GICD_ICACTIVER 0x0380 #define GICD_IPRIORITYR 0x0400 #define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 +#define GICD_IGROUPRnE 0x1000 +#define GICD_ISENABLERnE 0x1200 +#define GICD_ICENABLERnE 0x1400 +#define GICD_ISPENDRnE 0x1600 +#define GICD_ICPENDRnE 0x1800 +#define GICD_ISACTIVERnE 0x1A00 +#define GICD_ICACTIVERnE 0x1C00 +#define GICD_IPRIORITYRnE 0x2000 +#define GICD_ICFGRnE 0x3000 +#define GICD_IROUTER 0x6000 +#define GICD_IROUTERnE 0x8000 +#define GICD_IDREGS 0xFFD0 +#define GICD_PIDR2 0xFFE8 + +#define ESPI_BASE_INTID 4096 /* - * The assumption is that the guest runs in a non-secure mode. - * The following bits of GICD_CTLR are defined accordingly. + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). */ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + #define GICD_CTLR_RWP (1U << 31) #define GICD_CTLR_nASSGIreq (1U << 8) +#define GICD_CTLR_DS (1U << 6) #define GICD_CTLR_ARE_NS (1U << 4) #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_RSS (1U << 26) +#define GICD_TYPER_LPIS (1U << 17) +#define GICD_TYPER_MBIS (1U << 16) +#define GICD_TYPER_ESPI (1U << 8) + +#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) +#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) #define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) -#define GICD_INT_DEF_PRI_X4 0xa0a0a0a0 +#define GICD_TYPER_ESPIS(typer) \ + (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) + +#define GICD_TYPER2_nASSGIcap (1U << 8) +#define GICD_TYPER2_VIL (1U << 7) +#define GICD_TYPER2_VID GENMASK(4, 0) + +#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) +#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) + +#define GIC_PIDR2_ARCH_MASK 0xf0 +#define GIC_PIDR2_ARCH_GICv3 0x30 +#define GIC_PIDR2_ARCH_GICv4 0x40 + +#define GIC_V3_DIST_SIZE 0x10000 + +#define GIC_PAGE_SIZE_4K 0ULL +#define GIC_PAGE_SIZE_16K 1ULL +#define GIC_PAGE_SIZE_64K 2ULL +#define GIC_PAGE_SIZE_MASK 3ULL /* - * Redistributor registers + * Re-Distributor registers, offsets from RD_base */ -#define GICR_CTLR 0x000 -#define GICR_WAKER 0x014 +#define GICR_CTLR GICD_CTLR +#define GICR_IIDR 0x0004 +#define GICR_TYPER 0x0008 +#define GICR_STATUSR GICD_STATUSR +#define GICR_WAKER 0x0014 +#define GICR_SETLPIR 0x0040 +#define GICR_CLRLPIR 0x0048 +#define GICR_PROPBASER 0x0070 +#define GICR_PENDBASER 0x0078 +#define GICR_INVLPIR 0x00A0 +#define GICR_INVALLR 0x00B0 +#define GICR_SYNCR 0x00C0 +#define GICR_IDREGS GICD_IDREGS +#define GICR_PIDR2 GICD_PIDR2 + +#define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) +#define GICR_CTLR_RWP (1UL << 3) -#define GICR_CTLR_RWP (1U << 3) +#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) + +#define EPPI_BASE_INTID 1056 + +#define GICR_TYPER_NR_PPIS(r) \ + ({ \ + unsigned int __ppinum = ((r) >> 27) & 0x1f; \ + unsigned int __nr_ppis = 16; \ + if (__ppinum == 1 || __ppinum == 2) \ + __nr_ppis += __ppinum * 32; \ + \ + __nr_ppis; \ + }) #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) + /* - * Redistributor registers, offsets from SGI base + * Re-Distributor registers, offsets from SGI_base */ #define GICR_IGROUPR0 GICD_IGROUPR #define GICR_ISENABLER0 GICD_ISENABLER #define GICR_ICENABLER0 GICD_ICENABLER #define GICR_ISPENDR0 GICD_ISPENDR +#define GICR_ICPENDR0 GICD_ICPENDR #define GICR_ISACTIVER0 GICD_ISACTIVER #define GICR_ICACTIVER0 GICD_ICACTIVER -#define GICR_ICENABLER GICD_ICENABLER -#define GICR_ICACTIVER GICD_ICACTIVER #define GICR_IPRIORITYR0 GICD_IPRIORITYR +#define GICR_ICFGR0 GICD_ICFGR +#define GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR + +#define GICR_TYPER_PLPIS (1U << 0) +#define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DIRTY (1U << 2) +#define GICR_TYPER_DirectLPIS (1U << 3) +#define GICR_TYPER_LAST (1U << 4) +#define GICR_TYPER_RVPEID (1U << 7) +#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) +#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) + +#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) +#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) +#define GICR_INVLPIR_V GENMASK_ULL(63, 63) + +#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID +#define GICR_INVALLR_V GICR_INVLPIR_V + +#define GIC_V3_REDIST_SIZE 0x20000 + +#define LPI_PROP_GROUP1 (1 << 1) +#define LPI_PROP_ENABLED (1 << 0) + +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +/* + * GICv4.1 VPROPBASER reinvention. A subtle mix between the old + * VPROPBASER and ITS_BASER. Just not quite any of the two. + */ +#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) +#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) +#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) +#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) +#define GICR_VPROPBASER_4_1_Z (1ULL << 52) +#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) +#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + +/* + * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, + * also use the above Valid, PendingLast and Dirty. + */ +#define GICR_VPENDBASER_4_1_DB (1ULL << 62) +#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) +#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) +#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) + +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + +/* + * ITS registers, offsets from ITS_base + */ +#define GITS_CTLR 0x0000 +#define GITS_IIDR 0x0004 +#define GITS_TYPER 0x0008 +#define GITS_MPIDR 0x0018 +#define GITS_CBASER 0x0080 +#define GITS_CWRITER 0x0088 +#define GITS_CREADR 0x0090 +#define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 +#define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc + +#define GITS_TRANSLATER 0x10040 + +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) +#define GITS_CTLR_QUIESCENT (1U << 31) + +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS_SHIFT 8 +#define GITS_TYPER_DEVBITS_SHIFT 13 +#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) +#define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HCC_SHIFT 24 +#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) +#define GITS_TYPER_VMOVP (1ULL << 37) +#define GITS_TYPER_VMAPP (1ULL << 40) +#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) -/* CPU interface registers */ -#define SYS_ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0) -#define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) -#define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) -#define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) -#define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) -#define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) -#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 -#define SYS_ICV_AP1R0_EL1 sys_reg(3, 0, 12, 9, 0) +#define GITS_CBASER_VALID (1ULL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK -#define ICC_PMR_DEF_PRIO 0xf0 +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + +#define GITS_BASER_NR_REGS 8 + +#define GITS_BASER_VALID (1ULL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + +#define GITS_BASER_SHAREABILITY_SHIFT (10) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +#define GITS_BASER_PAGE_SIZE_SHIFT (8) +#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) +#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) +#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) +#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) +#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) +#define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) + +#define GITS_BASER_TYPE_NONE 0 +#define GITS_BASER_TYPE_DEVICE 1 +#define GITS_BASER_TYPE_VCPU 2 +#define GITS_BASER_TYPE_RESERVED3 3 +#define GITS_BASER_TYPE_COLLECTION 4 +#define GITS_BASER_TYPE_RESERVED5 5 +#define GITS_BASER_TYPE_RESERVED6 6 +#define GITS_BASER_TYPE_RESERVED7 7 + +#define GITS_LVL1_ENTRY_SIZE (8UL) + +/* + * ITS commands + */ +#define GITS_CMD_MAPD 0x08 +#define GITS_CMD_MAPC 0x09 +#define GITS_CMD_MAPTI 0x0a +#define GITS_CMD_MAPI 0x0b +#define GITS_CMD_MOVI 0x01 +#define GITS_CMD_DISCARD 0x0f +#define GITS_CMD_INV 0x0c +#define GITS_CMD_MOVALL 0x0e +#define GITS_CMD_INVALL 0x0d +#define GITS_CMD_INT 0x03 +#define GITS_CMD_CLEAR 0x04 +#define GITS_CMD_SYNC 0x05 + +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) +#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) + +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + +/* + * CPU interface registers + */ +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) +#define ICC_CTLR_EL1_ExtRange (0x1 << 19) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) #define ICC_SRE_EL1_SRE (1U << 0) -#define ICC_IGRPEN1_EL1_ENABLE (1U << 0) +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +#define ICC_IAR1_EL1_SPURIOUS 0x3ff + +#define ICC_SRE_EL2_SRE (1 << 0) +#define ICC_SRE_EL2_ENABLE (1 << 3) -#define GICV3_MAX_CPUS 512 +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) -#endif /* SELFTEST_KVM_GICV3_H */ +#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h new file mode 100644 index 000000000000..3722ed9c8f96 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTESTS_GIC_V3_ITS_H__ +#define __SELFTESTS_GIC_V3_ITS_H__ + +#include <linux/sizes.h> + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size); + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid); +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid); +void its_send_invall_cmd(void *cmdq_base, u32 collection_id); + +#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9e518b562827..9b20a355d81a 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,6 +8,8 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include "ucall_common.h" + #include <linux/stringify.h> #include <linux/types.h> #include <asm/sysreg.h> @@ -58,8 +60,6 @@ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) -#define MPIDR_HWID_BITMASK (0xff00fffffful) - void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); @@ -177,11 +177,28 @@ static __always_inline u32 __raw_readl(const volatile void *addr) return val; } +static __always_inline void __raw_writeq(u64 val, volatile void *addr) +{ + asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u64 __raw_readq(const volatile void *addr) +{ + u64 val; + asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + #define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) #define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) +#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) +#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) #define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) #define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) +#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) +#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) + static inline void local_irq_enable(void) { diff --git a/tools/testing/selftests/kvm/include/aarch64/ucall.h b/tools/testing/selftests/kvm/include/aarch64/ucall.h index 4b68f37efd36..4ec801f37f00 100644 --- a/tools/testing/selftests/kvm/include/aarch64/ucall.h +++ b/tools/testing/selftests/kvm/include/aarch64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_MMIO diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h index 0ac6f05c63f9..c481d0c00a5d 100644 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -16,8 +16,7 @@ ((uint64_t)(flags) << 12) | \ index) -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa); +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); #define VGIC_MAX_RESERVED 1023 @@ -33,4 +32,6 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); #define KVM_IRQCHIP_NUM_PINS (1020 - 32) +int vgic_its_setup(struct kvm_vm *vm); + #endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c9286811a4cb..63c2aaae51f3 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1,13 +1,1116 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * tools/testing/selftests/kvm/include/kvm_util.h - * * Copyright (C) 2018, Google LLC. */ #ifndef SELFTEST_KVM_UTIL_H #define SELFTEST_KVM_UTIL_H -#include "kvm_util_base.h" -#include "ucall_common.h" +#include "test_util.h" + +#include <linux/compiler.h> +#include "linux/hashtable.h" +#include "linux/list.h" +#include <linux/kernel.h> +#include <linux/kvm.h> +#include "linux/rbtree.h" +#include <linux/types.h> + +#include <asm/atomic.h> +#include <asm/kvm.h> + +#include <sys/ioctl.h> + +#include "kvm_util_arch.h" +#include "kvm_util_types.h" +#include "sparsebit.h" + +#define KVM_DEV_PATH "/dev/kvm" +#define KVM_MAX_VCPUS 512 + +#define NSEC_PER_SEC 1000000000L + +struct userspace_mem_region { + struct kvm_userspace_memory_region2 region; + struct sparsebit *unused_phy_pages; + struct sparsebit *protected_phy_pages; + int fd; + off_t offset; + enum vm_mem_backing_src_type backing_src_type; + void *host_mem; + void *host_alias; + void *mmap_start; + void *mmap_alias; + size_t mmap_size; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; +}; + +struct kvm_vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_vm *vm; + struct kvm_run *run; +#ifdef __x86_64__ + struct kvm_cpuid2 *cpuid; +#endif + struct kvm_dirty_gfn *dirty_gfns; + uint32_t fetch_index; + uint32_t dirty_gfns_count; +}; + +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + +enum kvm_mem_region_type { + MEM_REGION_CODE, + MEM_REGION_DATA, + MEM_REGION_PT, + MEM_REGION_TEST_DATA, + NR_MEM_REGIONS, +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct userspace_mem_regions regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t ucall_mmio_addr; + vm_paddr_t pgd; + vm_vaddr_t handlers; + uint32_t dirty_ring_size; + uint64_t gpa_tag_mask; + + struct kvm_vm_arch arch; + + /* Cache of information for binary stats interface */ + int stats_fd; + struct kvm_stats_header stats_header; + struct kvm_stats_desc *stats_desc; + + /* + * KVM region slots. These are the default memslots used by page + * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] + * memslot. + */ + uint32_t memslots[NR_MEM_REGIONS]; +}; + +struct vcpu_reg_sublist { + const char *name; + long capability; + int feature; + int feature_type; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; + __u64 *skips_set; + __u64 skips_set_n; +}; + +struct vcpu_reg_list { + char *name; + struct vcpu_reg_sublist sublists[]; +}; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) + +#define kvm_for_each_vcpu(vm, i, vcpu) \ + for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ + if (!((vcpu) = vm->vcpus[i])) \ + continue; \ + else + +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + +static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, + enum kvm_mem_region_type type) +{ + assert(type < NR_MEM_REGIONS); + return memslot2region(vm, vm->memslots[type]); +} + +/* Minimum allocated guest virtual and physical addresses */ +#define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 +#define DEFAULT_STACK_PGS 5 + +enum vm_guest_mode { + VM_MODE_P52V48_4K, + VM_MODE_P52V48_16K, + VM_MODE_P52V48_64K, + VM_MODE_P48V48_4K, + VM_MODE_P48V48_16K, + VM_MODE_P48V48_64K, + VM_MODE_P40V48_4K, + VM_MODE_P40V48_16K, + VM_MODE_P40V48_64K, + VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, + VM_MODE_P36V48_4K, + VM_MODE_P36V48_16K, + VM_MODE_P36V48_64K, + VM_MODE_P36V47_16K, + NUM_VM_MODES, +}; + +struct vm_shape { + uint32_t type; + uint8_t mode; + uint8_t pad0; + uint16_t pad1; +}; + +kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); + +#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + +#if defined(__aarch64__) + +extern enum vm_guest_mode vm_mode_default; + +#define VM_MODE_DEFAULT vm_mode_default +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__x86_64__) + +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__s390x__) + +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 16) + +#elif defined(__riscv) + +#if __riscv_xlen == 32 +#error "RISC-V 32-bit kvm selftests not supported" +#endif + +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#endif + +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) + +#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) +#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) + +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; +extern const struct vm_guest_mode_params vm_guest_mode_params[]; + +int open_path_or_exit(const char *path, int flags); +int open_kvm_dev_path_or_exit(void); + +bool get_kvm_param_bool(const char *param); +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + +int get_kvm_param_integer(const char *param); +int get_kvm_intel_param_integer(const char *param); +int get_kvm_amd_param_integer(const char *param); + +unsigned int kvm_check_cap(long cap); + +static inline bool kvm_has_cap(long cap) +{ + return kvm_check_cap(cap); +} + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* + * Use the "inner", double-underscore macro when reporting errors from within + * other macros so that the name of ioctl() and not its literal numeric value + * is printed on error. The "outer" macro is strongly preferred when reporting + * errors "directly", i.e. without an additional layer of macros, as it reduces + * the probability of passing in the wrong string. + */ +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +#define kvm_do_ioctl(fd, cmd, arg) \ +({ \ + kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ + ioctl(fd, cmd, arg); \ +}) + +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + kvm_do_ioctl(kvm_fd, cmd, arg) + +#define kvm_ioctl(kvm_fd, cmd, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ +}) + +static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } + +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert_is_vm(vm); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ +}) + +/* + * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if + * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, + * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before + * selftests existed and (b) should never outright fail, i.e. is supposed to + * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the + * VM and its vCPUs, including KVM_CHECK_EXTENSION. + */ +#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ +do { \ + int __errno = errno; \ + \ + static_assert_is_vm(vm); \ + \ + if (cond) \ + break; \ + \ + if (errno == EIO && \ + __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ + TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ + TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ + } \ + errno = __errno; \ + TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ +} while (0) + +#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ + __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) + +#define vm_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert_is_vcpu(vcpu); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ +}) + +#define vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ +}) + +/* + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +static inline int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); + + TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); + return ret; +} + +static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} +static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. + */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); +const char *vm_guest_mode_string(uint32_t i); + +void kvm_vm_free(struct kvm_vm *vmp); +void kvm_vm_restart(struct kvm_vm *vmp); +void kvm_vm_release(struct kvm_vm *vmp); +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, + size_t len); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); +int kvm_memfd_alloc(size_t size, bool hugepages); + +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + + vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); +} + +static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, + .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; + + vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); +} + +static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) +{ + return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); +} + +static inline int vm_get_stats_fd(struct kvm_vm *vm) +{ + int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); + return fd; +} + +static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) +{ + ssize_t ret; + + ret = pread(stats_fd, header, sizeof(*header), 0); + TEST_ASSERT(ret == sizeof(*header), + "Failed to read '%lu' header bytes, ret = '%ld'", + sizeof(*header), ret); +} + +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header); + +static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) +{ + /* + * The base size of the descriptor is defined by KVM's ABI, but the + * size of the name field is variable, as far as KVM's ABI is + * concerned. For a given instance of KVM, the name field is the same + * size for all stats and is provided in the overall stats header. + */ + return sizeof(struct kvm_stats_desc) + header->name_size; +} + +static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, + int index, + struct kvm_stats_header *header) +{ + /* + * Note, size_desc includes the size of the name field, which is + * variable. i.e. this is NOT equivalent to &stats_desc[i]. + */ + return (void *)stats + index * get_stats_descriptor_size(header); +} + +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements); + +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements); + +static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) +{ + uint64_t data; + + __vm_get_stat(vm, stat_name, &data, 1); + return data; +} + +void vm_create_irqchip(struct kvm_vm *vm); + +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + +#ifndef vm_arch_has_protected_memory +static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) +{ + return false; +} +#endif + +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vm_populate_vaddr_bitmap(struct kvm_vm *vm); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + unsigned int npages); +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); + +#ifndef vcpu_arch_put_guest +#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) +#endif + +static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) +{ + return gpa & ~vm->gpa_tag_mask; +} + +void vcpu_run(struct kvm_vcpu *vcpu); +int _vcpu_run(struct kvm_vcpu *vcpu); + +static inline int __vcpu_run(struct kvm_vcpu *vcpu) +{ + return __vcpu_ioctl(vcpu, KVM_RUN, NULL); +} + +void vcpu_run_complete_io(struct kvm_vcpu *vcpu); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); + +static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, + uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *debug) +{ + vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); +} + +static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); +} +static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); +} + +static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_GET_REGS, regs); +} + +static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_SET_REGS, regs); +} +static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); + +} +static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); +} +static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); +} + +static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); +} +static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); +} +static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); +} +static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); +} + +#ifdef __KVM_HAVE_VCPU_EVENTS +static inline void vcpu_events_get(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); +} +static inline void vcpu_events_set(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); +} +#endif +#ifdef __x86_64__ +static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); +} +static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} + +static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} +#endif +static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); + return fd; +} + +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); + + TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_get(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_get(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); +} + +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_set(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_set(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); +} + +static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + return __kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); + +static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) +{ + int fd = __kvm_create_device(vm, type); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); + return fd; +} + +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); + +/* + * VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first @num input parameters for the function at @vcpu's entry point, + * per the C calling convention of the architecture, to the values given as + * variable args. Each of the variable args is expected to be of type uint64_t. + * The maximum @num can be is specific to the architecture. + */ +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); + +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); + +#define KVM_MAX_IRQ_ROUTES 4096 + +struct kvm_irq_routing *kvm_gsi_routing_create(void); +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin); +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); + +const char *exit_reason_str(unsigned int exit_reason); + +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot); +vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot, + bool protected); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); + +static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) +{ + /* + * By default, allocate memory as protected for VMs that support + * protected memory, as the majority of memory for such VMs is + * protected, i.e. using shared memory is effectively opt-in. + */ + return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, + vm_arch_has_protected_memory(vm)); +} + +/* + * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also + * loads the test binary into guest memory and creates an IRQ chip (x86 only). + * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to + * calculate the amount of memory needed for per-vCPU data, e.g. stacks. + */ +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages); + +static inline struct kvm_vm *vm_create_barebones(void) +{ + return ____vm_create(VM_SHAPE_DEFAULT); +} + +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + + return ____vm_create(shape); +} + +static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) +{ + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); +} + +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]); + +static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, + void *guest_code, + struct kvm_vcpu *vcpus[]) +{ + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, + guest_code, vcpus); +} + + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + +/* + * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages + * additional pages of guest memory. Returns the VM and vCPU (via out param). + */ +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} + +static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_with_one_vcpu(vcpu, 0, guest_code); +} + +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} + +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); + +void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_print_vcpu_pinning_help(void); +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus); + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm); +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); +unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); +unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); +static inline unsigned int +vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + unsigned int n; + n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); +#ifdef __s390x__ + /* s390 requires 1M aligned guest sizes */ + n = (n + 255) & ~255; +#endif + return n; +} + +#define sync_global_to_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(_p, &(g), sizeof(g)); \ +}) + +#define sync_global_from_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(&(g), _p, sizeof(g)); \ +}) + +/* + * Write a global value, but only in the VM's (guest's) domain. Primarily used + * for "globals" that hold per-VM values (VMs always duplicate code and global + * data into their own region of physical memory), but can be used anytime it's + * undesirable to change the host's copy of the global. + */ +#define write_guest_global(vm, g, val) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + typeof(g) _val = val; \ + \ + memcpy(_p, &(_val), sizeof(g)); \ +}) + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent); + +static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent) +{ + vcpu_arch_dump(stream, vcpu, indent); +} + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - The id of the VCPU to add to the VM. + */ +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); + +static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; +} + +/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); + +static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return vm_arch_vcpu_recreate(vm, vcpu_id); +} + +void vcpu_arch_free(struct kvm_vcpu *vcpu); + +void virt_arch_pgd_alloc(struct kvm_vm *vm); + +static inline void virt_pgd_alloc(struct kvm_vm *vm) +{ + virt_arch_pgd_alloc(vm); +} + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); + +static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + virt_arch_pg_map(vm, vaddr, paddr); +} + + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_arch_gva2gpa(vm, gva); +} + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. + */ +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + virt_arch_dump(stream, vm, indent); +} + + +static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) +{ + return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); +} + +/* + * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * to allow for arch-specific setup that is common to all tests, e.g. computing + * the default guest "mode". + */ +void kvm_selftest_arch_init(void); + +void kvm_arch_vm_post_create(struct kvm_vm *vm); + +bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); + +uint32_t guest_get_vcpuid(void); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h deleted file mode 100644 index 3e0db283a46a..000000000000 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ /dev/null @@ -1,1135 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/kvm_util_base.h - * - * Copyright (C) 2018, Google LLC. - */ -#ifndef SELFTEST_KVM_UTIL_BASE_H -#define SELFTEST_KVM_UTIL_BASE_H - -#include "test_util.h" - -#include <linux/compiler.h> -#include "linux/hashtable.h" -#include "linux/list.h" -#include <linux/kernel.h> -#include <linux/kvm.h> -#include "linux/rbtree.h" -#include <linux/types.h> - -#include <asm/atomic.h> -#include <asm/kvm.h> - -#include <sys/ioctl.h> - -#include "kvm_util_arch.h" -#include "sparsebit.h" - -/* - * Provide a version of static_assert() that is guaranteed to have an optional - * message param. If _ISOC11_SOURCE is defined, glibc (/usr/include/assert.h) - * #undefs and #defines static_assert() as a direct alias to _Static_assert(), - * i.e. effectively makes the message mandatory. Many KVM selftests #define - * _GNU_SOURCE for various reasons, and _GNU_SOURCE implies _ISOC11_SOURCE. As - * a result, static_assert() behavior is non-deterministic and may or may not - * require a message depending on #include order. - */ -#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) -#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) - -#define KVM_DEV_PATH "/dev/kvm" -#define KVM_MAX_VCPUS 512 - -#define NSEC_PER_SEC 1000000000L - -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ - -struct userspace_mem_region { - struct kvm_userspace_memory_region2 region; - struct sparsebit *unused_phy_pages; - struct sparsebit *protected_phy_pages; - int fd; - off_t offset; - enum vm_mem_backing_src_type backing_src_type; - void *host_mem; - void *host_alias; - void *mmap_start; - void *mmap_alias; - size_t mmap_size; - struct rb_node gpa_node; - struct rb_node hva_node; - struct hlist_node slot_node; -}; - -struct kvm_vcpu { - struct list_head list; - uint32_t id; - int fd; - struct kvm_vm *vm; - struct kvm_run *run; -#ifdef __x86_64__ - struct kvm_cpuid2 *cpuid; -#endif - struct kvm_dirty_gfn *dirty_gfns; - uint32_t fetch_index; - uint32_t dirty_gfns_count; -}; - -struct userspace_mem_regions { - struct rb_root gpa_tree; - struct rb_root hva_tree; - DECLARE_HASHTABLE(slot_hash, 9); -}; - -enum kvm_mem_region_type { - MEM_REGION_CODE, - MEM_REGION_DATA, - MEM_REGION_PT, - MEM_REGION_TEST_DATA, - NR_MEM_REGIONS, -}; - -struct kvm_vm { - int mode; - unsigned long type; - uint8_t subtype; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct list_head vcpus; - struct userspace_mem_regions regions; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; - vm_vaddr_t idt; - vm_vaddr_t handlers; - uint32_t dirty_ring_size; - uint64_t gpa_tag_mask; - - struct kvm_vm_arch arch; - - /* Cache of information for binary stats interface */ - int stats_fd; - struct kvm_stats_header stats_header; - struct kvm_stats_desc *stats_desc; - - /* - * KVM region slots. These are the default memslots used by page - * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] - * memslot. - */ - uint32_t memslots[NR_MEM_REGIONS]; -}; - -struct vcpu_reg_sublist { - const char *name; - long capability; - int feature; - int feature_type; - bool finalize; - __u64 *regs; - __u64 regs_n; - __u64 *rejects_set; - __u64 rejects_set_n; - __u64 *skips_set; - __u64 skips_set_n; -}; - -struct vcpu_reg_list { - char *name; - struct vcpu_reg_sublist sublists[]; -}; - -#define for_each_sublist(c, s) \ - for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) - -#define kvm_for_each_vcpu(vm, i, vcpu) \ - for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ - if (!((vcpu) = vm->vcpus[i])) \ - continue; \ - else - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, - enum kvm_mem_region_type type) -{ - assert(type < NR_MEM_REGIONS); - return memslot2region(vm, vm->memslots[type]); -} - -/* Minimum allocated guest virtual and physical addresses */ -#define KVM_UTIL_MIN_VADDR 0x2000 -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - -#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 -#define DEFAULT_STACK_PGS 5 - -enum vm_guest_mode { - VM_MODE_P52V48_4K, - VM_MODE_P52V48_16K, - VM_MODE_P52V48_64K, - VM_MODE_P48V48_4K, - VM_MODE_P48V48_16K, - VM_MODE_P48V48_64K, - VM_MODE_P40V48_4K, - VM_MODE_P40V48_16K, - VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ - VM_MODE_P47V64_4K, - VM_MODE_P44V64_4K, - VM_MODE_P36V48_4K, - VM_MODE_P36V48_16K, - VM_MODE_P36V48_64K, - VM_MODE_P36V47_16K, - NUM_VM_MODES, -}; - -struct vm_shape { - uint32_t type; - uint8_t mode; - uint8_t subtype; - uint16_t padding; -}; - -kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); - -#define VM_TYPE_DEFAULT 0 - -#define VM_SHAPE(__mode) \ -({ \ - struct vm_shape shape = { \ - .mode = (__mode), \ - .type = VM_TYPE_DEFAULT \ - }; \ - \ - shape; \ -}) - -#if defined(__aarch64__) - -extern enum vm_guest_mode vm_mode_default; - -#define VM_MODE_DEFAULT vm_mode_default -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__x86_64__) - -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__s390x__) - -#define VM_MODE_DEFAULT VM_MODE_P44V64_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 16) - -#elif defined(__riscv) - -#if __riscv_xlen == 32 -#error "RISC-V 32-bit kvm selftests not supported" -#endif - -#define VM_MODE_DEFAULT VM_MODE_P40V48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#endif - -#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) - -#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) -#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) - -struct vm_guest_mode_params { - unsigned int pa_bits; - unsigned int va_bits; - unsigned int page_size; - unsigned int page_shift; -}; -extern const struct vm_guest_mode_params vm_guest_mode_params[]; - -int open_path_or_exit(const char *path, int flags); -int open_kvm_dev_path_or_exit(void); - -bool get_kvm_param_bool(const char *param); -bool get_kvm_intel_param_bool(const char *param); -bool get_kvm_amd_param_bool(const char *param); - -int get_kvm_param_integer(const char *param); -int get_kvm_intel_param_integer(const char *param); -int get_kvm_amd_param_integer(const char *param); - -unsigned int kvm_check_cap(long cap); - -static inline bool kvm_has_cap(long cap) -{ - return kvm_check_cap(cap); -} - -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -/* - * Use the "inner", double-underscore macro when reporting errors from within - * other macros so that the name of ioctl() and not its literal numeric value - * is printed on error. The "outer" macro is strongly preferred when reporting - * errors "directly", i.e. without an additional layer of macros, as it reduces - * the probability of passing in the wrong string. - */ -#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) -#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) - -#define kvm_do_ioctl(fd, cmd, arg) \ -({ \ - kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ - ioctl(fd, cmd, arg); \ -}) - -#define __kvm_ioctl(kvm_fd, cmd, arg) \ - kvm_do_ioctl(kvm_fd, cmd, arg) - -#define kvm_ioctl(kvm_fd, cmd, arg) \ -({ \ - int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ -}) - -static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } - -#define __vm_ioctl(vm, cmd, arg) \ -({ \ - static_assert_is_vm(vm); \ - kvm_do_ioctl((vm)->fd, cmd, arg); \ -}) - -/* - * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if - * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, - * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before - * selftests existed and (b) should never outright fail, i.e. is supposed to - * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the - * VM and its vCPUs, including KVM_CHECK_EXTENSION. - */ -#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ -do { \ - int __errno = errno; \ - \ - static_assert_is_vm(vm); \ - \ - if (cond) \ - break; \ - \ - if (errno == EIO && \ - __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ - TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ - TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ - } \ - errno = __errno; \ - TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ -} while (0) - -#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ - __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) - -#define vm_ioctl(vm, cmd, arg) \ -({ \ - int ret = __vm_ioctl(vm, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ -}) - -static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } - -#define __vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - static_assert_is_vcpu(vcpu); \ - kvm_do_ioctl((vcpu)->fd, cmd, arg); \ -}) - -#define vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - int ret = __vcpu_ioctl(vcpu, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ -}) - -/* - * Looks up and returns the value corresponding to the capability - * (KVM_CAP_*) given by cap. - */ -static inline int vm_check_cap(struct kvm_vm *vm, long cap) -{ - int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); - - TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); - return ret; -} - -static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} -static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, - uint64_t size, uint64_t attributes) -{ - struct kvm_memory_attributes attr = { - .attributes = attributes, - .address = gpa, - .size = size, - .flags = 0, - }; - - /* - * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows - * need significant enhancements to support multiple attributes. - */ - TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, - "Update me to support multiple attributes!"); - - vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); -} - - -static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); -} - -static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, 0); -} - -void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, - bool punch_hole); - -static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, true); -} - -static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, false); -} - -void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); -const char *vm_guest_mode_string(uint32_t i); - -void kvm_vm_free(struct kvm_vm *vmp); -void kvm_vm_restart(struct kvm_vm *vmp); -void kvm_vm_release(struct kvm_vm *vmp); -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, - size_t len); -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); -int kvm_memfd_alloc(size_t size, bool hugepages); - -void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) -{ - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - - vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); -} - -static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) -{ - struct kvm_clear_dirty_log args = { - .dirty_bitmap = log, - .slot = slot, - .first_page = first_page, - .num_pages = num_pages - }; - - vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); -} - -static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) -{ - return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); -} - -static inline int vm_get_stats_fd(struct kvm_vm *vm) -{ - int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); - return fd; -} - -static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) -{ - ssize_t ret; - - ret = pread(stats_fd, header, sizeof(*header), 0); - TEST_ASSERT(ret == sizeof(*header), - "Failed to read '%lu' header bytes, ret = '%ld'", - sizeof(*header), ret); -} - -struct kvm_stats_desc *read_stats_descriptors(int stats_fd, - struct kvm_stats_header *header); - -static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) -{ - /* - * The base size of the descriptor is defined by KVM's ABI, but the - * size of the name field is variable, as far as KVM's ABI is - * concerned. For a given instance of KVM, the name field is the same - * size for all stats and is provided in the overall stats header. - */ - return sizeof(struct kvm_stats_desc) + header->name_size; -} - -static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, - int index, - struct kvm_stats_header *header) -{ - /* - * Note, size_desc includes the size of the name field, which is - * variable. i.e. this is NOT equivalent to &stats_desc[i]. - */ - return (void *)stats + index * get_stats_descriptor_size(header); -} - -void read_stat_data(int stats_fd, struct kvm_stats_header *header, - struct kvm_stats_desc *desc, uint64_t *data, - size_t max_elements); - -void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, - size_t max_elements); - -static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) -{ - uint64_t data; - - __vm_get_stat(vm, stat_name, &data, 1); - return data; -} - -void vm_create_irqchip(struct kvm_vm *vm); - -static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - struct kvm_create_guest_memfd guest_memfd = { - .size = size, - .flags = flags, - }; - - return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); -} - -static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - int fd = __vm_create_guest_memfd(vm, size, flags); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); - return fd; -} - -void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); -int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); - -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); -void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); - -#ifndef vm_arch_has_protected_memory -static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) -{ - return false; -} -#endif - -void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); -void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vm_populate_vaddr_bitmap(struct kvm_vm *vm); -vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); -vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); - -void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages); -void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); -void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); -vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); -void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); - - -static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) -{ - return gpa & ~vm->gpa_tag_mask; -} - -void vcpu_run(struct kvm_vcpu *vcpu); -int _vcpu_run(struct kvm_vcpu *vcpu); - -static inline int __vcpu_run(struct kvm_vcpu *vcpu) -{ - return __vcpu_ioctl(vcpu, KVM_RUN, NULL); -} - -void vcpu_run_complete_io(struct kvm_vcpu *vcpu); -struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); - -static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, - uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *debug) -{ - vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); -} - -static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); -} -static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); -} - -static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_GET_REGS, regs); -} - -static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_SET_REGS, regs); -} -static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); - -} -static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); -} -static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); -} - -static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); -} -static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); -} -static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); -} -static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); -} - -#ifdef __KVM_HAVE_VCPU_EVENTS -static inline void vcpu_events_get(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); -} -static inline void vcpu_events_set(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); -} -#endif -#ifdef __x86_64__ -static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); -} -static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} - -static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} -#endif -static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) -{ - int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); - return fd; -} - -int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); - -static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) -{ - int ret = __kvm_has_device_attr(dev_fd, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); -} - -int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static inline void kvm_device_attr_get(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_get(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); -} - -int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static inline void kvm_device_attr_set(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_set(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); -} - -static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - return __kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); -int __kvm_create_device(struct kvm_vm *vm, uint64_t type); - -static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) -{ - int fd = __kvm_create_device(vm, type); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); - return fd; -} - -void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); - -/* - * VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first @num input parameters for the function at @vcpu's entry point, - * per the C calling convention of the architecture, to the values given as - * variable args. Each of the variable args is expected to be of type uint64_t. - * The maximum @num can be is specific to the architecture. - */ -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); - -void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); -int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); - -#define KVM_MAX_IRQ_ROUTES 4096 - -struct kvm_irq_routing *kvm_gsi_routing_create(void); -void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, - uint32_t gsi, uint32_t pin); -int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); -void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); - -const char *exit_reason_str(unsigned int exit_reason); - -vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, - uint32_t memslot); -vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot, - bool protected); -vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); - -static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot) -{ - /* - * By default, allocate memory as protected for VMs that support - * protected memory, as the majority of memory for such VMs is - * protected, i.e. using shared memory is effectively opt-in. - */ - return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, - vm_arch_has_protected_memory(vm)); -} - -/* - * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also - * loads the test binary into guest memory and creates an IRQ chip (x86 only). - * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to - * calculate the amount of memory needed for per-vCPU data, e.g. stacks. - */ -struct kvm_vm *____vm_create(struct vm_shape shape); -struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, - uint64_t nr_extra_pages); - -static inline struct kvm_vm *vm_create_barebones(void) -{ - return ____vm_create(VM_SHAPE_DEFAULT); -} - -#ifdef __x86_64__ -static inline struct kvm_vm *vm_create_barebones_protected_vm(void) -{ - const struct vm_shape shape = { - .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, - }; - - return ____vm_create(shape); -} -#endif - -static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) -{ - return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); -} - -struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, - uint64_t extra_mem_pages, - void *guest_code, struct kvm_vcpu *vcpus[]); - -static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, - void *guest_code, - struct kvm_vcpu *vcpus[]) -{ - return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, - guest_code, vcpus); -} - - -struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code); - -/* - * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages - * additional pages of guest memory. Returns the VM and vCPU (via out param). - */ -static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, - extra_mem_pages, guest_code); -} - -static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_with_one_vcpu(vcpu, 0, guest_code); -} - -static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); -} - -struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); - -void kvm_pin_this_task_to_pcpu(uint32_t pcpu); -void kvm_print_vcpu_pinning_help(void); -void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], - int nr_vcpus); - -unsigned long vm_compute_max_gfn(struct kvm_vm *vm); -unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); -unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); -unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); -static inline unsigned int -vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) -{ - unsigned int n; - n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); -#ifdef __s390x__ - /* s390 requires 1M aligned guest sizes */ - n = (n + 255) & ~255; -#endif - return n; -} - -#define sync_global_to_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(_p, &(g), sizeof(g)); \ -}) - -#define sync_global_from_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(&(g), _p, sizeof(g)); \ -}) - -/* - * Write a global value, but only in the VM's (guest's) domain. Primarily used - * for "globals" that hold per-VM values (VMs always duplicate code and global - * data into their own region of physical memory), but can be used anytime it's - * undesirable to change the host's copy of the global. - */ -#define write_guest_global(vm, g, val) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - typeof(g) _val = val; \ - \ - memcpy(_p, &(_val), sizeof(g)); \ -}) - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent); - -static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent) -{ - vcpu_arch_dump(stream, vcpu, indent); -} - -/* - * Adds a vCPU with reasonable defaults (e.g. a stack) - * - * Input Args: - * vm - Virtual Machine - * vcpu_id - The id of the VCPU to add to the VM. - */ -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); - -static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - void *guest_code) -{ - struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); - - vcpu_arch_set_entry_point(vcpu, guest_code); - - return vcpu; -} - -/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ -struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); - -static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, - uint32_t vcpu_id) -{ - return vm_arch_vcpu_recreate(vm, vcpu_id); -} - -void vcpu_arch_free(struct kvm_vcpu *vcpu); - -void virt_arch_pgd_alloc(struct kvm_vm *vm); - -static inline void virt_pgd_alloc(struct kvm_vm *vm) -{ - virt_arch_pgd_alloc(vm); -} - -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within @vm, creates a virtual translation for the page starting - * at @vaddr to the page starting at @paddr. - */ -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); - -static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) -{ - virt_arch_pg_map(vm, vaddr, paddr); -} - - -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gva - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Returns the VM physical address of the translated VM virtual - * address given by @gva. - */ -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); - -static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - return addr_arch_gva2gpa(vm, gva); -} - -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. - */ -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - virt_arch_dump(stream, vm, indent); -} - - -static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) -{ - return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); -} - -/* - * Arch hook that is invoked via a constructor, i.e. before exeucting main(), - * to allow for arch-specific setup that is common to all tests, e.g. computing - * the default guest "mode". - */ -void kvm_selftest_arch_init(void); - -void kvm_arch_vm_post_create(struct kvm_vm *vm); - -bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); - -uint32_t guest_get_vcpuid(void); - -#endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_types.h b/tools/testing/selftests/kvm/include/kvm_util_types.h new file mode 100644 index 000000000000..ec787b97cf18 --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_util_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_TYPES_H +#define SELFTEST_KVM_UTIL_TYPES_H + +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +#endif /* SELFTEST_KVM_UTIL_TYPES_H */ diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h index ce4e603050ea..9071eb6dea60 100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -62,7 +62,6 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, void memstress_destroy_vm(struct kvm_vm *vm); void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); void memstress_set_random_access(struct kvm_vm *vm, bool random_access); void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index ce473fe251dd..5f389166338c 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -50,6 +50,16 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); +static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext)); +} + +static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext)); +} + struct ex_regs { unsigned long ra; unsigned long sp; @@ -154,45 +164,6 @@ void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handle #define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE #define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT -/* SBI return error codes */ -#define SBI_SUCCESS 0 -#define SBI_ERR_FAILURE -1 -#define SBI_ERR_NOT_SUPPORTED -2 -#define SBI_ERR_INVALID_PARAM -3 -#define SBI_ERR_DENIED -4 -#define SBI_ERR_INVALID_ADDRESS -5 -#define SBI_ERR_ALREADY_AVAILABLE -6 -#define SBI_ERR_ALREADY_STARTED -7 -#define SBI_ERR_ALREADY_STOPPED -8 - -#define SBI_EXT_EXPERIMENTAL_START 0x08000000 -#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF - -#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END -#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 -#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 - -enum sbi_ext_id { - SBI_EXT_BASE = 0x10, - SBI_EXT_STA = 0x535441, -}; - -enum sbi_ext_base_fid { - SBI_EXT_BASE_PROBE_EXT = 3, -}; - -struct sbiret { - long error; - long value; -}; - -struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, - unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5); - -bool guest_sbi_probe_extension(int extid, long *out_val); - static inline void local_irq_enable(void) { csr_set(CSR_SSTATUS, SR_SIE); diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h new file mode 100644 index 000000000000..046b432ae896 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V SBI specific definitions + * + * Copyright (C) 2024 Rivos Inc. + */ + +#ifndef SELFTEST_KVM_SBI_H +#define SELFTEST_KVM_SBI_H + +/* SBI spec version fields */ +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff + +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 +#define SBI_ERR_ALREADY_AVAILABLE -6 +#define SBI_ERR_ALREADY_STARTED -7 +#define SBI_ERR_ALREADY_STOPPED -8 + +#define SBI_EXT_EXPERIMENTAL_START 0x08000000 +#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF + +#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END +#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 +#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 + +enum sbi_ext_id { + SBI_EXT_BASE = 0x10, + SBI_EXT_STA = 0x535441, + SBI_EXT_PMU = 0x504D55, +}; + +enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, + SBI_EXT_BASE_PROBE_EXT = 3, +}; +enum sbi_ext_pmu_fid { + SBI_EXT_PMU_NUM_COUNTERS = 0, + SBI_EXT_PMU_COUNTER_GET_INFO, + SBI_EXT_PMU_COUNTER_CFG_MATCH, + SBI_EXT_PMU_COUNTER_START, + SBI_EXT_PMU_COUNTER_STOP, + SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, +}; + +union sbi_pmu_ctr_info { + unsigned long value; + struct { + unsigned long csr:12; + unsigned long width:6; +#if __riscv_xlen == 32 + unsigned long reserved:13; +#else + unsigned long reserved:45; +#endif + unsigned long type:1; + }; +}; + +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + +struct sbiret { + long error; + long value; +}; + +/** General pmu event codes specified in SBI PMU extension */ +enum sbi_pmu_hw_generic_events_t { + SBI_PMU_HW_NO_EVENT = 0, + SBI_PMU_HW_CPU_CYCLES = 1, + SBI_PMU_HW_INSTRUCTIONS = 2, + SBI_PMU_HW_CACHE_REFERENCES = 3, + SBI_PMU_HW_CACHE_MISSES = 4, + SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5, + SBI_PMU_HW_BRANCH_MISSES = 6, + SBI_PMU_HW_BUS_CYCLES = 7, + SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8, + SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9, + SBI_PMU_HW_REF_CPU_CYCLES = 10, + + SBI_PMU_HW_GENERAL_MAX, +}; + +/* SBI PMU counter types */ +enum sbi_pmu_ctr_type { + SBI_PMU_CTR_TYPE_HW = 0x0, + SBI_PMU_CTR_TYPE_FW, +}; + +/* Flags defined for config matching function */ +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) + +/* Flags defined for counter start function */ +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) + +/* Flags defined for counter stop function */ +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +bool guest_sbi_probe_extension(int extid, long *out_val); + +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); +} + +unsigned long get_host_sbi_spec_version(void); + +#endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h index be46eb32ec27..a695ae36f3e0 100644 --- a/tools/testing/selftests/kvm/include/riscv/ucall.h +++ b/tools/testing/selftests/kvm/include/riscv/ucall.h @@ -3,6 +3,7 @@ #define SELFTEST_KVM_UCALL_H #include "processor.h" +#include "sbi.h" #define UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI diff --git a/tools/testing/selftests/kvm/include/s390x/ucall.h b/tools/testing/selftests/kvm/include/s390x/ucall.h index b231bf2e49d6..8035a872a351 100644 --- a/tools/testing/selftests/kvm/include/s390x/ucall.h +++ b/tools/testing/selftests/kvm/include/s390x/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 8a6e30612c86..3e473058849f 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -91,9 +91,28 @@ struct guest_random_state { uint32_t seed; }; +extern uint32_t guest_random_seed; +extern struct guest_random_state guest_rng; + struct guest_random_state new_guest_random_state(uint32_t seed); uint32_t guest_random_u32(struct guest_random_state *state); +static inline bool __guest_random_bool(struct guest_random_state *state, + uint8_t percent) +{ + return (guest_random_u32(state) % 100) < percent; +} + +static inline bool guest_random_bool(struct guest_random_state *state) +{ + return __guest_random_bool(state, 50); +} + +static inline uint64_t guest_random_u64(struct guest_random_state *state) +{ + return ((uint64_t)guest_random_u32(state) << 32) | guest_random_u32(state); +} + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h index 877449c34592..60f7f9d435dc 100644 --- a/tools/testing/selftests/kvm/include/userfaultfd_util.h +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -5,9 +5,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include <inttypes.h> #include <time.h> #include <pthread.h> @@ -17,17 +14,27 @@ typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); -struct uffd_desc { +struct uffd_reader_args { int uffd_mode; int uffd; - int pipefds[2]; useconds_t delay; uffd_handler_t handler; - pthread_t thread; + /* Holds the read end of the pipe for killing the reader. */ + int pipe; +}; + +struct uffd_desc { + int uffd; + uint64_t num_readers; + /* Holds the write ends of the pipes for killing the readers. */ + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; }; struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler); void uffd_stop_demand_paging(struct uffd_desc *uffd); diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h index 9f1725192aa2..972bb1c4ab4c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -5,7 +5,16 @@ #include <stdbool.h> #include <stdint.h> +#include "kvm_util_types.h" +#include "test_util.h" + +extern bool is_forced_emulation_enabled; + struct kvm_vm_arch { + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + uint64_t c_bit; uint64_t s_bit; int sev_fd; @@ -20,4 +29,23 @@ static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) #define vm_arch_has_protected_memory(vm) \ __vm_arch_has_protected_memory(&(vm)->arch) +#define vcpu_arch_put_guest(mem, __val) \ +do { \ + const typeof(mem) val = (__val); \ + \ + if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ + (mem) = val; \ + } else if (guest_random_bool(&guest_rng)) { \ + __asm__ __volatile__(KVM_FEP "mov %1, %0" \ + : "+m" (mem) \ + : "r" (val) : "memory"); \ + } else { \ + uint64_t __old = READ_ONCE(mem); \ + \ + __asm__ __volatile__(KVM_FEP LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ + : [ptr] "+m" (mem), [old] "+a" (__old) \ + : [new]"r" (val) : "memory", "cc"); \ + } \ +} while (0) + #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 81ce37ec407d..8eb57de0b587 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -18,17 +18,12 @@ #include <linux/kvm_para.h> #include <linux/stringify.h> -#include "../kvm_util.h" +#include "kvm_util.h" +#include "ucall_common.h" extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; -enum vm_guest_x86_subtype { - VM_SUBTYPE_NONE = 0, - VM_SUBTYPE_SEV, - VM_SUBTYPE_SEV_ES, -}; - /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" @@ -1139,8 +1134,6 @@ struct idt_entry { uint32_t offset2; uint32_t reserved; }; -void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h index 8a1bf88474c9..82c11c81a956 100644 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -31,8 +31,9 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); void sev_vm_launch_finish(struct kvm_vm *vm); -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); kvm_static_assert(SEV_RET_SUCCESS == 0); @@ -67,20 +68,8 @@ kvm_static_assert(SEV_RET_SUCCESS == 0); __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ }) -static inline void sev_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); -} - - -static inline void sev_es_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); -} +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); static inline void sev_register_encrypted_memory(struct kvm_vm *vm, struct userspace_mem_region *region) diff --git a/tools/testing/selftests/kvm/include/x86_64/ucall.h b/tools/testing/selftests/kvm/include/x86_64/ucall.h index 06b244bd06ee..d3825dcc3cd9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/ucall.h +++ b/tools/testing/selftests/kvm/include/x86_64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_IO diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 698c1cfa3111..f02355c3c4c2 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -6,8 +6,6 @@ * * Test the fd-based interface for KVM statistics. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index b9e23265e4b3..c78f34699f73 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -6,8 +6,6 @@ * * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index e0ba97ac1c56..dd8b12f626d3 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -8,9 +8,6 @@ * page size have been pre-allocated on your system, if you are planning to * use hugepages to back the guest memory for testing. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> #include <time.h> @@ -21,6 +18,7 @@ #include "kvm_util.h" #include "processor.h" #include "guest_modes.h" +#include "ucall_common.h" #define TEST_MEM_SLOT_INDEX 1 diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c index 55668631d546..7abbf8866512 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c @@ -17,13 +17,12 @@ static const struct gic_common_ops *gic_common_ops; static struct spinlock gic_lock; -static void gic_cpu_init(unsigned int cpu, void *redist_base) +static void gic_cpu_init(unsigned int cpu) { - gic_common_ops->gic_cpu_init(cpu, redist_base); + gic_common_ops->gic_cpu_init(cpu); } -static void -gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) +static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) { const struct gic_common_ops *gic_ops = NULL; @@ -40,7 +39,7 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) GUEST_ASSERT(gic_ops); - gic_ops->gic_init(nr_cpus, dist_base); + gic_ops->gic_init(nr_cpus); gic_common_ops = gic_ops; /* Make sure that the initialized data is visible to all the vCPUs */ @@ -49,18 +48,15 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) spin_unlock(&gic_lock); } -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base) +void gic_init(enum gic_type type, unsigned int nr_cpus) { uint32_t cpu = guest_get_vcpuid(); GUEST_ASSERT(type < GIC_TYPE_MAX); - GUEST_ASSERT(dist_base); - GUEST_ASSERT(redist_base); GUEST_ASSERT(nr_cpus); - gic_dist_init(type, nr_cpus, dist_base); - gic_cpu_init(cpu, redist_base); + gic_dist_init(type, nr_cpus); + gic_cpu_init(cpu); } void gic_irq_enable(unsigned int intid) diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h index 75d07313c893..d24e9ecc96c6 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h @@ -8,8 +8,8 @@ #define SELFTEST_KVM_GIC_PRIVATE_H struct gic_common_ops { - void (*gic_init)(unsigned int nr_cpus, void *dist_base); - void (*gic_cpu_init)(unsigned int cpu, void *redist_base); + void (*gic_init)(unsigned int nr_cpus); + void (*gic_cpu_init)(unsigned int cpu); void (*gic_irq_enable)(unsigned int intid); void (*gic_irq_disable)(unsigned int intid); uint64_t (*gic_read_iar)(void); diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c index 263bf3ed8fd5..66d05506f78b 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -9,12 +9,21 @@ #include "processor.h" #include "delay.h" +#include "gic.h" #include "gic_v3.h" #include "gic_private.h" +#define GICV3_MAX_CPUS 512 + +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + +#define ICC_PMR_DEF_PRIO 0xf0 + struct gicv3_data { - void *dist_base; - void *redist_base[GICV3_MAX_CPUS]; unsigned int nr_cpus; unsigned int nr_spis; }; @@ -35,17 +44,23 @@ static void gicv3_gicd_wait_for_rwp(void) { unsigned int count = 100000; /* 1s */ - while (readl(gicv3_data.dist_base + GICD_CTLR) & GICD_CTLR_RWP) { + while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } } -static void gicv3_gicr_wait_for_rwp(void *redist_base) +static inline volatile void *gicr_base_cpu(uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return GICR_BASE_GVA + cpu * SZ_64K * 2; +} + +static void gicv3_gicr_wait_for_rwp(uint32_t cpu) { unsigned int count = 100000; /* 1s */ - while (readl(redist_base + GICR_CTLR) & GICR_CTLR_RWP) { + while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } @@ -56,7 +71,7 @@ static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) if (cpu_or_dist & DIST_BIT) gicv3_gicd_wait_for_rwp(); else - gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu_or_dist]); + gicv3_gicr_wait_for_rwp(cpu_or_dist); } static enum gicv3_intid_range get_intid_range(unsigned int intid) @@ -116,15 +131,15 @@ static void gicv3_set_eoi_split(bool split) uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); return readl(base + offset); } void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); writel(reg_val, base + offset); } @@ -263,7 +278,7 @@ static bool gicv3_irq_get_pending(uint32_t intid) return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); } -static void gicv3_enable_redist(void *redist_base) +static void gicv3_enable_redist(volatile void *redist_base) { uint32_t val = readl(redist_base + GICR_WAKER); unsigned int count = 100000; /* 1s */ @@ -278,21 +293,15 @@ static void gicv3_enable_redist(void *redist_base) } } -static inline void *gicr_base_cpu(void *redist_base, uint32_t cpu) +static void gicv3_cpu_init(unsigned int cpu) { - /* Align all the redistributors sequentially */ - return redist_base + cpu * SZ_64K * 2; -} - -static void gicv3_cpu_init(unsigned int cpu, void *redist_base) -{ - void *sgi_base; + volatile void *sgi_base; unsigned int i; - void *redist_base_cpu; + volatile void *redist_base_cpu; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); - redist_base_cpu = gicr_base_cpu(redist_base, cpu); + redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); gicv3_enable_redist(redist_base_cpu); @@ -310,7 +319,7 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) writel(GICD_INT_DEF_PRI_X4, sgi_base + GICR_IPRIORITYR0 + i); - gicv3_gicr_wait_for_rwp(redist_base_cpu); + gicv3_gicr_wait_for_rwp(cpu); /* Enable the GIC system register (ICC_*) access */ write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, @@ -320,18 +329,15 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); /* Enable non-secure Group-1 interrupts */ - write_sysreg_s(ICC_IGRPEN1_EL1_ENABLE, SYS_ICC_GRPEN1_EL1); - - gicv3_data.redist_base[cpu] = redist_base_cpu; + write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } static void gicv3_dist_init(void) { - void *dist_base = gicv3_data.dist_base; unsigned int i; /* Disable the distributor until we set things up */ - writel(0, dist_base + GICD_CTLR); + writel(0, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); /* @@ -339,33 +345,32 @@ static void gicv3_dist_init(void) * Also, deactivate and disable them. */ for (i = 32; i < gicv3_data.nr_spis; i += 32) { - writel(~0, dist_base + GICD_IGROUPR + i / 8); - writel(~0, dist_base + GICD_ICACTIVER + i / 8); - writel(~0, dist_base + GICD_ICENABLER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); } /* Set a default priority for all the SPIs */ for (i = 32; i < gicv3_data.nr_spis; i += 4) writel(GICD_INT_DEF_PRI_X4, - dist_base + GICD_IPRIORITYR + i); + GICD_BASE_GVA + GICD_IPRIORITYR + i); /* Wait for the settings to sync-in */ gicv3_gicd_wait_for_rwp(); /* Finally, enable the distributor globally with ARE */ writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | - GICD_CTLR_ENABLE_G1, dist_base + GICD_CTLR); + GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); } -static void gicv3_init(unsigned int nr_cpus, void *dist_base) +static void gicv3_init(unsigned int nr_cpus) { GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); gicv3_data.nr_cpus = nr_cpus; - gicv3_data.dist_base = dist_base; gicv3_data.nr_spis = GICD_TYPER_SPIS( - readl(gicv3_data.dist_base + GICD_TYPER)); + readl(GICD_BASE_GVA + GICD_TYPER)); if (gicv3_data.nr_spis > 1020) gicv3_data.nr_spis = 1020; @@ -396,3 +401,27 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, }; + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table) +{ + volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); + + u32 ctlr; + u64 val; + + val = (cfg_table | + GICR_PROPBASER_InnerShareable | + GICR_PROPBASER_RaWaWb | + ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); + writeq_relaxed(val, rdist_base + GICR_PROPBASER); + + val = (pend_table | + GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_RaWaWb); + writeq_relaxed(val, rdist_base + GICR_PENDBASER); + + ctlr = readl_relaxed(rdist_base + GICR_CTLR); + ctlr |= GICR_CTLR_ENABLE_LPIS; + writel_relaxed(ctlr, rdist_base + GICR_CTLR); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c new file mode 100644 index 000000000000..09f270545646 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c + * over in the kernel tree. + */ + +#include <linux/kvm.h> +#include <linux/sizes.h> +#include <asm/kvm_para.h> +#include <asm/kvm.h> + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" +#include "processor.h" + +static u64 its_read_u64(unsigned long offset) +{ + return readq_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u64(unsigned long offset, u64 val) +{ + writeq_relaxed(val, GITS_BASE_GVA + offset); +} + +static u32 its_read_u32(unsigned long offset) +{ + return readl_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u32(unsigned long offset, u32 val) +{ + writel_relaxed(val, GITS_BASE_GVA + offset); +} + +static unsigned long its_find_baser(unsigned int type) +{ + int i; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) { + u64 baser; + unsigned long offset = GITS_BASER + (i * sizeof(baser)); + + baser = its_read_u64(offset); + if (GITS_BASER_TYPE(baser) == type) + return offset; + } + + GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); + return -1; +} + +static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) +{ + unsigned long offset = its_find_baser(type); + u64 baser; + + baser = ((size / SZ_64K) - 1) | + GITS_BASER_PAGE_SIZE_64K | + GITS_BASER_InnerShareable | + base | + GITS_BASER_RaWaWb | + GITS_BASER_VALID; + + its_write_u64(offset, baser); +} + +static void its_install_cmdq(vm_paddr_t base, size_t size) +{ + u64 cbaser; + + cbaser = ((size / SZ_4K) - 1) | + GITS_CBASER_InnerShareable | + base | + GITS_CBASER_RaWaWb | + GITS_CBASER_VALID; + + its_write_u64(GITS_CBASER, cbaser); +} + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size) +{ + u32 ctlr; + + its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); + its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); + its_install_cmdq(cmdq, cmdq_size); + + ctlr = its_read_u32(GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + its_write_u32(GITS_CTLR, ctlr); +} + +struct its_cmd_block { + union { + u64 raw_cmd[4]; + __le64 raw_cmd_le[4]; + }; +}; + +static inline void its_fixup_cmd(struct its_cmd_block *cmd) +{ + /* Let's fixup BE commands */ + cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); + cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); + cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); + cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); +} + +static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) +{ + u64 mask = GENMASK_ULL(h, l); + *raw_cmd &= ~mask; + *raw_cmd |= (val << l) & mask; +} + +static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) +{ + its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); +} + +static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) +{ + its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); +} + +static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) +{ + its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); +} + +static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) +{ + its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); +} + +static void its_encode_size(struct its_cmd_block *cmd, u8 size) +{ + its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); +} + +static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); +} + +static void its_encode_valid(struct its_cmd_block *cmd, int valid) +{ + its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); +} + +static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16); +} + +static void its_encode_collection(struct its_cmd_block *cmd, u16 col) +{ + its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); +} + +#define GITS_CMDQ_POLL_ITERATIONS 0 + +static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) +{ + u64 cwriter = its_read_u64(GITS_CWRITER); + struct its_cmd_block *dst = cmdq_base + cwriter; + u64 cbaser = its_read_u64(GITS_CBASER); + size_t cmdq_size; + u64 next; + int i; + + cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; + + its_fixup_cmd(cmd); + + WRITE_ONCE(*dst, *cmd); + dsb(ishst); + next = (cwriter + sizeof(*cmd)) % cmdq_size; + its_write_u64(GITS_CWRITER, next); + + /* + * Polling isn't necessary considering KVM's ITS emulation at the time + * of writing this, as the CMDQ is processed synchronously after a write + * to CWRITER. + */ + for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { + __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, + "ITS didn't process command at offset %lu after %d iterations\n", + cwriter, i); + + cpu_relax(); + } +} + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPD); + its_encode_devid(&cmd, device_id); + its_encode_size(&cmd, ilog2(itt_size) - 1); + its_encode_itt(&cmd, itt_base); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPC); + its_encode_collection(&cmd, collection_id); + its_encode_target(&cmd, vcpu_id); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPTI); + its_encode_devid(&cmd, device_id); + its_encode_event_id(&cmd, event_id); + its_encode_phys_id(&cmd, intid); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_invall_cmd(void *cmdq_base, u32 collection_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_INVALL); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index a9eb17295be4..0ac7cc89f38c 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -11,6 +11,8 @@ #include "guest_modes.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" + #include <linux/bitfield.h> #include <linux/sizes.h> diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 184378d593e9..4427f43f73ea 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -3,8 +3,10 @@ * ARM Generic Interrupt Controller (GIC) v3 host support */ +#include <linux/kernel.h> #include <linux/kvm.h> #include <linux/sizes.h> +#include <asm/cputype.h> #include <asm/kvm_para.h> #include <asm/kvm.h> @@ -19,8 +21,6 @@ * Input args: * vm - KVM VM * nr_vcpus - Number of vCPUs supported by this VM - * gicd_base_gpa - Guest Physical Address of the Distributor region - * gicr_base_gpa - Guest Physical Address of the Redistributor region * * Output args: None * @@ -30,11 +30,10 @@ * redistributor regions of the guest. Since it depends on the number of * vCPUs for the VM, it must be called after all the vCPUs have been created. */ -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa) +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) { int gic_fd; - uint64_t redist_attr; + uint64_t attr; struct list_head *iter; unsigned int nr_gic_pages, nr_vcpus_created = 0; @@ -60,18 +59,19 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + attr = GICD_BASE_GPA; kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa); + KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); - virt_map(vm, gicd_base_gpa, gicd_base_gpa, nr_gic_pages); + virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); /* Redistributor setup */ - redist_attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, gicr_base_gpa, 0, 0); + attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr); + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); - virt_map(vm, gicr_base_gpa, gicr_base_gpa, nr_gic_pages); + virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -168,3 +168,21 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) { vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); } + +int vgic_its_setup(struct kvm_vm *vm) +{ + int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + u64 attr; + + attr = GITS_BASE_GPA; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &attr); + + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, + vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); + + return its_fd; +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 2bd25b191d15..b49690658c60 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ - #include "test_util.h" #include <execinfo.h> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b2262b5fad9e..ad00e4761886 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -4,11 +4,10 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for program_invocation_name */ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #include <assert.h> #include <sched.h> @@ -20,6 +19,9 @@ #define KVM_UTIL_MIN_PFN 2 +uint32_t guest_random_seed; +struct guest_random_state guest_rng; + static int vcpu_mmap_sz(void); int open_path_or_exit(const char *path, int flags) @@ -276,7 +278,6 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->mode = shape.mode; vm->type = shape.type; - vm->subtype = shape.subtype; vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; @@ -433,6 +434,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, slot0 = memslot2region(vm, 0); ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + pr_info("Random seed: 0x%x\n", guest_random_seed); + guest_rng = new_guest_random_state(guest_random_seed); + sync_global_to_guest(vm, guest_rng); + kvm_arch_vm_post_create(vm); return vm; @@ -930,6 +935,10 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } +#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ + __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ + "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") + int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva, uint32_t guest_memfd, uint64_t guest_memfd_offset) @@ -944,6 +953,8 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag .guest_memfd_offset = guest_memfd_offset, }; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); } @@ -970,6 +981,8 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, size_t mem_size = npages * vm->page_size; size_t alignment; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); @@ -2306,6 +2319,8 @@ void __attribute((constructor)) kvm_selftest_init(void) /* Tell stdout not to buffer its content. */ setbuf(stdout, NULL); + guest_random_seed = random(); + kvm_selftest_arch_init(); } diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index cf2c73971308..313277486a1d 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -2,14 +2,13 @@ /* * Copyright (C) 2020, Google LLC. */ -#define _GNU_SOURCE - #include <inttypes.h> #include <linux/bitmap.h> #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "ucall_common.h" struct memstress_args memstress_args; @@ -56,7 +55,7 @@ void memstress_guest_code(uint32_t vcpu_idx) uint64_t page; int i; - rand_state = new_guest_random_state(args->random_seed + vcpu_idx); + rand_state = new_guest_random_state(guest_random_seed + vcpu_idx); gva = vcpu_args->gva; pages = vcpu_args->pages; @@ -76,7 +75,7 @@ void memstress_guest_code(uint32_t vcpu_idx) addr = gva + (page * args->guest_page_size); - if (guest_random_u32(&rand_state) % 100 < args->write_percent) + if (__guest_random_bool(&rand_state, args->write_percent)) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -243,12 +242,6 @@ void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) sync_global_to_guest(vm, memstress_args.write_percent); } -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) -{ - memstress_args.random_seed = random_seed; - sync_global_to_guest(vm, memstress_args.random_seed); -} - void memstress_set_random_access(struct kvm_vm *vm, bool random_access) { memstress_args.random_access = random_access; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index e8211f5d6863..6ae47b3d6b25 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -10,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 @@ -502,3 +503,15 @@ bool guest_sbi_probe_extension(int extid, long *out_val) return true; } + +unsigned long get_host_sbi_spec_version(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0, + 0, 0, 0, 0, 0); + + GUEST_ASSERT(!ret.error); + + return ret.value; +} diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 5a8f8becb129..8ed0b74ae837 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,8 +4,6 @@ * * Copyright (C) 2020, Google LLC. */ - -#define _GNU_SOURCE #include <stdio.h> #include <stdarg.h> #include <assert.h> diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index f5af65a41c29..42151e571953 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -1,9 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "kvm_util.h" #include "linux/types.h" #include "linux/bitmap.h" #include "linux/atomic.h" +#include "kvm_util.h" +#include "ucall_common.h" + + #define GUEST_UCALL_FAILED -1 struct ucall_header { diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index f4eef6eb2dc2..7c9de8414462 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include <inttypes.h> #include <stdio.h> #include <stdlib.h> @@ -16,6 +13,7 @@ #include <poll.h> #include <pthread.h> #include <linux/userfaultfd.h> +#include <sys/epoll.h> #include <sys/syscall.h> #include "kvm_util.h" @@ -27,76 +25,69 @@ static void *uffd_handler_thread_fn(void *arg) { - struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; - int uffd = uffd_desc->uffd; - int pipefd = uffd_desc->pipefds[0]; - useconds_t delay = uffd_desc->delay; + struct uffd_reader_args *reader_args = (struct uffd_reader_args *)arg; + int uffd = reader_args->uffd; int64_t pages = 0; struct timespec start; struct timespec ts_diff; + struct epoll_event evt; + int epollfd; + + epollfd = epoll_create(1); + TEST_ASSERT(epollfd >= 0, "Failed to create epollfd."); + + evt.events = EPOLLIN | EPOLLEXCLUSIVE; + evt.data.u32 = 0; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, uffd, &evt), + "Failed to add uffd to epollfd"); + + evt.events = EPOLLIN; + evt.data.u32 = 1; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, reader_args->pipe, &evt), + "Failed to add pipe to epollfd"); clock_gettime(CLOCK_MONOTONIC, &start); while (1) { struct uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; int r; - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; - pollfd[1].events = POLLIN; + r = epoll_wait(epollfd, &evt, 1, -1); + TEST_ASSERT(r == 1, + "Unexpected number of events (%d) from epoll, errno = %d", + r, errno); - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } - - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } + if (evt.data.u32 == 1) { + char tmp_chr; - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on pipe."); + r = read(reader_args->pipe, &tmp_chr, 1); TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread"); + "Error reading pipefd in uffd reader thread"); break; } - if (!(pollfd[0].revents & POLLIN)) - continue; + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on uffd."); r = read(uffd, &msg, sizeof(msg)); if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd got errno %d\n", errno); - return NULL; + TEST_ASSERT(errno == EAGAIN, + "Error reading from UFFD: errno = %d", errno); + continue; } - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } + TEST_ASSERT(r == sizeof(msg), + "Read on uffd returned unexpected number of bytes (%d)", r); if (!(msg.event & UFFD_EVENT_PAGEFAULT)) continue; - if (delay) - usleep(delay); - r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); - if (r < 0) - return NULL; + if (reader_args->delay) + usleep(reader_args->delay); + r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); + TEST_ASSERT(r >= 0, + "Reader thread handler fn returned negative value %d", r); pages++; } @@ -110,6 +101,7 @@ static void *uffd_handler_thread_fn(void *arg) struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler) { struct uffd_desc *uffd_desc; @@ -118,14 +110,25 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - int ret; + int ret, i; PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); uffd_desc = malloc(sizeof(struct uffd_desc)); - TEST_ASSERT(uffd_desc, "malloc failed"); + TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); + + uffd_desc->pipefds = calloc(sizeof(int), num_readers); + TEST_ASSERT(uffd_desc->pipefds, "Failed to alloc pipes"); + + uffd_desc->readers = calloc(sizeof(pthread_t), num_readers); + TEST_ASSERT(uffd_desc->readers, "Failed to alloc reader threads"); + + uffd_desc->reader_args = calloc(sizeof(struct uffd_reader_args), num_readers); + TEST_ASSERT(uffd_desc->reader_args, "Failed to alloc reader_args"); + + uffd_desc->num_readers = num_readers; /* In order to get minor faults, prefault via the alias. */ if (is_minor) @@ -148,18 +151,28 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == expected_ioctls, "missing userfaultfd ioctls"); - ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!ret, "Failed to set up pipefd"); - - uffd_desc->uffd_mode = uffd_mode; uffd_desc->uffd = uffd; - uffd_desc->delay = delay; - uffd_desc->handler = handler; - pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, - uffd_desc); + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); + ret = pipe2((int *) &pipes, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd %i for uffd_desc %p", + i, uffd_desc); + + uffd_desc->pipefds[i] = pipes[1]; + + uffd_desc->reader_args[i].uffd_mode = uffd_mode; + uffd_desc->reader_args[i].uffd = uffd; + uffd_desc->reader_args[i].delay = delay; + uffd_desc->reader_args[i].handler = handler; + uffd_desc->reader_args[i].pipe = pipes[0]; + + pthread_create(&uffd_desc->readers[i], NULL, uffd_handler_thread_fn, + &uffd_desc->reader_args[i]); + + PER_VCPU_DEBUG("Created uffd thread %i for HVA range [%p, %p)\n", + i, hva, hva + len); + } return uffd_desc; } @@ -167,19 +180,26 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void uffd_stop_demand_paging(struct uffd_desc *uffd) { char c = 0; - int ret; + int i; - ret = write(uffd->pipefds[1], &c, 1); - TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(write(uffd->pipefds[i], &c, 1) == 1, + "Unable to write to pipefd %i for uffd_desc %p", i, uffd); - ret = pthread_join(uffd->thread, NULL); - TEST_ASSERT(ret == 0, "Pthread_join failed."); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(!pthread_join(uffd->readers[i], NULL), + "Pthread_join failed on reader %i for uffd_desc %p", i, uffd); close(uffd->uffd); - close(uffd->pipefds[1]); - close(uffd->pipefds[0]); + for (i = 0; i < uffd->num_readers; ++i) { + close(uffd->pipefds[i]); + close(uffd->reader_args[i].pipe); + } + free(uffd->pipefds); + free(uffd->readers); + free(uffd->reader_args); free(uffd); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 74a4c736c9ae..c664e446136b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -15,14 +15,16 @@ #define NUM_INTERRUPTS 256 #endif -#define DEFAULT_CODE_SELECTOR 0x8 -#define DEFAULT_DATA_SELECTOR 0x10 +#define KERNEL_CS 0x8 +#define KERNEL_DS 0x10 +#define KERNEL_TSS 0x18 #define MAX_NR_CPUID_ENTRIES 100 vm_vaddr_t exception_handlers; bool host_cpu_is_amd; bool host_cpu_is_intel; +bool is_forced_emulation_enabled; static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { @@ -417,7 +419,7 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) { - void *gdt = addr_gva2hva(vm, vm->gdt); + void *gdt = addr_gva2hva(vm, vm->arch.gdt); struct desc64 *desc = gdt + (segp->selector >> 3) * 8; desc->limit0 = segp->limit & 0xFFFF; @@ -437,27 +439,10 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->base3 = segp->base >> 32; } - -/* - * Set Long Mode Flat Kernel Code Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a code segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_CS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed @@ -466,30 +451,12 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, segp->g = true; segp->l = true; segp->present = 1; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } -/* - * Set Long Mode Flat Kernel Data Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a data segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_DS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed @@ -497,8 +464,6 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, */ segp->g = true; segp->present = true; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) @@ -516,72 +481,153 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) -{ - if (!vm->gdt) - vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - - dt->base = vm->gdt; - dt->limit = getpagesize(); -} - -static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector) +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) { - if (!vm->tss) - vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - memset(segp, 0, sizeof(*segp)); - segp->base = vm->tss; + segp->base = base; segp->limit = 0x67; - segp->selector = selector; + segp->selector = KERNEL_TSS; segp->type = 0xb; segp->present = 1; - kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; + TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); - sregs.idt.limit = 0; + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; + + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); - kvm_setup_gdt(vm, &sregs.gdt); + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(&sregs.cs); + kvm_seg_set_kernel_data_64bit(&sregs.ds); + kvm_seg_set_kernel_data_64bit(&sregs.es); + kvm_seg_set_kernel_data_64bit(&sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - switch (vm->mode) { - case VM_MODE_PXXV48_4K: - sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; - sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vcpu, &sregs); +} - kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); - break; +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + regs->r10 = regs->error_code; + return true; +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; } - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vcpu, &sregs); + if (kvm_fixup_exception(regs)) + return; + + ucall_assert(UCALL_UNHANDLED, + "Unhandled exception in guest", __FILE__, __LINE__, + "Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); +} + +static void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + struct kvm_segment seg; + int i; + + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; + + kvm_seg_set_kernel_code_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_kernel_data_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_tss_64bit(vm->arch.tss, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + REPORT_GUEST_ASSERT(uc); } void kvm_arch_vm_post_create(struct kvm_vm *vm) { vm_create_irqchip(vm); + vm_init_descriptor_tables(vm); + sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); + sync_global_to_guest(vm, is_forced_emulation_enabled); + + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; - if (vm->subtype == VM_SUBTYPE_SEV) - sev_vm_init(vm); - else if (vm->subtype == VM_SUBTYPE_SEV_ES) - sev_es_vm_init(vm); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -621,7 +667,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - vcpu_setup(vm, vcpu); + vcpu_init_sregs(vm, vcpu); /* Setup guest general purpose registers */ vcpu_regs_get(vcpu, ®s); @@ -1081,108 +1127,15 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) void kvm_init_vm_address_properties(struct kvm_vm *vm) { - if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) { + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; } } -static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, - int dpl, unsigned short selector) -{ - struct idt_entry *base = - (struct idt_entry *)addr_gva2hva(vm, vm->idt); - struct idt_entry *e = &base[vector]; - - memset(e, 0, sizeof(*e)); - e->offset0 = addr; - e->selector = selector; - e->ist = 0; - e->type = 14; - e->dpl = dpl; - e->p = 1; - e->offset1 = addr >> 16; - e->offset2 = addr >> 32; -} - - -static bool kvm_fixup_exception(struct ex_regs *regs) -{ - if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) - return false; - - if (regs->vector == DE_VECTOR) - return false; - - regs->rip = regs->r11; - regs->r9 = regs->vector; - regs->r10 = regs->error_code; - return true; -} - -void route_exception(struct ex_regs *regs) -{ - typedef void(*handler)(struct ex_regs *); - handler *handlers = (handler *)exception_handlers; - - if (handlers && handlers[regs->vector]) { - handlers[regs->vector](regs); - return; - } - - if (kvm_fixup_exception(regs)) - return; - - ucall_assert(UCALL_UNHANDLED, - "Unhandled exception in guest", __FILE__, __LINE__, - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); -} - -void vm_init_descriptor_tables(struct kvm_vm *vm) -{ - extern void *idt_handlers; - int i; - - vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - /* Handlers have the same address in both address spaces.*/ - for (i = 0; i < NUM_INTERRUPTS; i++) - set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, - DEFAULT_CODE_SELECTOR); -} - -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) -{ - struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; - - vcpu_sregs_get(vcpu, &sregs); - sregs.idt.base = vm->idt; - sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->gdt; - sregs.gdt.limit = getpagesize() - 1; - kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - vcpu_sregs_set(vcpu, &sregs); - *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; -} - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) -{ - vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); - - handlers[vector] = (vm_vaddr_t)handler; -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) - REPORT_GUEST_ASSERT(uc); -} - const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index) { @@ -1344,6 +1297,7 @@ void kvm_selftest_arch_init(void) { host_cpu_is_intel = this_cpu_is_intel(); host_cpu_is_amd = this_cpu_is_amd(); + is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); } bool sys_clocksource_is_based_on_tsc(void) diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index e248d3364b9c..e9535ee20b7f 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <stdint.h> #include <stdbool.h> @@ -35,6 +34,32 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio } } +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) { struct kvm_sev_launch_start launch_start = { @@ -87,28 +112,30 @@ void sev_vm_launch_finish(struct kvm_vm *vm) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); } -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu) { struct vm_shape shape = { - .type = VM_TYPE_DEFAULT, .mode = VM_MODE_DEFAULT, - .subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES : - VM_SUBTYPE_SEV, + .type = type, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; - uint8_t measurement[512]; vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); *cpu = cpus[0]; + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ sev_vm_launch(vm, policy); - /* TODO: Validate the measurement is as expected. */ + if (!measurement) + measurement = alloca(256); + sev_vm_launch_measure(vm, measurement); sev_vm_launch_finish(vm); - - return vm; } diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 1a6da7389bf1..0b9678858b6d 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include <stdio.h> #include <stdlib.h> #include <pthread.h> diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 156361966612..05fcf902e067 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2020, Google, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> #include <sys/syscall.h> diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 0f9cabd99fd4..2c792228ac0b 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -7,13 +7,11 @@ * * Copyright (c) 2024, Intel Corporation. */ - -#define _GNU_SOURCE - #include "arch_timer.h" #include "kvm_util.h" #include "processor.h" #include "timer_test.h" +#include "ucall_common.h" static int timer_irq = IRQ_S_TIMER; @@ -85,7 +83,7 @@ struct kvm_vm *test_vm_create(void) int nr_vcpus = test_args.nr_vcpus; vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)), + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC), "SSTC not available, skipping test\n"); vm_init_vector_tables(vm); diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c new file mode 100644 index 000000000000..823c132069b4 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V KVM ebreak test. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. + * + */ +#include "kvm_util.h" + +#define LABEL_ADDRESS(v) ((uint64_t)&(v)) + +extern unsigned char sw_bp_1, sw_bp_2; +static uint64_t sw_bp_addr; + +static void guest_code(void) +{ + asm volatile( + ".option push\n" + ".option norvc\n" + "sw_bp_1: ebreak\n" + "sw_bp_2: ebreak\n" + ".option pop\n" + ); + GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2)); + + GUEST_DONE(); +} + +static void guest_breakpoint_handler(struct ex_regs *regs) +{ + WRITE_ONCE(sw_bp_addr, regs->epc); + regs->epc += 4; +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t pc; + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE, + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_vector_tables(vm); + vcpu_init_vector_tables(vcpu); + vm_install_exception_handler(vm, EXC_BREAKPOINT, + guest_breakpoint_handler); + + /* + * Enable the guest debug. + * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason. + */ + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); + + /* skip sw_bp_1 */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4); + + /* + * Disable all debug controls. + * Guest should handle the ebreak without exiting to the VMM. + */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index b882b7b9b785..222198dd6d04 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -43,6 +43,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: @@ -408,6 +409,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(V), KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -931,6 +933,7 @@ KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); KVM_ISA_EXT_SIMPLE_CONFIG(h, H); KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN); +KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF); KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC); KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); @@ -986,6 +989,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_fp_d, &config_h, &config_smstateen, + &config_sscofpmf, &config_sstc, &config_svinval, &config_svnapot, diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c new file mode 100644 index 000000000000..69bb94e6b227 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality. + * + * Copyright (c) 2024, Rivos Inc. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" +#include "sbi.h" +#include "arch_timer.h" + +/* Maximum counters(firmware + hardware) */ +#define RISCV_MAX_PMU_COUNTERS 64 +union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; + +/* Snapshot shared memory data */ +#define PMU_SNAPSHOT_GPA_BASE BIT(30) +static void *snapshot_gva; +static vm_paddr_t snapshot_gpa; + +static int vcpu_shared_irq_count; +static int counter_in_use; + +/* Cache the available counters in a bitmask */ +static unsigned long counter_mask_available; + +static bool illegal_handler_invoked; + +#define SBI_PMU_TEST_BASIC BIT(0) +#define SBI_PMU_TEST_EVENTS BIT(1) +#define SBI_PMU_TEST_SNAPSHOT BIT(2) +#define SBI_PMU_TEST_OVERFLOW BIT(3) + +static int disabled_tests; + +unsigned long pmu_csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + switchcase_csr_read_32(CSR_CYCLEH, ret) + default : + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static inline void dummy_func_loop(uint64_t iter) +{ + int i = 0; + + while (i < iter) { + asm volatile("nop"); + i++; + } +} + +static void start_counter(unsigned long counter, unsigned long start_flags, + unsigned long ival) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags, + ival, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter); +} + +/* This should be invoked only for reset counter use case */ +static void stop_reset_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, + stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld\n", counter); +} + +static void stop_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, + 0, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", + counter, ret.error); +} + +static void guest_illegal_exception_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL, + "Unexpected exception handler %lx\n", regs->cause); + + illegal_handler_invoked = true; + /* skip the trapping instruction */ + regs->epc += 4; +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + unsigned long overflown_mask; + unsigned long counter_val = 0; + + /* Validate that we are in the correct irq handler */ + GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); + + /* Stop all counters first to avoid further interrupts */ + stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF)); + + overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask); + GUEST_ASSERT(overflown_mask & 0x01); + + WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); + + counter_val = READ_ONCE(snapshot_data->ctr_values[0]); + /* Now start the counter to mimick the real driver behavior */ + start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); +} + +static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, + unsigned long cflags, + unsigned long event) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask, + cflags, event, 0, 0); + __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error); + GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS); + GUEST_ASSERT(BIT(ret.value) & counter_mask_available); + + return ret.value; +} + +static unsigned long get_num_counters(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0); + + __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU"); + __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS, + "Invalid number of counters %ld\n", ret.value); + + return ret.value; +} + +static void update_counter_info(int num_counters) +{ + int i = 0; + struct sbiret ret; + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo_arr[i].value = ret.value; + counter_mask_available |= BIT(i); + } + + GUEST_ASSERT(counter_mask_available > 0); +} + +static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == 0); + return ret.value; +} + +static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + unsigned long counter_val = 0; + + __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type); + + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) + counter_val = pmu_csr_read_num(ctrinfo.csr); + else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) + counter_val = read_fw_counter(idx, ctrinfo); + + return counter_val; +} + +static inline void verify_sbi_requirement_assert(void) +{ + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + if (get_host_sbi_spec_version() < sbi_mk_version(2, 0)) + __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot"); +} + +static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? -1 : 0; +#endif + struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + +static void test_pmu_event(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2); + stop_counter(counter, 0); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + stop_counter(counter, 0); + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_snapshot(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + /* The counter value is updated w.r.t relative index of cbase */ + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]); + + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_overflow(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_post; + unsigned long counter_init_value = ULONG_MAX - 10000; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_in_use = counter; + + /* The counter value is updated w.r.t relative index of cbase passed to start/stop */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + udelay(msecs_to_usecs(2000)); + /* irq handler should have stopped the counter */ + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + /* The counter value after stopping should be less the init value due to overflow */ + __GUEST_ASSERT(counter_value_post < counter_init_value, + "counter_value_post %lx counter_init_value %lx for counter\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_invalid_event(void) +{ + struct sbiret ret; + unsigned long event = 0x1234; /* A random event */ + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0, + counter_mask_available, 0, event, 0, 0); + GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED); +} + +static void test_pmu_events(void) +{ + int num_counters = 0; + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Sanity testing for any random invalid event */ + test_invalid_event(); + + /* Only these two events are guaranteed to be present */ + test_pmu_event(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_basic_sanity(void) +{ + long out_val = 0; + bool probe; + struct sbiret ret; + int num_counters = 0, i; + union sbi_pmu_ctr_info ctrinfo; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + num_counters = get_num_counters(); + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, + 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo.value = ret.value; + + /** + * Accessibility check of hardware and read capability of firmware counters. + * The spec doesn't mandate any initial value. No need to check any value. + */ + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) { + pmu_csr_read_num(ctrinfo.csr); + GUEST_ASSERT(illegal_handler_invoked); + } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) { + read_fw_counter(i, ctrinfo); + } + } + + GUEST_DONE(); +} + +static void test_pmu_events_snaphost(void) +{ + int num_counters = 0; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + int i; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Validate shared memory access */ + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0); + for (i = 0; i < num_counters; i++) { + if (counter_mask_available & (BIT(i))) + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0); + } + /* Only these two events are guranteed to be present */ + test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_events_overflow(void) +{ + int num_counters = 0; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + csr_set(CSR_IE, BIT(IRQ_PMU_OVF)); + local_irq_enable(); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* + * Qemu supports overflow for cycle/instruction. + * This test may fail on any platform that do not support overflow for these two events. + */ + test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1); + + test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + case UCALL_SYNC: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +void test_vm_destroy(struct kvm_vm *vm) +{ + memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS); + counter_mask_available = 0; + kvm_vm_free(vm); +} + +static void test_vm_basic_test(void *guest_code) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + vm_init_vector_tables(vm); + /* Illegal instruction handler is required to verify read access without configuration */ + vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler); + + vcpu_init_vector_tables(vcpu); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu = NULL; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + /* PMU Snapshot requires single page only */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE, 1, 1, 0); + /* PMU_SNAPSHOT_GPA_BASE is identity mapped */ + virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1); + + snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE); + snapshot_gpa = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gpa); +} + +static void test_vm_events_snapshot_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_overflow(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF), + "Sscofpmf is not available, skipping overflow test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + vcpu_init_vector_tables(vcpu); + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-d <test name>]\n", name); + pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hd:")) != -1) { + switch (opt) { + case 'd': + if (!strncmp("basic", optarg, 5)) + disabled_tests |= SBI_PMU_TEST_BASIC; + else if (!strncmp("events", optarg, 6)) + disabled_tests |= SBI_PMU_TEST_EVENTS; + else if (!strncmp("snapshot", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + else if (!strncmp("overflow", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_OVERFLOW; + else + goto done; + break; + case 'h': + default: + goto done; + } + } + + return true; +done: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 28f97fb52044..e5898678bfab 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -1,5 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ + +/* + * Include rseq.c without _GNU_SOURCE defined, before including any headers, so + * that rseq.c is compiled with its configuration, not KVM selftests' config. + */ +#undef _GNU_SOURCE +#include "../rseq/rseq.c" +#define _GNU_SOURCE + #include <errno.h> #include <fcntl.h> #include <pthread.h> @@ -19,8 +27,7 @@ #include "kvm_util.h" #include "processor.h" #include "test_util.h" - -#include "../rseq/rseq.c" +#include "ucall_common.h" /* * Any bug related to task migration is likely to be timing-dependent; perform @@ -186,12 +193,35 @@ static void calc_min_max_cpu(void) "Only one usable CPU, task migration not possible"); } +static void help(const char *name) +{ + puts(""); + printf("usage: %s [-h] [-u]\n", name); + printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + puts(""); + exit(0); +} + int main(int argc, char *argv[]) { + bool skip_sanity_check = false; int r, i, snapshot; struct kvm_vm *vm; struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; + int opt; + + while ((opt = getopt(argc, argv, "hu")) != -1) { + switch (opt) { + case 'u': + skip_sanity_check = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, @@ -254,9 +284,17 @@ int main(int argc, char *argv[]) * getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a fairly * conservative ratio on x86-64, which can do _more_ KVM_RUNs than * migrations given the 1us+ delay in the migration task. + * + * Another reason why it may have small migration:KVM_RUN ratio is that, + * on systems with large low power mode wakeup latency, it may happen + * quite often that the scheduler is not able to wake up the target CPU + * before the vCPU thread is scheduled to another CPU. */ - TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), - "Only performed %d KVM_RUNs, task stalled too much?", i); + TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n\n" + " Try disabling deep sleep states to reduce CPU wakeup latency,\n" + " e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" + " or run with -u to disable this sanity check.", i); pthread_join(migration_thread, NULL); diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index 626a2b8a2037..b39033844756 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -7,8 +7,6 @@ * Authors: * Nico Boehr <nrb@linux.ibm.com> */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -18,6 +16,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define MAIN_PAGE_COUNT 512 diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 48cb910e660d..f2df7416be84 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -15,6 +15,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" enum mop_target { LOGICAL, diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 43fb25ddc3ec..53def355ccba 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -10,8 +10,6 @@ * * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index c73f948c9b63..7a742a673b7c 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -8,6 +8,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index bd57d991e27d..bb8002084f52 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <pthread.h> #include <sched.h> @@ -221,8 +220,20 @@ static void test_move_memory_region(void) static void guest_code_delete_memory_region(void) { + struct desc_ptr idt; uint64_t val; + /* + * Clobber the IDT so that a #PF due to the memory region being deleted + * escalates to triple-fault shutdown. Because the memory region is + * deleted, there will be no valid mappings. As a result, KVM will + * repeatedly intercepts the state-2 page fault that occurs when trying + * to vector the guest's #PF. I.e. trying to actually handle the #PF + * in the guest will never succeed, and so isn't an option. + */ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + GUEST_SYNC(0); /* Spin until the memory region is deleted. */ @@ -339,7 +350,7 @@ static void test_invalid_memory_region_flags(void) #ifdef __x86_64__ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); else #endif vm = vm_create_barebones(); @@ -462,7 +473,7 @@ static void test_add_private_memory_region(void) pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); @@ -471,7 +482,7 @@ static void test_add_private_memory_region(void) test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); close(memfd); - vm2 = vm_create_barebones_protected_vm(); + vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); @@ -499,7 +510,7 @@ static void test_add_overlapping_private_memory_regions(void) pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f82..a8d3afa0b86b 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -4,20 +4,22 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE #include <stdio.h> #include <time.h> #include <sched.h> #include <pthread.h> #include <linux/kernel.h> #include <asm/kvm.h> -#ifndef __riscv +#ifdef __riscv +#include "sbi.h" +#else #include <asm/kvm_para.h> #endif #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) @@ -83,20 +85,18 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - int i; - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d %d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -199,10 +199,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } #elif defined(__riscv) @@ -366,7 +366,9 @@ int main(int ac, char **av) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); + ksft_print_header(); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { @@ -407,14 +409,15 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? + " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + /* Print results and exit() accordingly */ + ksft_finished(); } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index eae521f050e0..903940c54d2d 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -6,8 +6,6 @@ * * Tests for amx #NM exception and save/restore. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -246,8 +244,6 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); /* Register #NM handler */ - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); /* amx cfg for guest_code */ diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c index ee3b384b991c..2929c067c207 100644 --- a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -17,6 +17,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #define VCPUS 2 #define SLOTS 2 diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c index 6c2e5e0ceb1f..81055476d394 100644 --- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -4,12 +4,9 @@ * * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" - #include "test_util.h" +#include "ucall_common.h" #define MMIO_GPA 0x700000000 #define MMIO_GVA MMIO_GPA diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index f3c2239228b1..762628f7d4ba 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -110,8 +110,6 @@ static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) { struct kvm_vm *vm = vcpu->vm; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); if (disable_quirk) diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c index df351ae17029..10b1b0ba374e 100644 --- a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Google LLC. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <sys/ioctl.h> #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 5c27efbf405e..4f5881d4ef66 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -7,8 +7,6 @@ * This work is licensed under the terms of the GNU GPL, version 2. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index 4c7257ecd2a6..e192720bfe14 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -4,7 +4,6 @@ * * Tests for Enlightened VMCS, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -258,8 +257,6 @@ int main(int argc, char *argv[]) vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index b923a285e96f..068e9c69710d 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -156,9 +156,6 @@ static void guest_test_msrs_access(void) vcpu_init_cpuid(vcpu, prev_cpuid); } - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* TODO: Make this entire test easier to maintain. */ if (stage >= 21) vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); @@ -532,9 +529,6 @@ static void guest_test_hcalls_access(void) while (true) { vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* Hypercall input/output */ hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c index f1617762c22f..22c0c124582f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <pthread.h> #include <inttypes.h> @@ -256,16 +254,13 @@ int main(int argc, char *argv[]) hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - vm_init_descriptor_tables(vm); vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); - vcpu_init_descriptor_tables(vcpu[1]); vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); vcpu_set_hv_cpuid(vcpu[1]); vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); - vcpu_init_descriptor_tables(vcpu[2]); vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); vcpu_set_hv_cpuid(vcpu[2]); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index c9b18707edc0..b987a3d79715 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -4,7 +4,6 @@ * * Tests for Hyper-V extensions to SVM. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c index 05b56095cf76..077cd0ec3040 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <asm/barrier.h> #include <pthread.h> #include <inttypes.h> diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 40cc59f4e650..78878b3a2725 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -183,9 +183,6 @@ int main(void) vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - enter_guest(vcpu); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 853802641e1e..2b550eff35f1 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -75,14 +75,12 @@ int main(int argc, char *argv[]) struct ucall uc; int testcase; + TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c index 3670331adf21..3eb0313ffa39 100644 --- a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "test_util.h" #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 17bbb96fc4df..e7efb2b35f8b 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -5,9 +5,6 @@ * * Copyright (C) 2022, Google LLC. */ - -#define _GNU_SOURCE - #include <fcntl.h> #include <stdint.h> #include <time.h> diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh index 7cbb409801ee..caad084b8bfd 100755 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -13,10 +13,21 @@ NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_reco NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) +# If we're already root, the host might not have sudo. +if [ $(whoami) == "root" ]; then + function do_sudo () { + "$@" + } +else + function do_sudo () { + sudo "$@" + } +fi + set +e function sudo_echo () { - echo "$1" | sudo tee -a "$2" > /dev/null + echo "$1" | do_sudo tee -a "$2" > /dev/null } NXECUTABLE="$(dirname $0)/nx_huge_pages_test" diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 87011965dc41..eda88080c186 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -9,8 +9,6 @@ * Verifies expected behavior of controlling guest access to * MSR_PLATFORM_INFO. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -26,36 +24,18 @@ static void guest_code(void) { uint64_t msr_platform_info; + uint8_t vector; - for (;;) { - msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_SYNC(msr_platform_info); - asm volatile ("inc %r11"); - } -} - -static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + GUEST_SYNC(true); + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - get_ucall(vcpu, &uc); - TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu", uc.cmd); - TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == - MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); -} + GUEST_SYNC(false); + vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); + GUEST_ASSERT_EQ(vector, GP_VECTOR); -static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) -{ - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -63,6 +43,7 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t msr_platform_info; + struct ucall uc; TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); @@ -71,8 +52,26 @@ int main(int argc, char *argv[]) msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_enabled(vcpu); - test_msr_platform_info_disabled(vcpu); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } + } + +done: vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 26c85815f7e9..96446134c00b 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Tencent, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <x86intrin.h> #include "pmu.h" @@ -21,7 +19,6 @@ static uint8_t kvm_pmu_version; static bool kvm_has_perf_caps; -static bool is_forced_emulation_enabled; static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code, @@ -31,11 +28,7 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vm *vm; vm = vm_create_with_one_vcpu(vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(*vcpu); - sync_global_to_guest(vm, kvm_pmu_version); - sync_global_to_guest(vm, is_forced_emulation_enabled); /* * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling @@ -630,7 +623,6 @@ int main(int argc, char *argv[]) kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); - is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); test_intel_counters(); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 3c85d1ae9893..26b3e7efe5dd 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -9,9 +9,6 @@ * Verifies the expected behavior of allow lists and deny lists for * virtual PMU events. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "kvm_util.h" #include "pmu.h" #include "processor.h" @@ -337,9 +334,6 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); vcpu = vm_vcpu_add(vm, 0, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_ASSERT(!sanity_check_pmu(vcpu), "Guest should not be able to use disabled PMU."); @@ -876,9 +870,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_REQUIRE(sanity_check_pmu(vcpu)); if (use_amd_pmu()) diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c index e0f642d2a3c4..82a8d88b5338 100644 --- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2022, Google LLC. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <limits.h> #include <pthread.h> diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 366cf18600bc..d691d86e5bc3 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 3610981d9162..c021c0795a96 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -10,7 +10,6 @@ * That bug allowed a user-mode program that called the KVM_SET_SREGS * ioctl to put a VCPU's local APIC into an invalid state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c new file mode 100644 index 000000000000..7a4a61be119b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm.h> +#include <linux/psp-sev.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) + */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. + */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & (1u << i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & (1u << i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 026779f3ed06..7c70c0da4fb7 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <math.h> #include "test_util.h" #include "kvm_util.h" @@ -13,6 +14,8 @@ #include "sev.h" +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + static void guest_sev_es_code(void) { /* TODO: Check CPUID after GHCB-based hypercall support is added. */ @@ -35,13 +38,98 @@ static void guest_sev_code(void) GUEST_DONE(); } +/* Stash state passed via VMSA before any compiled code runs. */ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $-1, %eax\n" + "mov $-1, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + struct kvm_sregs sregs; + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = XFEATURE_MASK_X87_AVX, + }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_xcrs_set(vcpu, &xcrs); + asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + static void test_sev(void *guest_code, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu); + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. */ + vm_sev_launch(vm, policy, NULL); for (;;) { vcpu_run(vcpu); @@ -82,6 +170,12 @@ int main(int argc, char *argv[]) if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } } return 0; diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 416207c38a17..fabeeaddfb3a 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -5,9 +5,6 @@ * Test that KVM emulates instructions in response to EPT violations when * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" #include "test_util.h" @@ -60,9 +57,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index e18b86666e1f..55c88d664a94 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -4,7 +4,6 @@ * * Tests for SMM. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 88b58aab7207..1c756db329e5 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -6,7 +6,6 @@ * * Tests for vCPU state save/restore, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 32bef39bec21..916e04248fbb 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -93,9 +93,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c index d6fcdcc3af31..00135cbba35e 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c @@ -48,12 +48,9 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 2, svm_gva, vm->idt); + vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 0c7ce3d4e83a..7b6481d6c0d3 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -152,9 +152,6 @@ static void run_test(bool is_nmi) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); vm_install_exception_handler(vm, INT_NR, guest_int_handler); @@ -166,7 +163,7 @@ static void run_test(bool is_nmi) idt_alt_vm = vm_vaddr_alloc_page(vm); idt_alt = addr_gva2hva(vm, idt_alt_vm); - idt = addr_gva2hva(vm, vm->idt); + idt = addr_gva2hva(vm, vm->arch.idt); memcpy(idt_alt, idt, getpagesize()); } else { idt_alt_vm = 0; diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index adb5593daf48..8fa3948b0170 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -8,8 +8,6 @@ * including requesting an invalid register set, updates to/from values * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index dcbb3c29fb8e..57f157c06b39 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -17,14 +17,11 @@ * delivered into the guest or not. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <pthread.h> #include <inttypes.h> #include <string.h> #include <time.h> -#include "kvm_util_base.h" #include "kvm_util.h" #include "mce.h" #include "processor.h" @@ -285,10 +282,6 @@ int main(int argc, char *argv[]) cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(ucna_vcpu); - vcpu_init_descriptor_tables(cmcidis_vcpu); - vcpu_init_descriptor_tables(cmci_vcpu); vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index f4f61a2d2464..32b2794b78fe 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -4,8 +4,6 @@ * * Tests for exiting into userspace on registered MSRs */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <sys/ioctl.h> #include "kvm_test_harness.h" @@ -13,8 +11,6 @@ #include "kvm_util.h" #include "vmx.h" -static bool fep_available; - #define MSR_NON_EXISTENT 0x474f4f00 static u64 deny_bits = 0; @@ -258,7 +254,7 @@ static void guest_code_filter_allow(void) GUEST_ASSERT(data == 2); GUEST_ASSERT(guest_exception_count == 0); - if (fep_available) { + if (is_forced_emulation_enabled) { /* Let userspace know we aren't done. */ GUEST_SYNC(0); @@ -520,8 +516,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) uint64_t cmd; int rc; - sync_global_to_guest(vm, fep_available); - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); @@ -531,9 +525,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ @@ -551,7 +542,7 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vcpu_run(vcpu); cmd = process_ucall(vcpu); - if (fep_available) { + if (is_forced_emulation_enabled) { TEST_ASSERT_EQ(cmd, UCALL_SYNC); vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); @@ -774,7 +765,5 @@ KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) int main(int argc, char *argv[]) { - fep_available = kvm_is_forced_emulation_enabled(); - return test_harness_run(argc, argv); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 977948fd52e6..fa512d033205 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> #include <linux/bitmap.h> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index fad3634fd9eb..3fd6eceab46f 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -115,9 +115,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); get_set_sigalrm_vcpu(vcpu); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index ea0cb3cae0f7..7c92536551cc 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -10,7 +10,6 @@ * and check it can be retrieved with KVM_GET_MSR, also test * the invalid LBR formats are rejected. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <sys/ioctl.h> #include <linux/bitmap.h> @@ -86,9 +85,6 @@ KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) struct ucall uc; int r, i; - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); vcpu_args_set(vcpu, 1, host_cap.capabilities); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index affc32800158..00dd2ac07a61 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -9,7 +9,6 @@ * value instead of partially decayed timer value * */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 725c206ba0b9..a76078a08ff8 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -19,8 +19,6 @@ * Migration is a command line option. When used on non-numa machines will * exit with error. Test is still usefull on non-numa for testing IPIs. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <getopt.h> #include <pthread.h> #include <inttypes.h> @@ -410,8 +408,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(params[0].vcpu); vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index ab75b873a4ad..69849acd95b0 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c index 25a0b0db5c3c..95ce192d0753 100644 --- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c @@ -109,9 +109,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index d2ea0435f4f7..a59b3c799bb2 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -125,7 +125,7 @@ struct compat_vcpu_runstate_info { uint32_t state; uint64_t state_entry_time; uint64_t time[5]; -} __attribute__((__packed__));; +} __attribute__((__packed__)); struct arch_vcpu_info { unsigned long cr2; @@ -171,8 +171,9 @@ static volatile bool guest_saw_irq; static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; - vi->evtchn_upcall_pending = 0; - vi->evtchn_pending_sel = 0; + + vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); + vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); guest_saw_irq = true; GUEST_SYNC(TEST_GUEST_SAW_IRQ); @@ -380,20 +381,6 @@ wait_for_timer: GUEST_SYNC(TEST_DONE); } -static int cmp_timespec(struct timespec *a, struct timespec *b) -{ - if (a->tv_sec > b->tv_sec) - return 1; - else if (a->tv_sec < b->tv_sec) - return -1; - else if (a->tv_nsec > b->tv_nsec) - return 1; - else if (a->tv_nsec < b->tv_nsec) - return -1; - else - return 0; -} - static struct shared_info *shinfo; static struct vcpu_info *vinfo; static struct kvm_vcpu *vcpu; @@ -449,7 +436,6 @@ static void *juggle_shinfo_state(void *arg) int main(int argc, char *argv[]) { - struct timespec min_ts, max_ts, vm_ts; struct kvm_xen_hvm_attr evt_reset; struct kvm_vm *vm; pthread_t thread; @@ -468,8 +454,6 @@ int main(int argc, char *argv[]) bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); - clock_gettime(CLOCK_REALTIME, &min_ts); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Map a region for the shared_info page */ @@ -553,8 +537,6 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); if (do_runstate_tests) { @@ -1010,7 +992,6 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); alarm(0); - clock_gettime(CLOCK_REALTIME, &max_ts); /* * Just a *really* basic check that things are being put in the @@ -1019,6 +1000,8 @@ int main(int argc, char *argv[]) */ struct pvclock_wall_clock *wc; struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); @@ -1034,12 +1017,34 @@ int main(int argc, char *argv[]) ti2->tsc_shift, ti2->flags); } - vm_ts.tv_sec = wc->sec; - vm_ts.tv_nsec = wc->nsec; TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); - TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); - TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); + + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + + if (kcdata.flags & KVM_CLOCK_REALTIME) { + if (verbose) { + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", + kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", + kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); + } + + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + + /* + * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but + * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s + * delta as testing clock accuracy is not the goal here. The test just needs to + * check that the value in shinfo is somewhat sane. + */ + TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); + } else { + pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); + } TEST_ASSERT(ti->version && !(ti->version & 1), "Bad time_info version %x", ti->version); diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 167c97abff1b..f331a4e9bae3 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -4,8 +4,6 @@ * * Tests for the IA32_XSS MSR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <sys/ioctl.h> #include "test_util.h" diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index a6f89aaea77d..3c1e9f35b531 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 9a6036fbf289..6b5a9ff88c3d 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -8,21 +8,34 @@ */ #define _GNU_SOURCE +#include <asm/termbits.h> #include <fcntl.h> +#include <libgen.h> +#include <linux/fiemap.h> #include <linux/landlock.h> #include <linux/magic.h> #include <sched.h> +#include <stddef.h> #include <stdio.h> #include <string.h> #include <sys/capability.h> +#include <sys/ioctl.h> #include <sys/mount.h> #include <sys/prctl.h> #include <sys/sendfile.h> +#include <sys/socket.h> #include <sys/stat.h> #include <sys/sysmacros.h> +#include <sys/un.h> #include <sys/vfs.h> #include <unistd.h> +/* + * Intentionally included last to work around header conflict. + * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. + */ +#include <linux/fs.h> + #include "common.h" #ifndef renameat2 @@ -285,15 +298,21 @@ static void prepare_layout_opt(struct __test_metadata *const _metadata, static void prepare_layout(struct __test_metadata *const _metadata) { - _metadata->teardown_parent = true; - prepare_layout_opt(_metadata, &mnt_tmp); } static void cleanup_layout(struct __test_metadata *const _metadata) { set_cap(_metadata, CAP_SYS_ADMIN); - EXPECT_EQ(0, umount(TMP_DIR)); + if (umount(TMP_DIR)) { + /* + * According to the test environment, the mount point of the + * current directory may be shared or not, which changes the + * visibility of the nested TMP_DIR mount point for the test's + * parent process doing this cleanup. + */ + ASSERT_EQ(EINVAL, errno); + } clear_cap(_metadata, CAP_SYS_ADMIN); EXPECT_EQ(0, remove_path(TMP_DIR)); } @@ -307,7 +326,7 @@ FIXTURE_SETUP(layout0) prepare_layout(_metadata); } -FIXTURE_TEARDOWN(layout0) +FIXTURE_TEARDOWN_PARENT(layout0) { cleanup_layout(_metadata); } @@ -370,7 +389,7 @@ FIXTURE_SETUP(layout1) create_layout1(_metadata); } -FIXTURE_TEARDOWN(layout1) +FIXTURE_TEARDOWN_PARENT(layout1) { remove_layout1(_metadata); @@ -529,9 +548,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL_DEV #define ACCESS_ALL ( \ ACCESS_FILE | \ @@ -736,6 +756,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -3444,7 +3467,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3527,7 +3550,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3683,7 +3706,7 @@ FIXTURE_SETUP(ftruncate) create_file(_metadata, file1_s1d1); } -FIXTURE_TEARDOWN(ftruncate) +FIXTURE_TEARDOWN_PARENT(ftruncate) { EXPECT_EQ(0, remove_path(file1_s1d1)); cleanup_layout(_metadata); @@ -3753,7 +3776,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3830,22 +3853,471 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } -TEST(memfd_ftruncate) +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) { - int fd; + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} - fd = memfd_create("name", MFD_CLOEXEC); +TEST(memfd_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; + + /* + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. + */ + for (i = 0; i < 2; i++) { + /* Creates a new memfd. */ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); + + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } +} + +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); ASSERT_LE(0, fd); + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). */ - EXPECT_EQ(0, test_ftruncate(fd)); + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); ASSERT_EQ(0, close(fd)); } +/* + * ioctl_error - generically call the given ioctl with a pointer to a + * sufficiently large zeroed-out memory region. + * + * Returns the IOCTLs error, or 0. + */ +static int ioctl_error(struct __test_metadata *const _metadata, int fd, + unsigned int cmd) +{ + char buf[128]; /* sufficiently large */ + int res, stdinbak_fd; + + /* + * Depending on the IOCTL command, parts of the zeroed-out buffer might + * be interpreted as file descriptor numbers. We do not want to + * accidentally operate on file descriptor 0 (stdin), so we temporarily + * move stdin to a different FD and close FD 0 for the IOCTL call. + */ + stdinbak_fd = dup(0); + ASSERT_LT(0, stdinbak_fd); + ASSERT_EQ(0, close(0)); + + /* Invokes the IOCTL with a zeroed-out buffer. */ + bzero(&buf, sizeof(buf)); + res = ioctl(fd, cmd, &buf); + + /* Restores the old FD 0 and closes the backup FD. */ + ASSERT_EQ(0, dup2(stdinbak_fd, 0)); + ASSERT_EQ(0, close(stdinbak_fd)); + + if (res < 0) + return errno; + + return 0; +} + +/* Define some linux/falloc.h IOCTL commands which are not available in uapi headers. */ +struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ +}; + +#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) +#define FS_IOC_UNRESVSP _IOW('X', 41, struct space_resv) +#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) +#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) + +/* + * Tests a series of blanket-permitted and denied IOCTLs. + */ +TEST_F_FORK(layout1, blanket_permitted_ioctls) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + int ruleset_fd, fd; + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + fd = open("/dev/null", O_RDWR | O_CLOEXEC); + ASSERT_LE(0, fd); + + /* + * Checks permitted commands. + * These ones may return errors, but should not be blocked by Landlock. + */ + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONBIO)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOASYNC)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOQSIZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIFREEZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FITHAW)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_FIEMAP)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIGETBSZ)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIDEDUPERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSUUID)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSSYSFSPATH)); + + /* + * Checks blocked commands. + * A call to a blocked IOCTL command always returns EACCES. + */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIONREAD)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_SETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSGETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSSETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIBMAP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_ZERO_RANGE)); + + /* Default case is also blocked. */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, 0xc00ffeee)); + + ASSERT_EQ(0, close(fd)); +} + +/* + * Named pipes are not governed by the LANDLOCK_ACCESS_FS_IOCTL_DEV right, + * because they are not character or block devices. + */ +TEST_F_FORK(layout1, named_pipe_ioctl) +{ + pid_t child_pid; + int fd, ruleset_fd; + const char *const path = file1_s1d1; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + ASSERT_EQ(0, unlink(path)); + ASSERT_EQ(0, mkfifo(path, 0600)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* The child process opens the pipe for writing. */ + child_pid = fork(); + ASSERT_NE(-1, child_pid); + if (child_pid == 0) { + fd = open(path, O_WRONLY); + close(fd); + exit(0); + } + + fd = open(path, O_RDONLY); + ASSERT_LE(0, fd); + + /* FIONREAD is implemented by pipefifo_fops. */ + EXPECT_EQ(0, test_fionread_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + ASSERT_EQ(0, unlink(path)); + + ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); +} + +/* For named UNIX domain sockets, no IOCTL restrictions apply. */ +TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) +{ + const char *const path = file1_s1d1; + int srv_fd, cli_fd, ruleset_fd; + socklen_t size; + struct sockaddr_un srv_un, cli_un; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + /* Sets up a server */ + srv_un.sun_family = AF_UNIX; + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + + ASSERT_EQ(0, unlink(path)); + srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, srv_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Sets up a client connection to it */ + cli_un.sun_family = AF_UNIX; + cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, cli_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); + + bzero(&cli_un, sizeof(cli_un)); + cli_un.sun_family = AF_UNIX; + strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + + ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + + /* FIONREAD and other IOCTLs should not be forbidden. */ + EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); + + ASSERT_EQ(0, close(cli_fd)); +} + +/* clang-format off */ +FIXTURE(ioctl) {}; + +FIXTURE_SETUP(ioctl) {}; + +FIXTURE_TEARDOWN(ioctl) {}; +/* clang-format on */ + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * FIONREAD is used as a characteristic device-specific IOCTL command. + * It is implemented in fs/ioctl.c for regular files, + * but we do not blanket-permit it for devices. + */ + const int expected_fionread_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = 0, + .open_mode = O_RDWR, + .expected_fionread_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd); + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open("/dev", O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands return the expected errors. + * We do not use the expected values from the fixture here. + * + * When using IOCTL on a directory, no Landlock restrictions apply. + */ + EXPECT_EQ(0, test_fionread_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev/zero", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd) + { + TH_LOG("Failed to open /dev/zero: %s", strerror(errno)); + } + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + /* clang-format off */ FIXTURE(layout1_bind) {}; /* clang-format on */ @@ -3861,7 +4333,7 @@ FIXTURE_SETUP(layout1_bind) clear_cap(_metadata, CAP_SYS_ADMIN); } -FIXTURE_TEARDOWN(layout1_bind) +FIXTURE_TEARDOWN_PARENT(layout1_bind) { /* umount(dir_s2d2)) is handled by namespace lifetime. */ @@ -4266,7 +4738,7 @@ FIXTURE_SETUP(layout2_overlay) clear_cap(_metadata, CAP_SYS_ADMIN); } -FIXTURE_TEARDOWN(layout2_overlay) +FIXTURE_TEARDOWN_PARENT(layout2_overlay) { if (self->skip_test) SKIP(return, "overlayfs is not supported (teardown)"); @@ -4616,7 +5088,6 @@ FIXTURE(layout3_fs) { bool has_created_dir; bool has_created_file; - char *dir_path; bool skip_test; }; @@ -4675,11 +5146,24 @@ FIXTURE_VARIANT_ADD(layout3_fs, hostfs) { .cwd_fs_magic = HOSTFS_SUPER_MAGIC, }; +static char *dirname_alloc(const char *path) +{ + char *dup; + + if (!path) + return NULL; + + dup = strdup(path); + if (!dup) + return NULL; + + return dirname(dup); +} + FIXTURE_SETUP(layout3_fs) { struct stat statbuf; - const char *slash; - size_t dir_len; + char *dir_path = dirname_alloc(variant->file_path); if (!supports_filesystem(variant->mnt.type) || !cwd_matches_fs(variant->cwd_fs_magic)) { @@ -4687,27 +5171,15 @@ FIXTURE_SETUP(layout3_fs) SKIP(return, "this filesystem is not supported (setup)"); } - _metadata->teardown_parent = true; - - slash = strrchr(variant->file_path, '/'); - ASSERT_NE(slash, NULL); - dir_len = (size_t)slash - (size_t)variant->file_path; - ASSERT_LT(0, dir_len); - self->dir_path = malloc(dir_len + 1); - self->dir_path[dir_len] = '\0'; - strncpy(self->dir_path, variant->file_path, dir_len); - prepare_layout_opt(_metadata, &variant->mnt); /* Creates directory when required. */ - if (stat(self->dir_path, &statbuf)) { + if (stat(dir_path, &statbuf)) { set_cap(_metadata, CAP_DAC_OVERRIDE); - EXPECT_EQ(0, mkdir(self->dir_path, 0700)) + EXPECT_EQ(0, mkdir(dir_path, 0700)) { TH_LOG("Failed to create directory \"%s\": %s", - self->dir_path, strerror(errno)); - free(self->dir_path); - self->dir_path = NULL; + dir_path, strerror(errno)); } self->has_created_dir = true; clear_cap(_metadata, CAP_DAC_OVERRIDE); @@ -4728,9 +5200,11 @@ FIXTURE_SETUP(layout3_fs) self->has_created_file = true; clear_cap(_metadata, CAP_DAC_OVERRIDE); } + + free(dir_path); } -FIXTURE_TEARDOWN(layout3_fs) +FIXTURE_TEARDOWN_PARENT(layout3_fs) { if (self->skip_test) SKIP(return, "this filesystem is not supported (teardown)"); @@ -4746,16 +5220,17 @@ FIXTURE_TEARDOWN(layout3_fs) } if (self->has_created_dir) { + char *dir_path = dirname_alloc(variant->file_path); + set_cap(_metadata, CAP_DAC_OVERRIDE); /* * Don't check for error because the directory might already * have been removed (cf. release_inode test). */ - rmdir(self->dir_path); + rmdir(dir_path); clear_cap(_metadata, CAP_DAC_OVERRIDE); + free(dir_path); } - free(self->dir_path); - self->dir_path = NULL; cleanup_layout(_metadata); } @@ -4822,7 +5297,10 @@ TEST_F_FORK(layout3_fs, tag_inode_dir_mnt) TEST_F_FORK(layout3_fs, tag_inode_dir_child) { - layer3_fs_tag_inode(_metadata, self, variant, self->dir_path); + char *dir_path = dirname_alloc(variant->file_path); + + layer3_fs_tag_inode(_metadata, self, variant, dir_path); + free(dir_path); } TEST_F_FORK(layout3_fs, tag_inode_file) @@ -4849,9 +5327,13 @@ TEST_F_FORK(layout3_fs, release_inodes) if (self->has_created_file) EXPECT_EQ(0, remove_path(variant->file_path)); - if (self->has_created_dir) + if (self->has_created_dir) { + char *dir_path = dirname_alloc(variant->file_path); + /* Don't check for error because of cgroup specificities. */ - remove_path(self->dir_path); + remove_path(dir_path); + free(dir_path); + } ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_DIR, layer1); diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index da2cade3bab0..429535816dbd 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,6 +7,8 @@ else ifneq ($(filter -%,$(LLVM)),) LLVM_SUFFIX := $(LLVM) endif +CLANG := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) + CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl @@ -18,7 +20,13 @@ CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu CLANG_TARGET_FLAGS_x86_64 := x86_64-linux-gnu -CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +# Default to host architecture if ARCH is not explicitly given. +ifeq ($(ARCH),) +CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple) +else +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) +endif ifeq ($(CROSS_COMPILE),) ifeq ($(CLANG_TARGET_FLAGS),) @@ -30,7 +38,7 @@ else CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) endif # CROSS_COMPILE -CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +CC := $(CLANG) $(CLANG_FLAGS) -fintegrated-as else CC := $(CROSS_COMPILE)gcc endif # LLVM @@ -44,10 +52,33 @@ endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) top_srcdir = $(selfdir)/../../.. +# msg: emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +MAKEFLAGS += --no-print-directory +endif + ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif +# In order to use newer items that haven't yet been added to the user's system +# header files, add $(TOOLS_INCLUDES) to the compiler invocation in each +# each selftest. +# You may need to add files to that location, or to refresh an existing file. In +# order to do that, run "make headers" from $(top_srcdir), then copy the +# header file that you want from $(top_srcdir)/usr/include/... , to the matching +# subdir in $(TOOLS_INCLUDE). +TOOLS_INCLUDES := -isystem $(top_srcdir)/tools/include/uapi + # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require # custom build rule and prevent common build rule use. @@ -176,7 +207,8 @@ endif ifeq ($(OVERRIDE_TARGETS),) LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h $(OUTPUT)/%:%.c $(LOCAL_HDRS) - $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S $(COMPILE.S) $^ -o $@ diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c index a9cc17facfb3..4e14dba81234 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c @@ -69,5 +69,5 @@ int main(int argc, char **argv) /* Multi-threaded */ test_mt_membarrier(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c index 4cdc8b1d124c..fa3f1d6c37a0 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c @@ -24,5 +24,5 @@ int main(int argc, char **argv) test_membarrier_get_registrations(/*cmd=*/0); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 93798c8c5d54..dbc171a3806d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -306,7 +306,7 @@ int main(int argc, char **argv) * then the kernel did a page-replacement or canceled the read() (or * whatever magic it did..). In that case, the memfd object is still * all zero. - * In case the memfd-object was *not* sealed, the read() was successfull + * In case the memfd-object was *not* sealed, the read() was successful * and the memfd object must *not* be all zero. * Note that in real scenarios, there might be a mixture of both, but * in this test-cases, we have explicit 200ms delays which should be diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 18f585684e20..95af2d78fd31 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1528,7 +1528,7 @@ static void test_share_open(char *banner, char *b_suffix) /* * Test sharing via fork() - * Test whether seal-modifications work as expected with forked childs. + * Test whether seal-modifications work as expected with forked children. */ static void test_share_fork(char *banner, char *b_suffix) { diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eb5f39a2668b..60b8feb6a5ec 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -12,7 +12,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/') endif # Without this, failed build products remain, with up-to-date timestamps, @@ -32,7 +32,7 @@ endif # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = cow @@ -98,13 +98,13 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),ppc64)) +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 533999b6c284..4f42eb7d7636 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -177,7 +177,7 @@ int main(int argc, char **argv) ksft_print_header(); if (prereq() || geteuid()) - return ksft_exit_skip("Prerequisites unsatisfied\n"); + ksft_exit_skip("Prerequisites unsatisfied\n"); ksft_set_plan(1); @@ -225,7 +225,7 @@ int main(int argc, char **argv) } if (check_compaction(mem_free, hugepage_size) == 0) - return ksft_exit_pass(); + ksft_exit_pass(); - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 363bf5f801be..32c6ccc2a6be 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -199,7 +199,7 @@ static int child_vmsplice_memcmp_fn(char *mem, size_t size, typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) + child_fn fn, bool xfail) { struct comm_pipes comm_pipes; char buf; @@ -247,33 +247,47 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, else ret = -EINVAL; - ksft_test_result(!ret, "No leak from parent into child\n"); + if (!ret) { + ksft_test_result_pass("No leak from parent into child\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from parent into child\n"); + } else { + ksft_test_result_fail("Leak from parent into child\n"); + } close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size) +static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size) +static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } -static void test_vmsplice_in_child(char *mem, size_t size) +static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, + is_hugetlb); } -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +static void test_vmsplice_in_child_mprotect(char *mem, size_t size, + bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, + is_hugetlb); } static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) + bool before_fork, bool xfail) { struct iovec iov = { .iov_base = mem, @@ -355,8 +369,18 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, } } - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); + if (!memcmp(old, new, transferred)) { + ksft_test_result_pass("No leak from child into parent\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from child into parent\n"); + } else { + ksft_test_result_fail("Leak from child into parent\n"); + } close_pipe: close(fds[0]); close(fds[1]); @@ -367,14 +391,14 @@ free: free(new); } -static void test_vmsplice_before_fork(char *mem, size_t size) +static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, true); + do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); } -static void test_vmsplice_after_fork(char *mem, size_t size) +static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, false); + do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); } #ifdef LOCAL_CONFIG_HAVE_LIBURING @@ -529,12 +553,12 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size) +static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size) +static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, true); } @@ -678,37 +702,41 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size) +static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } -typedef void (*test_fn)(char *mem, size_t size); +typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); static void do_run_with_base_page(test_fn fn, bool swapout) { @@ -740,7 +768,7 @@ static void do_run_with_base_page(test_fn fn, bool swapout) } } - fn(mem, pagesize); + fn(mem, pagesize, false); munmap: munmap(mem, pagesize); } @@ -904,7 +932,7 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) break; } - fn(mem, size); + fn(mem, size, false); munmap: munmap(mmap_mem, mmap_size); if (mremap_mem != MAP_FAILED) @@ -997,7 +1025,7 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) } munmap(dummy, hugetlbsize); - fn(mem, hugetlbsize); + fn(mem, hugetlbsize, true); munmap: munmap(mem, hugetlbsize); } @@ -1036,7 +1064,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child", - test_vmsplice_in_child + test_vmsplice_in_child, }, /* * vmsplice() test, but do an additional mprotect(PROT_READ)+ @@ -1044,7 +1072,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect + test_vmsplice_in_child_mprotect, }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after @@ -1322,23 +1350,31 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_anon_thp_collapse_unshared(char *mem, size_t size) +static void test_anon_thp_collapse_unshared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); } -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); } -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); } -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); } @@ -1779,5 +1815,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index ad168d35b23b..9423ad439a61 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -118,15 +118,22 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) return; } - /* - * Fault in the page writable such that GUP-fast can eventually pin - * it immediately. - */ + /* Fault in the page such that GUP-fast can pin it directly. */ memset(mem, 0, size); switch (type) { case TEST_TYPE_RO: case TEST_TYPE_RO_FAST: + /* + * Cover more cases regarding unsharing decisions when + * long-term R/O pinning by mapping the page R/O. + */ + ret = mprotect(mem, size, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + /* FALLTHROUGH */ case TEST_TYPE_RW: case TEST_TYPE_RW_FAST: { struct pin_longterm_test args; @@ -228,6 +235,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) assert(false); } +munmap: munmap(mem, size); } @@ -456,5 +464,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 18a49c70d4c6..bd335cf9bc0e 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -228,7 +228,7 @@ int main(int argc, char **argv) break; } ksft_test_result_skip("Please run this test as root\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); @@ -267,5 +267,5 @@ int main(int argc, char **argv) free(tid); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index d01e8d4901d0..8f122a0f0828 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -27,9 +27,9 @@ #include "vm_util.h" #include "../kselftest.h" -#define MMAP_SIZE (1 << 21) #define INLOOP_ITER 100 +size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ @@ -44,7 +44,7 @@ void *touch(void *unused) void *madv(void *unused) { for (int i = 0; i < INLOOP_ITER; i++) - madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + madvise(huge_ptr, mmap_size, MADV_DONTNEED); return NULL; } @@ -59,7 +59,7 @@ void *map_extra(void *unused) void *ptr; for (int i = 0; i < INLOOP_ITER; i++) { - ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); @@ -93,14 +93,16 @@ int main(void) free_hugepages); } + mmap_size = default_huge_page_size(); + while (max--) { - huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if ((unsigned long)huge_ptr == -1) { - ksft_exit_skip("Failed to allocated huge page\n"); - return KSFT_SKIP; + ksft_test_result_fail("Failed to allocate huge page\n"); + return KSFT_FAIL; } pthread_create(&thread1, NULL, madv, NULL); @@ -117,7 +119,7 @@ int main(void) } /* Unmap and restart */ - munmap(huge_ptr, MMAP_SIZE); + munmap(huge_ptr, mmap_size); } return KSFT_PASS; diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d615767e396b..37de82da9be7 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -28,6 +28,15 @@ #define MiB (1024 * KiB) #define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" +#define MAP_MERGE_FAIL ((void *)-1) +#define MAP_MERGE_SKIP ((void *)-2) + +enum ksm_merge_mode { + KSM_MERGE_PRCTL, + KSM_MERGE_MADVISE, + KSM_MERGE_NONE, /* PRCTL already set */ +}; + static int mem_fd; static int ksm_fd; static int ksm_full_scans_fd; @@ -146,33 +155,34 @@ static int ksm_unmerge(void) return 0; } -static char *mmap_and_merge_range(char val, unsigned long size, int prot, - bool use_prctl) +static char *__mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) { char *map; + char *err_map = MAP_MERGE_FAIL; int ret; /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { - ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - return MAP_FAILED; + ksft_print_msg("Disabling (unmerging) KSM failed\n"); + return err_map; } if (get_my_merging_pages() > 0) { - ksft_test_result_fail("Still pages merged\n"); - return MAP_FAILED; + ksft_print_msg("Still pages merged\n"); + return err_map; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; + ksft_print_msg("mmap() failed\n"); + return err_map; } /* Don't use THP. Ignore if THP are not around on a kernel. */ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + ksft_print_msg("MADV_NOHUGEPAGE failed\n"); goto unmap; } @@ -180,27 +190,36 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, memset(map, val, size); if (mprotect(map, size, prot)) { - ksft_test_result_skip("mprotect() failed\n"); + ksft_print_msg("mprotect() failed\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } - if (use_prctl) { + switch (mode) { + case KSM_MERGE_PRCTL: ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { - ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } else if (ret) { - ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n"); goto unmap; } - } else if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; + break; + case KSM_MERGE_MADVISE: + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_print_msg("MADV_MERGEABLE failed\n"); + goto unmap; + } + break; + case KSM_MERGE_NONE: + break; } /* Run KSM to trigger merging and wait. */ if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); + ksft_print_msg("Running KSM failed\n"); goto unmap; } @@ -209,14 +228,31 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, * accounted differently (depending on kernel support). */ if (val && !get_my_merging_pages()) { - ksft_test_result_fail("No pages got merged\n"); + ksft_print_msg("No pages got merged\n"); goto unmap; } return map; unmap: munmap(map, size); - return MAP_FAILED; + return err_map; +} + +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *ret = MAP_FAILED; + + map = __mmap_and_merge_range(val, size, prot, mode); + if (map == MAP_MERGE_FAIL) + ksft_test_result_fail("Merging memory failed"); + else if (map == MAP_MERGE_SKIP) + ksft_test_result_skip("Merging memory skipped"); + else + ret = map; + + return ret; } static void test_unmerge(void) @@ -226,7 +262,7 @@ static void test_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -264,7 +300,7 @@ static void test_unmerge_zero_pages(void) } /* Let KSM deduplicate zero pages. */ - map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -312,7 +348,7 @@ static void test_unmerge_discarded(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -344,7 +380,7 @@ static void test_unmerge_uffd_wp(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -439,6 +475,36 @@ static void test_prctl(void) ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); } +static int test_child_ksm(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + /* Test if KSM is enabled for the process. */ + if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) + return -1; + + /* Test if merge could really happen. */ + map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); + if (map == MAP_MERGE_FAIL) + return -2; + else if (map == MAP_MERGE_SKIP) + return -3; + + munmap(map, size); + return 0; +} + +static void test_child_ksm_err(int status) +{ + if (status == -1) + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + else if (status == -2) + ksft_test_result_fail("Merge in child failed\n"); + else if (status == -3) + ksft_test_result_skip("Merge in child skipped\n"); +} + /* Verify that prctl ksm flag is inherited. */ static void test_prctl_fork(void) { @@ -458,7 +524,7 @@ static void test_prctl_fork(void) child_pid = fork(); if (!child_pid) { - exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0)); + exit(test_child_ksm()); } else if (child_pid < 0) { ksft_test_result_fail("fork() failed\n"); return; @@ -467,8 +533,11 @@ static void test_prctl_fork(void) if (waitpid(child_pid, &status, 0) < 0) { ksft_test_result_fail("waitpid() failed\n"); return; - } else if (WEXITSTATUS(status) != 1) { - ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + } + + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); return; } @@ -480,12 +549,6 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } -static int ksm_fork_exec_child(void) -{ - /* Test if KSM is enabled for the process. */ - return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1; -} - static void test_prctl_fork_exec(void) { int ret, status; @@ -518,7 +581,7 @@ static void test_prctl_fork_exec(void) if (WIFEXITED(status)) { status = WEXITSTATUS(status); if (status) { - ksft_test_result_fail("KSM not enabled\n"); + test_child_ksm_err(status); return; } } else { @@ -545,7 +608,7 @@ static void test_prctl_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL); if (map == MAP_FAILED) return; @@ -568,7 +631,7 @@ static void test_prot_none(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0x11, size, PROT_NONE, false); + map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) goto unmap; @@ -599,7 +662,7 @@ int main(int argc, char **argv) int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { - exit(ksm_fork_exec_child() == 1 ? 0 : 1); + exit(test_child_ksm()); } #ifdef __NR_userfaultfd @@ -646,5 +709,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index 17bcb07f19f3..ef7d911da13e 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -307,5 +307,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9b298f6a04b3..9a0597310a76 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -20,6 +20,7 @@ #include <unistd.h> #include <errno.h> #include <stdio.h> +#include <fcntl.h> #include "../kselftest.h" @@ -83,6 +84,45 @@ static void test_mlock_limit(int fd) pass("mlock limit is respected\n"); } +static void test_vmsplice(int fd, const char *desc) +{ + ssize_t transferred; + struct iovec iov; + int pipefd[2]; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + goto close_pipe; + } + + /* + * vmsplice() may use GUP-fast, which must also fail. Prefault the + * page table, so GUP-fast could find it. + */ + memset(mem, PATTERN, page_size); + + iov.iov_base = mem; + iov.iov_len = page_size; + transferred = vmsplice(pipefd[1], &iov, 1, 0); + + if (transferred < 0 && errno == EFAULT) + pass("vmsplice is blocked as expected with %s\n", desc); + else + fail("vmsplice: unexpected memory access with %s\n", desc); + + munmap(mem, page_size); +close_pipe: + close(pipefd[0]); + close(pipefd[1]); +} + static void try_process_vm_read(int fd, int pipefd[2]) { struct iovec liov, riov; @@ -187,7 +227,6 @@ static void test_remote_access(int fd, const char *name, return; } - ftruncate(fd, page_size); memset(mem, PATTERN, page_size); if (write(pipefd[1], &mem, sizeof(mem)) < 0) { @@ -258,7 +297,7 @@ static void prepare(void) strerror(errno)); } -#define NUM_TESTS 4 +#define NUM_TESTS 6 int main(int argc, char *argv[]) { @@ -277,9 +316,17 @@ int main(int argc, char *argv[]) ksft_exit_fail_msg("memfd_secret failed: %s\n", strerror(errno)); } + if (ftruncate(fd, page_size)) + ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno)); test_mlock_limit(fd); test_file_apis(fd); + /* + * We have to run the first vmsplice test before any secretmem page was + * allocated for this fd. + */ + test_vmsplice(fd, "fresh page"); + test_vmsplice(fd, "existing page"); test_process_vm_read(fd); test_ptrace(fd); diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index 301abb99e027..b8a7efe9204e 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -375,5 +375,5 @@ int main(void) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 26f744188ad0..7f0d50fa361d 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -20,8 +20,6 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) FILE *file; int ret = 1; char line[1024] = {0}; - char *end_addr; - char *stop; unsigned long start; unsigned long end; @@ -37,21 +35,10 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) memset(area, 0, sizeof(struct vm_boundaries)); while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { + if (sscanf(line, "%lx-%lx", &start, &end) != 2) { ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - ksft_print_msg("cannot parse /proc/self/maps\n"); - goto out; - } - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); if (start <= addr && end > addr) { area->start = start; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 2f8b991f78cb..1b03bcfaefdf 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -23,6 +23,7 @@ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) @@ -69,6 +70,27 @@ enum { .expect_failure = should_fail \ } +/* compute square root using binary search */ +static unsigned long get_sqrt(unsigned long val) +{ + unsigned long low = 1; + + /* assuming rand_size is less than 1TB */ + unsigned long high = (1UL << 20); + + while (low <= high) { + unsigned long mid = low + (high - low) / 2; + unsigned long temp = mid * mid; + + if (temp == val) + return mid; + if (temp < val) + low = mid + 1; + high = mid - 1; + } + return low; +} + /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. @@ -126,19 +148,21 @@ static unsigned long long get_mmap_min_addr(void) * Using /proc/self/maps, assert that the specified address range is contained * within a single mapping. */ -static bool is_range_mapped(FILE *maps_fp, void *start, void *end) +static bool is_range_mapped(FILE *maps_fp, unsigned long start, + unsigned long end) { char *line = NULL; size_t len = 0; bool success = false; + unsigned long first_val, second_val; rewind(maps_fp); while (getline(&line, &len, maps_fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); + if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) { + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + break; + } if (first_val <= start && second_val >= end) { success = true; @@ -233,7 +257,8 @@ static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -272,7 +297,8 @@ static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -296,7 +322,7 @@ out: * * |DDDDddddSSSSssss| */ -static void mremap_move_within_range(char pattern_seed) +static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; @@ -316,10 +342,7 @@ static void mremap_move_within_range(char pattern_seed) src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); dest = src - SIZE_MB(2); @@ -357,14 +380,14 @@ out: /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) + char *rand_addr) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; - int d; - unsigned long long t; + unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; + unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -378,9 +401,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) - memset((char *) src_addr + t, (char) rand(), 1); + memcpy(src_addr, rand_addr, threshold); /* Mask to zero out lower bits of address for alignment */ align_mask = ~(c.dest_alignment - 1); @@ -420,9 +441,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for the dest preamble block. */ - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) - memset((char *) dest_preamble_addr + d, (char) rand(), 1); + memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size); } clock_gettime(CLOCK_MONOTONIC, &t_start); @@ -436,15 +455,42 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) { - char c = (char) rand(); + /* + * Verify byte pattern after remapping. Employ an algorithm with a + * square root time complexity in threshold: divide the range into + * chunks, if memcmp() returns non-zero, only then perform an + * iteration in that chunk to find the mismatch index. + */ + num_chunks = get_sqrt(threshold); + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = threshold / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) + continue; + + /* brute force iteration only over mismatch segment */ + for (t = shift; t < shift + chunk_size; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } - if (((char *) dest_addr)[t] != c) { + /* + * if threshold is not divisible by num_chunks, then check the + * last chunk + */ + for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, ((char *) dest_addr)[t] & 0xff); ret = -1; goto clean_up_dest; @@ -452,22 +498,44 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Verify the dest preamble byte pattern after remapping */ - if (c.dest_preamble_size) { - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) { - char c = (char) rand(); - - if (((char *) dest_preamble_addr)[d] != c) { - ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); + if (!c.dest_preamble_size) + goto no_preamble; + + num_chunks = get_sqrt(c.dest_preamble_size); + + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = c.dest_preamble_size / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, + chunk_size)) + continue; + + /* brute force iteration only over mismatched segment */ + for (d = shift; d < shift + chunk_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); ret = -1; goto clean_up_dest; } } } + for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + +no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; @@ -494,7 +562,8 @@ out: * the beginning of the mapping just because the aligned * down address landed on a mapping that maybe does not exist. */ -static void mremap_move_1mb_from_start(char pattern_seed) +static void mremap_move_1mb_from_start(unsigned int pattern_seed, + char *rand_addr) { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; @@ -520,10 +589,7 @@ static void mremap_move_1mb_from_start(char pattern_seed) } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); /* * Unmap the beginning of dest so that the aligned address @@ -568,10 +634,10 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed) + unsigned int pattern_seed, char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); + rand_addr); if (remap_time < 0) { if (test_case.expect_failure) @@ -642,7 +708,15 @@ int main(int argc, char **argv) int failures = 0; int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + + /* hard-coded test configs */ + size_t max_test_variable_region_size = _2GB; + size_t max_test_constant_region_size = _2MB; + size_t dest_preamble_size = 10 * _4MB; + unsigned int pattern_seed; + char *rand_addr; + size_t rand_size; int num_expand_tests = 2; int num_misc_tests = 2; struct test test_cases[MAX_TEST] = {}; @@ -659,6 +733,31 @@ int main(int argc, char **argv) ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", threshold_mb, pattern_seed); + /* + * set preallocated random array according to test configs; see the + * functions for the logic of setting the size + */ + if (!threshold_mb) + rand_size = MAX(max_test_variable_region_size, + max_test_constant_region_size); + else + rand_size = MAX(MIN(threshold_mb * _1MB, + max_test_variable_region_size), + max_test_constant_region_size); + rand_size = MAX(dest_preamble_size, rand_size); + + rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (rand_addr == MAP_FAILED) { + perror("mmap"); + ksft_exit_fail_msg("cannot mmap rand_addr\n"); + } + + /* fill stream of random bytes */ + srand(pattern_seed); + for (unsigned long i = 0; i < rand_size; ++i) + rand_addr[i] = (char) rand(); + page_size = sysconf(_SC_PAGESIZE); /* Expected mremap failures */ @@ -730,13 +829,13 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); + pattern_seed, rand_addr); maps_fp = fopen("/proc/self/maps", "r"); if (maps_fp == NULL) { - ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); - exit(KSFT_FAIL); + munmap(rand_addr, rand_size); + ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); } mremap_expand_merge(maps_fp, page_size); @@ -744,17 +843,20 @@ int main(int argc, char **argv) fclose(maps_fp); - mremap_move_within_range(pattern_seed); - mremap_move_1mb_from_start(pattern_seed); + mremap_move_within_range(pattern_seed, rand_addr); + mremap_move_1mb_from_start(pattern_seed, rand_addr); if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); + threshold_mb, pattern_seed, + rand_addr); } + munmap(rand_addr, rand_size); + if (failures > 0) ksft_exit_fail(); else diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index d59517ed3d48..2d785aca72a5 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1484,7 +1484,7 @@ int main(int argc, char *argv[]) ksft_print_header(); if (init_uffd()) - return ksft_exit_pass(); + ksft_exit_pass(); ksft_set_plan(115); @@ -1660,5 +1660,5 @@ int main(int argc, char *argv[]) userfaultfd_tests(); close(pagemap_fd); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4bdb3a0c7a60..3157204b9047 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -152,9 +152,13 @@ done < /proc/meminfo # both of these requirements into account and attempt to increase # number of huge pages available. nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) +uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) +hugetlb_min_KB=$((256 * 1024)) +if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then + needmem_KB=$uffd_min_KB +else + needmem_KB=$hugetlb_min_KB +fi # set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then @@ -294,7 +298,8 @@ CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half -# the size ($half_ufd_size_MB), which is used for *each*. +# the size of the free pages we have, which is used for *each*. +half_ufd_size_MB=$((freepgs / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 7dbfa53d93a0..bdfa5d085f00 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -209,5 +209,5 @@ int main(int argc, char **argv) close(pagemap_fd); - return ksft_exit_pass(); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 7bcf8d48256a..4e4c1e311247 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,8 @@ #include <errno.h> #include <sys/mman.h> #include <sys/time.h> +#include <fcntl.h> + #include "../kselftest.h" /* @@ -85,7 +87,7 @@ static int validate_lower_address_hint(void) char *ptr; ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return 0; @@ -93,6 +95,66 @@ static int validate_lower_address_hint(void) return 1; } +static int validate_complete_va_space(void) +{ + unsigned long start_addr, end_addr, prev_end_addr; + char line[400]; + char prot[6]; + FILE *file; + int fd; + + fd = open("va_dump", O_CREAT | O_WRONLY, 0600); + unlink("va_dump"); + if (fd < 0) { + ksft_test_result_skip("cannot create or open dump file\n"); + ksft_finished(); + } + + file = fopen("/proc/self/maps", "r"); + if (file == NULL) + ksft_exit_fail_msg("cannot open /proc/self/maps\n"); + + prev_end_addr = 0; + while (fgets(line, sizeof(line), file)) { + unsigned long hop; + + if (sscanf(line, "%lx-%lx %s[rwxp-]", + &start_addr, &end_addr, prot) != 3) + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + + /* end of userspace mappings; ignore vsyscall mapping */ + if (start_addr & (1UL << 63)) + return 0; + + /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ + if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) + return 1; + + prev_end_addr = end_addr; + + if (prot[0] != 'r') + continue; + + /* + * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. + * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 + * addresses after that. If the address was not held by this + * process, write would fail with errno set to EFAULT. + * Anyways, if write returns anything apart from 1, exit the + * program since that would mean a bug in /proc/self/maps. + */ + hop = 0; + while (start_addr + hop < end_addr) { + if (write(fd, (void *)(start_addr + hop), 1) != 1) + return 1; + lseek(fd, 0, SEEK_SET); + + hop += MAP_CHUNK_SIZE; + } + } + return 0; +} + int main(int argc, char *argv[]) { char *ptr[NR_CHUNKS_LOW]; @@ -105,13 +167,11 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } + if (validate_lower_address_hint()) + ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); break; } @@ -127,7 +187,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (hptr[i] == MAP_FAILED) break; @@ -135,6 +195,10 @@ int main(int argc, char *argv[]) validate_addr(hptr[i], 1); } hchunks = i; + if (validate_complete_va_space()) { + ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); + ksft_finished(); + } for (i = 0; i < lchunks; i++) munmap(ptr[i], MAP_CHUNK_SIZE); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 2f9d378edec3..49a56eb5d036 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -2,9 +2,9 @@ bind_bhash bind_timewait bind_wildcard -csum cmsg_sender diag_uid +epoll_busy_poll fin_ack_lat gro hwtstamp_config @@ -31,6 +31,7 @@ reuseport_dualstack rxtimestamp sctp_hello scm_pidfd +scm_rights sk_bind_sendto_listen sk_connect_zero_addr socket diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7b6918d5f4af..bd01e4a0be2c 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh -TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh @@ -35,6 +34,7 @@ TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh +TEST_PROGS += nl_netdev.py TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh @@ -67,7 +67,7 @@ TEST_GEN_FILES += ipsec TEST_GEN_FILES += ioam6_parser TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen @@ -81,9 +81,6 @@ TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello -TEST_GEN_FILES += csum -TEST_GEN_FILES += nat6to4.o -TEST_GEN_FILES += xdp_dummy.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh @@ -93,63 +90,22 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += bpf_offload.py TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) + TEST_INCLUDES := forwarding/lib.sh include ../lib.mk +$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto $(OUTPUT)/tcp_inq: LDLIBS += -lpthread $(OUTPUT)/bind_bhash: LDLIBS += -lpthread $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/ -# Rules to generate bpf objs -CLANG ?= clang -SCRATCH_DIR := $(OUTPUT)/tools -BUILD_DIR := $(SCRATCH_DIR)/build -BPFDIR := $(abspath ../../../lib/bpf) -APIDIR := $(abspath ../../../include/uapi) - -CCINCLUDE += -I../bpf -CCINCLUDE += -I../../../../usr/include/ -CCINCLUDE += -I$(SCRATCH_DIR)/include - -BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a - -MAKE_DIRS := $(BUILD_DIR)/libbpf -$(MAKE_DIRS): - mkdir -p $@ - -# Get Clang's default includes on this system, as opposed to those seen by -# '--target=bpf'. This fixes "missing" files on some architectures/distros, -# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. -# -# Use '-idirafter': Don't interfere with include mechanics except where the -# build would have failed anyways. -define get_sys_includes -$(shell $(1) $(2) -v -E - </dev/null 2>&1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') -endef - -ifneq ($(CROSS_COMPILE),) -CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%)) -endif - -CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) - -$(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o: $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) - $(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@ - -$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ - $(APIDIR)/linux/bpf.h \ - | $(BUILD_DIR)/libbpf - $(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ - EXTRA_CFLAGS='-g -O0' \ - DESTDIR=$(SCRATCH_DIR) prefix= all install_headers - -EXTRA_CLEAN := $(SCRATCH_DIR) +include bpf.mk diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index 221c387a7d7f..3b83c797650d 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,4 +1,4 @@ CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd +TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c new file mode 100644 index 000000000000..bab606c9f1eb --- /dev/null +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ +#define _GNU_SOURCE +#include <sched.h> + +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "../../kselftest_harness.h" + +FIXTURE(scm_rights) +{ + int fd[16]; +}; + +FIXTURE_VARIANT(scm_rights) +{ + char name[16]; + int type; + int flags; + bool test_listener; +}; + +FIXTURE_VARIANT_ADD(scm_rights, dgram) +{ + .name = "UNIX ", + .type = SOCK_DGRAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = true, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = true, +}; + +static int count_sockets(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(scm_rights) *variant) +{ + int sockets = -1, len, ret; + char *line = NULL; + size_t unused; + FILE *f; + + f = fopen("/proc/net/protocols", "r"); + ASSERT_NE(NULL, f); + + len = strlen(variant->name); + + while (getline(&line, &unused, f) != -1) { + int unused2; + + if (strncmp(line, variant->name, len)) + continue; + + ret = sscanf(line + len, "%d %d", &unused2, &sockets); + ASSERT_EQ(2, ret); + + break; + } + + free(line); + + ret = fclose(f); + ASSERT_EQ(0, ret); + + return sockets; +} + +FIXTURE_SETUP(scm_rights) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(0, ret); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(scm_rights) +{ + int ret; + + sleep(1); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +static void create_listeners(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + socklen_t addrlen; + int i, ret; + + for (i = 0; i < n * 2; i += 2) { + self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i]); + + addrlen = sizeof(addr.sun_family); + ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + + ret = listen(self->fd[i], -1); + ASSERT_EQ(0, ret); + + addrlen = sizeof(addr); + ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(0, ret); + + self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i + 1]); + + ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + } +} + +static void create_socketpairs(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i += 2) { + ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i); + ASSERT_EQ(0, ret); + } +} + +static void __create_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + if (variant->test_listener) + create_listeners(_metadata, self, n); + else + create_socketpairs(_metadata, self, variant, n); +} + +static void __close_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i++) { + ret = close(self->fd[i]); + ASSERT_EQ(0, ret); + } +} + +void __send_fd(struct __test_metadata *_metadata, + const FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int inflight, int receiver) +{ +#define MSG "nop" +#define MSGLEN 3 + struct { + struct cmsghdr cmsghdr; + int fd[2]; + } cmsg = { + .cmsghdr = { + .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }, + .fd = { + self->fd[inflight * 2], + self->fd[inflight * 2], + }, + }; + struct iovec iov = { + .iov_base = MSG, + .iov_len = MSGLEN, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &cmsg, + .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)), + }; + int ret; + + ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags); + ASSERT_EQ(MSGLEN, ret); +} + +#define create_sockets(n) \ + __create_sockets(_metadata, self, variant, n) +#define close_sockets(n) \ + __close_sockets(_metadata, self, n) +#define send_fd(inflight, receiver) \ + __send_fd(_metadata, self, variant, inflight, receiver) + +TEST_F(scm_rights, self_ref) +{ + create_sockets(2); + + send_fd(0, 0); + + send_fd(1, 1); + + close_sockets(2); +} + +TEST_F(scm_rights, triangle) +{ + create_sockets(6); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + + send_fd(3, 4); + send_fd(4, 5); + send_fd(5, 3); + + close_sockets(6); +} + +TEST_F(scm_rights, cross_edge) +{ + create_sockets(8); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + send_fd(1, 3); + send_fd(3, 2); + + send_fd(4, 5); + send_fd(5, 6); + send_fd(6, 4); + send_fd(5, 7); + send_fd(7, 6); + + close_sockets(8); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index 75528788cb95..5175a42cbe8a 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -210,8 +210,8 @@ check_features() test_ipv4_forward() { - RESULT4=$(ip netns exec "${LISTENER}" nc -w 1 -l -u 239.0.0.1 4000) - if [ "$RESULT4" == "172.17.0.2" ]; then + RESULT4=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP4-LISTEN:4000,readbytes=128 || true) + if echo "$RESULT4" | grep -q "172.17.0.2"; then printf "TEST: %-60s [ OK ]\n" "IPv4 amt multicast forwarding" exit 0 else @@ -222,8 +222,8 @@ test_ipv4_forward() test_ipv6_forward() { - RESULT6=$(ip netns exec "${LISTENER}" nc -w 1 -l -u ff0e::5:6 6000) - if [ "$RESULT6" == "2001:db8:3::2" ]; then + RESULT6=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP6-LISTEN:6000,readbytes=128 || true) + if echo "$RESULT6" | grep -q "2001:db8:3::2"; then printf "TEST: %-60s [ OK ]\n" "IPv6 amt multicast forwarding" exit 0 else @@ -236,14 +236,14 @@ send_mcast4() { sleep 2 ip netns exec "${SOURCE}" bash -c \ - 'echo 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & + 'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & } send_mcast6() { sleep 2 ip netns exec "${SOURCE}" bash -c \ - 'echo 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & + 'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & } check_features diff --git a/tools/testing/selftests/net/bpf.mk b/tools/testing/selftests/net/bpf.mk new file mode 100644 index 000000000000..a4f6755dd894 --- /dev/null +++ b/tools/testing/selftests/net/bpf.mk @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: GPL-2.0 +# Rules to generate bpf objs +CLANG ?= clang +SCRATCH_DIR := $(OUTPUT)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +BPFDIR := $(top_srcdir)/tools/lib/bpf +APIDIR := $(top_srcdir)/tools/include/uapi + +CCINCLUDE += -I$(selfdir)/bpf +CCINCLUDE += -I$(top_srcdir)/usr/include/ +CCINCLUDE += -I$(SCRATCH_DIR)/include + +BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a + +MAKE_DIRS := $(BUILD_DIR)/libbpf +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +# Get Clang's default includes on this system, as opposed to those seen by +# '--target=bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) $(2) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') +endef + +ifneq ($(CROSS_COMPILE),) +CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif + +CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + +BPF_PROG_OBJS := $(patsubst %.c,$(OUTPUT)/%.o,$(wildcard *.bpf.c)) + +$(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) + $(call msg,BPF_PROG,,$@) + $(Q)$(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $< -o $@ + +$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + $(APIDIR)/linux/bpf.h \ + | $(BUILD_DIR)/libbpf + $(call msg,MAKE,,$@) + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0' \ + DESTDIR=$(SCRATCH_DIR) prefix= all install_headers + +EXTRA_CLEAN += $(SCRATCH_DIR) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/net/bpf_offload.py index 6157f884d091..3efe44f6e92a 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -29,6 +29,9 @@ import subprocess import time import traceback +from lib.py import NetdevSim, NetdevSimDev + + logfile = None log_level = 1 skip_extack = False @@ -145,8 +148,10 @@ def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): if JSON: params += "%s " % (flags["json"]) - if ns != "": + if ns: ns = "ip netns exec %s " % (ns) + elif ns is None: + ns = "" if include_stderr: ret, stdout, stderr = cmd(ns + name + " " + params + args, @@ -201,11 +206,11 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): time.sleep(0.05) raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) -def bpftool_map_list_wait(expected=0, n_retry=20): +def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): for i in range(n_retry): - nmaps = len(bpftool_map_list()) - if nmaps == expected: - return + maps = bpftool_map_list(ns=ns) + if len(maps) == expected: + return maps time.sleep(0.05) raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) @@ -237,7 +242,7 @@ def tc(args, JSON=True, ns="", fail=True, include_stderr=False): def ethtool(dev, opt, args, fail=True): return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) -def bpf_obj(name, sec=".text", path=bpf_test_dir,): +def bpf_obj(name, sec="xdp", path=bpf_test_dir,): return "obj %s sec %s" % (os.path.join(path, name), sec) def bpf_pinned(name): @@ -334,72 +339,16 @@ class DebugfsDir: return dfs -class NetdevSimDev: +class BpfNetdevSimDev(NetdevSimDev): """ Class for netdevsim bus device and its attributes. """ - @staticmethod - def ctrl_write(path, val): - fullpath = os.path.join("/sys/bus/netdevsim/", path) - try: - with open(fullpath, "w") as f: - f.write(val) - except OSError as e: - log("WRITE %s: %r" % (fullpath, val), -e.errno) - raise e - log("WRITE %s: %r" % (fullpath, val), 0) - - def __init__(self, port_count=1): - addr = 0 - while True: - try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) - except OSError as e: - if e.errno == errno.ENOSPC: - addr += 1 - continue - raise e - break - self.addr = addr - - # As probe of netdevsim device might happen from a workqueue, - # so wait here until all netdevs appear. - self.wait_for_netdevs(port_count) - - ret, out = cmd("udevadm settle", fail=False) - if ret: - raise Exception("udevadm settle failed") - ifnames = self.get_ifnames() - + def __init__(self, port_count=1, ns=None): + super().__init__(port_count, ns=ns) devs.append(self) - self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr - - self.nsims = [] - for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) - - def get_ifnames(self): - ifnames = [] - listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) - for ifname in listdir: - ifnames.append(ifname) - ifnames.sort() - return ifnames - - def wait_for_netdevs(self, port_count): - timeout = 5 - timeout_start = time.time() - - while True: - try: - ifnames = self.get_ifnames() - except FileNotFoundError as e: - ifnames = [] - if len(ifnames) == port_count: - break - if time.time() < timeout_start + timeout: - continue - raise Exception("netdevices did not appear within timeout") + + def _make_port(self, port_index, ifname): + return BpfNetdevSim(self, port_index, ifname, self.ns) def dfs_num_bound_progs(self): path = os.path.join(self.dfs_dir, "bpf_bound_progs") @@ -415,33 +364,20 @@ class NetdevSimDev: return progs def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) + super().remove() devs.remove(self) - def remove_nsim(self, nsim): - self.nsims.remove(nsim) - self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), - "%u" % (nsim.port_index, )) -class NetdevSim: +class BpfNetdevSim(NetdevSim): """ Class for netdevsim netdevice and its attributes. """ - def __init__(self, nsimdev, port_index, ifname): - # In case udev renamed the netdev to according to new schema, - # check if the name matches the port_index. - nsimnamere = re.compile("eni\d+np(\d+)") - match = nsimnamere.match(ifname) - if match and int(match.groups()[0]) != port_index + 1: - raise Exception("netdevice name mismatches the expected one") - - self.nsimdev = nsimdev - self.port_index = port_index - self.ns = "" + def __init__(self, nsimdev, port_index, ifname, ns=None): + super().__init__(nsimdev, port_index, ifname, ns=ns) + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) self.dfs_refresh() - _, [self.dev] = ip("link show dev %s" % ifname) def __getitem__(self, key): return self.dev[key] @@ -468,7 +404,7 @@ class NetdevSim: raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) def set_ns(self, ns): - name = "1" if ns == "" else ns + name = ns if ns else "1" ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) self.ns = ns @@ -605,7 +541,7 @@ def pin_prog(file_name, idx=0): return file_name, bpf_pinned(file_name) def pin_map(file_name, idx=0, expected=1): - maps = bpftool_map_list(expected=expected) + maps = bpftool_map_list_wait(expected=expected) m = maps[idx] bpftool("map pin id %d %s" % (m["id"], file_name)) files.append(file_name) @@ -618,7 +554,7 @@ def check_dev_info_removed(prog_file=None, map_file=None): ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) fail(ret != 0, "failed to show prog with removed device") - bpftool_map_list(expected=0) + bpftool_map_list_wait(expected=0) ret, err = bpftool("map show pin %s" % (map_file), fail=False) fail(ret == 0, "Showing map with removed device did not fail") fail(err["error"].find("No such device") == -1, @@ -642,7 +578,7 @@ def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): else: fail("ifname" in dev.keys(), "Ifname is reported for other ns") - maps = bpftool_map_list(expected=2, ns=ns) + maps = bpftool_map_list_wait(expected=2, ns=ns) for m in maps: fail("dev" not in m.keys(), "Device parameters not reported") fail(dev != m["dev"], "Map's device different than program's") @@ -744,7 +680,7 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): start_test("Test multi-attachment XDP - device remove...") simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) return [simdev, sim] @@ -809,13 +745,13 @@ try: bytecode = bpf_bytecode("1,6 0 0 4294967295,") start_test("Test destruction of generic XDP...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(obj, "generic") simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.tc_add_ingress() @@ -967,7 +903,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -976,7 +912,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -1080,7 +1016,7 @@ try: bpftool_prog_list_wait(expected=0) start_test("Test attempt to use a program for a wrong device...") - simdev2 = NetdevSimDev() + simdev2 = BpfNetdevSimDev() sim2, = simdev2.nsims sim2.set_xdp(obj, "offload") pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") @@ -1169,7 +1105,7 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims map_obj = bpf_obj("sample_map_ret0.bpf.o") start_test("Test loading program with maps...") @@ -1201,12 +1137,12 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims start_test("Test map update (no flags)...") sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - maps = bpftool_map_list(expected=2) + maps = bpftool_map_list_wait(expected=2) array = maps[0] if maps[0]["type"] == "array" else maps[1] htab = maps[0] if maps[0]["type"] == "hash" else maps[1] for m in maps: @@ -1285,14 +1221,14 @@ try: bpftool_map_list_wait(expected=0) simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON simdev.remove() bpftool_map_list_wait(expected=0) start_test("Test map creation fail path...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.dfs["bpf_map_accept"] = "N" ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) @@ -1302,9 +1238,9 @@ try: simdev.remove() start_test("Test multi-dev ASIC program reuse...") - simdevA = NetdevSimDev() + simdevA = BpfNetdevSimDev() simA, = simdevA.nsims - simdevB = NetdevSimDev(3) + simdevB = BpfNetdevSimDev(3) simB1, simB2, simB3 = simdevB.nsims sims = (simA, simB1, simB2, simB3) simB = (simB1, simB2, simB3) diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index c79e65581dc3..876c2db02a63 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -260,15 +260,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit); if (opt.txtime.ena) { - struct sock_txtime so_txtime = { - .clockid = CLOCK_MONOTONIC, - }; __u64 txtime; - if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, - &so_txtime, sizeof(so_txtime))) - error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); - txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) + time_start_mono.tv_nsec + opt.txtime.delay * 1000; @@ -284,13 +277,6 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime)); } if (opt.ts.ena) { - __u32 val = SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_OPT_TSONLY; - - if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, - &val, sizeof(val))) - error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); - cmsg = (struct cmsghdr *)(cbuf + cmsg_len); cmsg_len += CMSG_SPACE(sizeof(__u32)); if (cbuf_sz < cmsg_len) @@ -333,16 +319,17 @@ static const char *cs_ts_info2str(unsigned int info) return "unknown"; } -static void +static unsigned long cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) { struct sock_extended_err *see; struct scm_timestamping *ts; + unsigned long ts_seen = 0; struct cmsghdr *cmsg; int i, err; if (!opt.ts.ena) - return; + return 0; msg->msg_control = cbuf; msg->msg_controllen = cbuf_sz; @@ -396,8 +383,11 @@ cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) printf(" %5s ts%d %lluus\n", cs_ts_info2str(see->ee_info), i, rel_time); + ts_seen |= 1 << see->ee_info; } } + + return ts_seen; } static void ca_set_sockopts(int fd) @@ -422,6 +412,24 @@ static void ca_set_sockopts(int fd) setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opt.sockopt.priority, sizeof(opt.sockopt.priority))) error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY"); + + if (opt.txtime.ena) { + struct sock_txtime so_txtime = { + .clockid = CLOCK_MONOTONIC, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, + &so_txtime, sizeof(so_txtime))) + error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); + } + if (opt.ts.ena) { + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_TSONLY; + + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + &val, sizeof(val))) + error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); + } } int main(int argc, char *argv[]) @@ -509,10 +517,16 @@ int main(int argc, char *argv[]) err = ERN_SUCCESS; if (opt.ts.ena) { - /* Make sure all timestamps have time to loop back */ - usleep(opt.txtime.delay); + unsigned long seen; + int i; - cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf)); + /* Make sure all timestamps have time to loop back */ + for (i = 0; i < 40; i++) { + seen = cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf)); + if (seen & (1 << SCM_TSTAMP_SND)) + break; + usleep(opt.txtime.delay / 20); + } } err_out: diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh index af85267ad1e3..1d7e756644bc 100755 --- a/tools/testing/selftests/net/cmsg_time.sh +++ b/tools/testing/selftests/net/cmsg_time.sh @@ -66,10 +66,13 @@ for i in "-4 $TGT4" "-6 $TGT6"; do awk '/SND/ { if ($3 > 1000) print "OK"; }') check_result $? "$ts" "OK" "$prot - TXTIME abs" - ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 | + [ "$KSFT_MACHINE_SLOW" = yes ] && delay=8000 || delay=1000 + + ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d $delay | awk '/SND/ {snd=$3} /SCHED/ {sch=$3} - END { if (snd - sch > 500) print "OK"; }') + END { if (snd - sch > '$((delay/2))') print "OK"; + else print snd, "-", sch, "<", '$((delay/2))'; }') check_result $? "$ts" "OK" "$prot - TXTIME rel" done done diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 5e4390cac17e..04de7a6ba6f3 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -30,6 +30,7 @@ CONFIG_IP_GRE=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m +CONFIG_IPV6_MROUTE=y CONFIG_IPV6_SIT=y CONFIG_IP_DCCP=m CONFIG_NF_NAT=m diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c new file mode 100644 index 000000000000..16e457c2f877 --- /dev/null +++ b/tools/testing/selftests/net/epoll_busy_poll.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Basic per-epoll context busy poll test. + * + * Only tests the ioctls, but should be expanded to test two connected hosts in + * the future + */ + +#define _GNU_SOURCE + +#include <error.h> +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/capability.h> + +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> + +#include "../kselftest_harness.h" + +/* if the headers haven't been updated, we need to define some things */ +#if !defined(EPOLL_IOC_TYPE) +struct epoll_params { + uint32_t busy_poll_usecs; + uint16_t busy_poll_budget; + uint8_t prefer_busy_poll; + + /* pad the struct to a multiple of 64bits */ + uint8_t __pad; +}; + +#define EPOLL_IOC_TYPE 0x8A +#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params) +#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) +#endif + +FIXTURE(invalid_fd) +{ + int invalid_fd; + struct epoll_params params; +}; + +FIXTURE_SETUP(invalid_fd) +{ + int ret; + + ret = socket(AF_UNIX, SOCK_DGRAM, 0); + EXPECT_NE(-1, ret) + TH_LOG("error creating unix socket"); + + self->invalid_fd = ret; +} + +FIXTURE_TEARDOWN(invalid_fd) +{ + int ret; + + ret = close(self->invalid_fd); + EXPECT_EQ(0, ret); +} + +TEST_F(invalid_fd, test_invalid_fd) +{ + int ret; + + ret = ioctl(self->invalid_fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should set errno to ENOTTY"); + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->invalid_fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should set errno to ENOTTY"); +} + +FIXTURE(epoll_busy_poll) +{ + int fd; + struct epoll_params params; + struct epoll_params *invalid_params; + cap_t caps; +}; + +FIXTURE_SETUP(epoll_busy_poll) +{ + int ret; + + ret = epoll_create1(0); + EXPECT_NE(-1, ret) + TH_LOG("epoll_create1 failed?"); + + self->fd = ret; + + self->caps = cap_get_proc(); + EXPECT_NE(NULL, self->caps); +} + +FIXTURE_TEARDOWN(epoll_busy_poll) +{ + int ret; + + ret = close(self->fd); + EXPECT_EQ(0, ret); + + ret = cap_free(self->caps); + EXPECT_NE(-1, ret) + TH_LOG("unable to free capabilities"); +} + +TEST_F(epoll_busy_poll, test_get_params) +{ + /* begin by getting the epoll params from the kernel + * + * the default should be default and all fields should be zero'd by the + * kernel, so set params fields to garbage to test this. + */ + int ret = 0; + + self->params.busy_poll_usecs = 0xff; + self->params.busy_poll_budget = 0xff; + self->params.prefer_busy_poll = 1; + self->params.__pad = 0xf; + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + EXPECT_EQ(0, ret) + TH_LOG("ioctl EPIOCGPARAMS should succeed"); + + EXPECT_EQ(0, self->params.busy_poll_usecs) + TH_LOG("EPIOCGPARAMS busy_poll_usecs should have been 0"); + + EXPECT_EQ(0, self->params.busy_poll_budget) + TH_LOG("EPIOCGPARAMS busy_poll_budget should have been 0"); + + EXPECT_EQ(0, self->params.prefer_busy_poll) + TH_LOG("EPIOCGPARAMS prefer_busy_poll should have been 0"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("EPIOCGPARAMS __pad should have been 0"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCGPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS should error with invalid params"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCGPARAMS with invalid params should set errno to EFAULT"); +} + +TEST_F(epoll_busy_poll, test_set_invalid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.__pad = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS non-zero __pad should error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS non-zero __pad errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = (uint32_t)INT_MAX + 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_usecs > S32_MAX"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS busy_poll_usecs > S32_MAX errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 2; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error prefer_busy_poll > 1"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS prefer_busy_poll > 1 errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 1; + + /* set budget well above kernel's NAPI_POLL_WEIGHT of 64 */ + self->params.busy_poll_budget = UINT16_MAX; + + /* test harness should run with CAP_NET_ADMIN, but let's make sure */ + cap_flag_value_t tmp; + + ret = cap_get_flag(self->caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &tmp); + EXPECT_EQ(0, ret) + TH_LOG("unable to get CAP_NET_ADMIN cap flag"); + + EXPECT_EQ(CAP_SET, tmp) + TH_LOG("expecting CAP_NET_ADMIN to be set for the test harness"); + + /* at this point we know CAP_NET_ADMIN is available, so setting the + * params with a busy_poll_budget > NAPI_POLL_WEIGHT should succeed + */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS should allow busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* remove CAP_NET_ADMIN from our effective set */ + cap_value_t net_admin[] = { CAP_NET_ADMIN }; + + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR); + EXPECT_EQ(0, ret) + TH_LOG("couldn't clear CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should drop CAP_NET_ADMIN"); + + /* this is now expected to fail */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_budget > NAPI_POLL_WEIGHT"); + + EXPECT_EQ(EPERM, errno) + TH_LOG("EPIOCSPARAMS errno should be EPERM busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* restore CAP_NET_ADMIN to our effective set */ + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_SET); + EXPECT_EQ(0, ret) + TH_LOG("couldn't restore CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should set CAP_NET_ADMIN"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCSPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error when epoll_params is invalid"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCSPARAMS should set errno to EFAULT when epoll_params is invalid"); +} + +TEST_F(epoll_busy_poll, test_set_and_get_valid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.busy_poll_usecs = 25; + self->params.busy_poll_budget = 16; + self->params.prefer_busy_poll = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS with valid params should not error"); + + /* check that the kernel returns the same values back */ + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCGPARAMS should not error"); + + EXPECT_EQ(25, self->params.busy_poll_usecs) + TH_LOG("params.busy_poll_usecs incorrect"); + + EXPECT_EQ(16, self->params.busy_poll_budget) + TH_LOG("params.busy_poll_budget incorrect"); + + EXPECT_EQ(1, self->params.prefer_busy_poll) + TH_LOG("params.prefer_busy_poll incorrect"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("params.__pad was not 0"); +} + +TEST_F(epoll_busy_poll, test_invalid_ioctl) +{ + int invalid_ioctl = EPIOCGPARAMS + 10; + int ret; + + ret = ioctl(self->fd, invalid_ioctl, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("invalid ioctl should return error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("invalid ioctl should set errno to EINVAL"); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 51157a5559b7..7c01f58a20de 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} RTABLE=100 RTABLE_PEER=101 +RTABLE_VRF=102 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" +TESTS=" + fib_rule6 + fib_rule4 + fib_rule6_connect + fib_rule4_connect + fib_rule6_vrf + fib_rule4_vrf +" SELFTEST_PATH="" @@ -27,13 +35,18 @@ log_test() local expected=$2 local msg="$3" + $IP rule show | grep -q l3mdev + if [ $? -eq 0 ]; then + msg="$msg (VRF)" + fi + if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-50s [ OK ]\n" "${msg}" + printf "\n TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-50s [FAIL]\n" "${msg}" + printf "\n TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -130,6 +143,17 @@ cleanup_peer() ip netns del $peerns } +setup_vrf() +{ + $IP link add name vrf0 up type vrf table $RTABLE_VRF + $IP link set dev $DEV master vrf0 +} + +cleanup_vrf() +{ + $IP link del dev vrf0 +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -248,6 +272,13 @@ fib_rule6_test() fi } +fib_rule6_vrf_test() +{ + setup_vrf + fib_rule6_test + cleanup_vrf +} + # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly # taken into account when connecting the socket and when sending packets. fib_rule6_connect_test() @@ -385,6 +416,13 @@ fib_rule4_test() fi } +fib_rule4_vrf_test() +{ + setup_vrf + fib_rule4_test + cleanup_vrf +} + # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken # into account when connecting the socket and when sending packets. fib_rule4_connect_test() @@ -467,6 +505,8 @@ do fib_rule4_test|fib_rule4) fib_rule4_test;; fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; + fib_rule6_vrf_test|fib_rule6_vrf) fib_rule6_vrf_test;; + fib_rule4_vrf_test|fib_rule4_vrf) fib_rule4_vrf_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 535865b3d1d6..fa7b59ff4029 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ bridge_vlan_unaware.sh \ custom_multipath_hash.sh \ dual_vxlan_bridge.sh \ - ethtool_extended_state.sh \ - ethtool_mm.sh \ - ethtool_rmon.sh \ - ethtool.sh \ gre_custom_multipath_hash.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath_nh_res.sh \ gre_multipath_nh.sh \ gre_multipath.sh \ - hw_stats_l3.sh \ - hw_stats_l3_gre.sh \ ip6_forward_instats_vrf.sh \ ip6gre_custom_multipath_hash.sh \ ip6gre_flat_key.sh \ @@ -43,8 +37,8 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + lib_sh_test.sh \ local_termination.sh \ - loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ mirror_gre_bridge_1d_vlan.sh \ @@ -113,7 +107,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_symmetric.sh TEST_FILES := devlink_lib.sh \ - ethtool_lib.sh \ fib_offload_lib.sh \ forwarding.config.sample \ ip6gre_lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index b8a2af8fcfb7..7fdb6a9ca543 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies. o Code shall be checked using ShellCheck [1] prior to submission. 1. https://www.shellcheck.net/ + +Customization +============= + +The forwarding selftests framework uses a number of variables that +influence its behavior and tools it invokes, and how it invokes them, in +various ways. A number of these variables can be overridden. The way these +overridable variables are specified is typically one of the following two +syntaxes: + + : "${VARIABLE:=default_value}" + VARIABLE=${VARIABLE:=default_value} + +Any of these variables can be overridden. Notably net/forwarding/lib.sh and +net/lib.sh contain a number of overridable variables. + +One way of overriding these variables is through the environment: + + PAUSE_ON_FAIL=yes ./some_test.sh + +The variable NETIFS is special. Since it is an array variable, there is no +way to pass it through the environment. Its value can instead be given as +consecutive arguments to the selftest: + + ./some_test.sh swp{1..8} + +A way to customize variables in a persistent fashion is to create a file +named forwarding.config in this directory. lib.sh sources the file if +present, so it can contain any shell code. Typically it will contain +assignments of variables whose value should be overridden. + +forwarding.config.sample is available in the directory as an example of +how forwarding.config might look. diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index 2aa66d2a1702..e6a3e04fd83f 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -478,10 +478,10 @@ v3exc_timeout_test() RET=0 local X=("192.0.2.20" "192.0.2.30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ v3exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index e2b9ff773c6b..f84ab2e65754 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -478,10 +478,10 @@ mldv2exc_timeout_test() RET=0 local X=("2001:db8:1::20" "2001:db8:1::30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 mldv2exclude_prepare $h1 ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ mldv2exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 $MZPKT_ALLOW2 -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index 1fc4f0242fc5..f1ca95e79a65 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -3,51 +3,28 @@ ############################################################################## # Topology description. p1 looped back to p2, p3 to p4 and so on. -declare -A NETIFS -NETIFS[p1]=veth0 -NETIFS[p2]=veth1 -NETIFS[p3]=veth2 -NETIFS[p4]=veth3 -NETIFS[p5]=veth4 -NETIFS[p6]=veth5 -NETIFS[p7]=veth6 -NETIFS[p8]=veth7 -NETIFS[p9]=veth8 -NETIFS[p10]=veth9 +NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) # Port that does not have a cable connected. NETIF_NO_CABLE=eth8 ############################################################################## -# Defines +# In addition to the topology-related variables, it is also possible to override +# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other +# libraries or selftests use. E.g.: -# IPv4 ping utility name -PING=ping -# IPv6 ping utility name. Some distributions use 'ping' for IPv6. PING6=ping6 -# Packet generator. Some distributions use 'mz'. MZ=mausezahn -# mausezahn delay between transmissions in microseconds. -MZ_DELAY=0 -# Time to wait after interfaces participating in the test are all UP WAIT_TIME=5 -# Whether to pause on failure or not. -PAUSE_ON_FAIL=no -# Whether to pause on cleanup or not. -PAUSE_ON_CLEANUP=no -# Type of network interface to create -NETIF_TYPE=veth -# Whether to create virtual interfaces (veth) or not -NETIF_CREATE=yes -# Timeout (in seconds) before ping exits regardless of how many packets have -# been sent or received -PING_TIMEOUT=5 -# Minimum ageing_time (in centiseconds) supported by hardware -LOW_AGEING_TIME=1000 -# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process -# filter by software/hardware -TC_FLAG=skip_hw -# IPv6 traceroute utility name. -TROUTE6=traceroute6 - diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh index 30f36a57bae6..01e62c4ac94d 100644 --- a/tools/testing/selftests/net/forwarding/ipip_lib.sh +++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh @@ -141,7 +141,6 @@ # | $h2 + | # | 192.0.2.18/28 | # +---------------------------+ -source lib.sh h1_create() { diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index e579c2e0c462..112c85c35092 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -2,33 +2,124 @@ # SPDX-License-Identifier: GPL-2.0 ############################################################################## +# Topology description. p1 looped back to p2, p3 to p4 and so on. + +declare -A NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) + +# Port that does not have a cable connected. +: "${NETIF_NO_CABLE:=eth8}" + +############################################################################## # Defines -# Can be overridden by the configuration file. -PING=${PING:=ping} -PING6=${PING6:=ping6} -MZ=${MZ:=mausezahn} -MZ_DELAY=${MZ_DELAY:=0} -ARPING=${ARPING:=arping} -TEAMD=${TEAMD:=teamd} -WAIT_TIME=${WAIT_TIME:=5} -PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} -PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no} -NETIF_TYPE=${NETIF_TYPE:=veth} -NETIF_CREATE=${NETIF_CREATE:=yes} -MCD=${MCD:=smcrouted} -MC_CLI=${MC_CLI:=smcroutectl} -PING_COUNT=${PING_COUNT:=10} -PING_TIMEOUT=${PING_TIMEOUT:=5} -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} -INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600} -LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000} -REQUIRE_JQ=${REQUIRE_JQ:=yes} -REQUIRE_MZ=${REQUIRE_MZ:=yes} -REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no} -STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no} -TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=} -TROUTE6=${TROUTE6:=traceroute6} +# Networking utilities. +: "${PING:=ping}" +: "${PING6:=ping6}" # Some distros just use ping. +: "${ARPING:=arping}" +: "${TROUTE6:=traceroute6}" + +# Packet generator. +: "${MZ:=mausezahn}" # Some distributions use 'mz'. +: "${MZ_DELAY:=0}" + +# Host configuration tools. +: "${TEAMD:=teamd}" +: "${MCD:=smcrouted}" +: "${MC_CLI:=smcroutectl}" + +# Constants for netdevice bring-up: +# Default time in seconds to wait for an interface to come up before giving up +# and bailing out. Used during initial setup. +: "${INTERFACE_TIMEOUT:=600}" +# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts. +: "${WAIT_TIMEOUT:=20}" +# Time to wait after interfaces participating in the test are all UP. +: "${WAIT_TIME:=5}" + +# Whether to pause on, respectively, after a failure and before cleanup. +: "${PAUSE_ON_FAIL:=no}" +: "${PAUSE_ON_CLEANUP:=no}" + +# Whether to create virtual interfaces, and what netdevice type they should be. +: "${NETIF_CREATE:=yes}" +: "${NETIF_TYPE:=veth}" + +# Constants for ping tests: +# How many packets should be sent. +: "${PING_COUNT:=10}" +# Timeout (in seconds) before ping exits regardless of how many packets have +# been sent or received +: "${PING_TIMEOUT:=5}" + +# Minimum ageing_time (in centiseconds) supported by hardware +: "${LOW_AGEING_TIME:=1000}" + +# Whether to check for availability of certain tools. +: "${REQUIRE_JQ:=yes}" +: "${REQUIRE_MZ:=yes}" +: "${REQUIRE_MTOOLS:=no}" + +# Whether to override MAC addresses on interfaces participating in the test. +: "${STABLE_MAC_ADDRS:=no}" + +# Flags for tcpdump +: "${TCPDUMP_EXTRA_FLAGS:=}" + +# Flags for TC filters. +: "${TC_FLAG:=skip_hw}" + +# Whether the machine is "slow" -- i.e. might be incapable of running tests +# involving heavy traffic. This might be the case on a debug kernel, a VM, or +# e.g. a low-power board. +: "${KSFT_MACHINE_SLOW:=no}" + +############################################################################## +# Find netifs by test-specified driver name + +driver_name_get() +{ + local dev=$1; shift + local driver_path="/sys/class/net/$dev/device/driver" + + if [[ -L $driver_path ]]; then + basename `realpath $driver_path` + fi +} + +netif_find_driver() +{ + local ifnames=`ip -j link show | jq -r ".[].ifname"` + local count=0 + + for ifname in $ifnames + do + local driver_name=`driver_name_get $ifname` + if [[ ! -z $driver_name && $driver_name == $NETIF_FIND_DRIVER ]]; then + count=$((count + 1)) + NETIFS[p$count]="$ifname" + fi + done +} + +# Whether to find netdevice according to the driver speficied by the importer +: "${NETIF_FIND_DRIVER:=}" + +if [[ $NETIF_FIND_DRIVER ]]; then + unset NETIFS + declare -A NETIFS + netif_find_driver +fi net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") @@ -41,27 +132,9 @@ source "$net_forwarding_dir/../lib.sh" # timeout in seconds slowwait() { - local timeout=$1; shift - - local start_time="$(date -u +%s)" - while true - do - local out - out=$("$@") - local ret=$? - if ((!ret)); then - echo -n "$out" - return 0 - fi - - local current_time="$(date -u +%s)" - if ((current_time - start_time > timeout)); then - echo -n "$out" - return 1 - fi + local timeout_sec=$1; shift - sleep 0.1 - done + loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@" } ############################################################################## @@ -205,22 +278,23 @@ check_port_mab_support() fi } -skip_on_veth() +if [[ "$(id -u)" -ne 0 ]]; then + echo "SKIP: need root privileges" + exit $ksft_skip +fi + +check_driver() { - local kind=$(ip -j -d link show dev ${NETIFS[p1]} | - jq -r '.[].linkinfo.info_kind') + local dev=$1; shift + local expected=$1; shift + local driver_name=`driver_name_get $dev` - if [[ $kind == veth ]]; then - echo "SKIP: Test cannot be run with veth pairs" + if [[ $driver_name != $expected ]]; then + echo "SKIP: expected driver $expected for $dev, got $driver_name instead" exit $ksft_skip fi } -if [[ "$(id -u)" -ne 0 ]]; then - echo "SKIP: need root privileges" - exit $ksft_skip -fi - if [[ "$CHECK_TC" = "yes" ]]; then check_tc_version fi @@ -235,6 +309,21 @@ require_command() fi } +# IPv6 support was added in v3.0 +check_mtools_version() +{ + local version="$(msend -v)" + local major + + version=${version##msend version } + major=$(echo $version | cut -d. -f1) + + if [ $major -lt 3 ]; then + echo "SKIP: expected mtools version 3.0, got $version" + exit $ksft_skip + fi +} + if [[ "$REQUIRE_JQ" = "yes" ]]; then require_command jq fi @@ -242,15 +331,10 @@ if [[ "$REQUIRE_MZ" = "yes" ]]; then require_command $MZ fi if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then - # https://github.com/vladimiroltean/mtools/ - # patched for IPv6 support + # https://github.com/troglobit/mtools require_command msend require_command mreceive -fi - -if [[ ! -v NUM_NETIFS ]]; then - echo "SKIP: importer does not define \"NUM_NETIFS\"" - exit $ksft_skip + check_mtools_version fi ############################################################################## @@ -271,6 +355,23 @@ done ############################################################################## # Network interfaces configuration +if [[ ! -v NUM_NETIFS ]]; then + echo "SKIP: importer does not define \"NUM_NETIFS\"" + exit $ksft_skip +fi + +if (( NUM_NETIFS > ${#NETIFS[@]} )); then + echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})" + exit $ksft_skip +fi + +for i in $(seq ${#NETIFS[@]}); do + if [[ ! ${NETIFS[p$i]} ]]; then + echo "SKIP: NETIFS[p$i] not given" + exit $ksft_skip + fi +done + create_netif_veth() { local i @@ -358,14 +459,31 @@ EXIT_STATUS=0 # Per-test return value. Clear at the beginning of each test. RET=0 +ret_set_ksft_status() +{ + local ksft_status=$1; shift + local msg=$1; shift + + RET=$(ksft_status_merge $RET $ksft_status) + if (( $? )); then + retmsg=$msg + fi +} + +# Whether FAILs should be interpreted as XFAILs. Internal. +FAIL_TO_XFAIL= + check_err() { local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -ne 0 ]]; then - RET=$err - retmsg=$msg + if ((err)); then + if [[ $FAIL_TO_XFAIL = yes ]]; then + ret_set_ksft_status $ksft_xfail "$msg" + else + ret_set_ksft_status $ksft_fail "$msg" + fi fi } @@ -374,10 +492,7 @@ check_fail() local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -eq 0 ]]; then - RET=1 - retmsg=$msg - fi + check_err $((!err)) "$msg" } check_err_fail() @@ -393,6 +508,85 @@ check_err_fail() fi } +xfail_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW = yes ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +xfail_on_veth() +{ + local dev=$1; shift + local kind + + kind=$(ip -j -d link show dev $dev | + jq -r '.[].linkinfo.info_kind') + if [[ $kind = veth ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +log_test_result() +{ + local test_name=$1; shift + local opt_str=$1; shift + local result=$1; shift + local retmsg=$1; shift + + printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result" + if [[ $retmsg ]]; then + printf "\t%s\n" "$retmsg" + fi +} + +pause_on_fail() +{ + if [[ $PAUSE_ON_FAIL == yes ]]; then + echo "Hit enter to continue, 'q' to quit" + read a + [[ $a == q ]] && exit 1 + fi +} + +handle_test_result_pass() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" " OK " +} + +handle_test_result_fail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" FAIL "$retmsg" + pause_on_fail +} + +handle_test_result_xfail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" XFAIL "$retmsg" + pause_on_fail +} + +handle_test_result_skip() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" SKIP "$retmsg" +} + log_test() { local test_name=$1 @@ -402,31 +596,28 @@ log_test() opt_str="($opt_str)" fi - if [[ $RET -ne 0 ]]; then - EXIT_STATUS=1 - printf "TEST: %-60s [FAIL]\n" "$test_name $opt_str" - if [[ ! -z "$retmsg" ]]; then - printf "\t%s\n" "$retmsg" - fi - if [ "${PAUSE_ON_FAIL}" = "yes" ]; then - echo "Hit enter to continue, 'q' to quit" - read a - [ "$a" = "q" ] && exit 1 - fi - return 1 + if ((RET == ksft_pass)); then + handle_test_result_pass "$test_name" "$opt_str" + elif ((RET == ksft_xfail)); then + handle_test_result_xfail "$test_name" "$opt_str" + elif ((RET == ksft_skip)); then + handle_test_result_skip "$test_name" "$opt_str" + else + handle_test_result_fail "$test_name" "$opt_str" fi - printf "TEST: %-60s [ OK ]\n" "$test_name $opt_str" - return 0 + EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET) + return $RET } log_test_skip() { - local test_name=$1 - local opt_str=$2 + RET=$ksft_skip retmsg= log_test "$@" +} - printf "TEST: %-60s [SKIP]\n" "$test_name $opt_str" - return 0 +log_test_xfail() +{ + RET=$ksft_xfail retmsg= log_test "$@" } log_info() @@ -562,6 +753,19 @@ setup_wait() sleep $WAIT_TIME } +wait_for_dev() +{ + local dev=$1; shift + local timeout=${1:-$WAIT_TIMEOUT}; shift + + slowwait $timeout ip link show dev $dev &> /dev/null + if (( $? )); then + check_err 1 + log_test wait_for_dev "Interface $dev did not appear." + exit $EXIT_STATUS + fi +} + cmd_jq() { local cmd=$1 @@ -2011,6 +2215,8 @@ bail_on_lldpad() { local reason1="$1"; shift local reason2="$1"; shift + local caller=${FUNCNAME[1]} + local src=${BASH_SOURCE[1]} if systemctl is-active --quiet lldpad; then @@ -2031,7 +2237,8 @@ bail_on_lldpad() an environment variable ALLOW_LLDPAD to a non-empty string. EOF - exit 1 + log_test_skip $src:$caller + exit $EXIT_STATUS else return fi diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh new file mode 100755 index 000000000000..ff2accccaf4d --- /dev/null +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This tests the operation of lib.sh itself. + +ALL_TESTS=" + test_ret + test_exit_status +" +NUM_NETIFS=0 +source lib.sh + +# Simulated checks. + +do_test() +{ + local msg=$1; shift + + "$@" + check_err $? "$msg" +} + +tpass() +{ + do_test "tpass" true +} + +tfail() +{ + do_test "tfail" false +} + +txfail() +{ + FAIL_TO_XFAIL=yes do_test "txfail" false +} + +# Simulated tests. + +pass() +{ + RET=0 + do_test "true" true + log_test "true" +} + +fail() +{ + RET=0 + do_test "false" false + log_test "false" +} + +xfail() +{ + RET=0 + FAIL_TO_XFAIL=yes do_test "xfalse" false + log_test "xfalse" +} + +skip() +{ + RET=0 + log_test_skip "skip" +} + +slow_xfail() +{ + RET=0 + xfail_on_slow do_test "slow_false" false + log_test "slow_false" +} + +# lib.sh tests. + +ret_tests_run() +{ + local t + + RET=0 + retmsg= + for t in "$@"; do + $t + done + echo "$retmsg" + return $RET +} + +ret_subtest() +{ + local expect_ret=$1; shift + local expect_retmsg=$1; shift + local -a tests=( "$@" ) + + local status_names=(pass fail xfail xpass skip) + local ret + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(ret_tests_run "${tests[@]}") + ret=$? + + (( ret == expect_ret )) + check_err $? "RET=$ret expected $expect_ret" + + [[ $out == $expect_retmsg ]] + check_err $? "retmsg=$out expected $expect_retmsg" + + log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}" +} + +test_ret() +{ + ret_subtest $ksft_pass "" + + ret_subtest $ksft_pass "" tpass + ret_subtest $ksft_fail "tfail" tfail + ret_subtest $ksft_xfail "txfail" txfail + + ret_subtest $ksft_pass "" tpass tpass + ret_subtest $ksft_fail "tfail" tpass tfail + ret_subtest $ksft_xfail "txfail" tpass txfail + + ret_subtest $ksft_fail "tfail" tfail tpass + ret_subtest $ksft_xfail "txfail" txfail tpass + + ret_subtest $ksft_fail "tfail" tfail tfail + ret_subtest $ksft_fail "tfail" tfail txfail + + ret_subtest $ksft_fail "tfail" txfail tfail + + ret_subtest $ksft_xfail "txfail" txfail txfail +} + +exit_status_tests_run() +{ + EXIT_STATUS=0 + tests_run > /dev/null + return $EXIT_STATUS +} + +exit_status_subtest() +{ + local expect_exit_status=$1; shift + local tests=$1; shift + local what=$1; shift + + local status_names=(pass fail xfail xpass skip) + local exit_status + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(TESTS="$tests" exit_status_tests_run) + exit_status=$? + + (( exit_status == expect_exit_status )) + check_err $? "EXIT_STATUS=$exit_status, expected $expect_exit_status" + + log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}" +} + +test_exit_status() +{ + exit_status_subtest $ksft_pass ":" + + exit_status_subtest $ksft_pass "pass" + exit_status_subtest $ksft_fail "fail" + exit_status_subtest $ksft_pass "xfail" + exit_status_subtest $ksft_skip "skip" + + exit_status_subtest $ksft_pass "pass pass" + exit_status_subtest $ksft_fail "pass fail" + exit_status_subtest $ksft_pass "pass xfail" + exit_status_subtest $ksft_skip "pass skip" + + exit_status_subtest $ksft_fail "fail pass" + exit_status_subtest $ksft_pass "xfail pass" + exit_status_subtest $ksft_skip "skip pass" + + exit_status_subtest $ksft_fail "fail fail" + exit_status_subtest $ksft_fail "fail xfail" + exit_status_subtest $ksft_fail "fail skip" + + exit_status_subtest $ksft_fail "xfail fail" + exit_status_subtest $ksft_fail "skip fail" + + exit_status_subtest $ksft_pass "xfail xfail" + exit_status_subtest $ksft_skip "xfail skip" + exit_status_subtest $ksft_skip "skip xfail" + + exit_status_subtest $ksft_skip "skip skip" + + KSFT_MACHINE_SLOW=yes \ + exit_status_subtest $ksft_pass "slow_xfail" ": slow" + + KSFT_MACHINE_SLOW=no \ + exit_status_subtest $ksft_fail "slow_xfail" ": fast" +} + +trap pre_cleanup EXIT + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c5b0cbc85b3e..4b364cdf3ef0 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -155,25 +155,30 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \ - "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Multicast IPv4 to unknown group" \ + "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ @@ -187,9 +192,10 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true - check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ - "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ + "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ + false check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 3f0f5dc95542..2ba44247c60a 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 7e7d62161c34..2903294d8bca 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw() local group_id=$1; shift local mz="$@" - local used - nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get "${mz[@]}" - used=$(ip -s -j -d nexthop show id $group_id | - jq '.[].hw_stats.used') - kind=$(ip -j -d link show dev $rp11 | - jq -r '.[].linkinfo.info_kind') - if [[ $used == true ]]; then + xfail_on_veth $rp11 \ nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get_hw "${mz[@]}" - elif [[ $kind == veth ]]; then - log_test_skip "HW stats not offloaded on veth topology" - fi } nh_stats_test_dispatch() @@ -83,7 +74,6 @@ nh_stats_test_dispatch() local mz="$@" local enabled - local kind if ! ip nexthop help 2>&1 | grep -q hw_stats; then log_test_skip "NH stats test: ip doesn't support HW stats" diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index 4b483d24ad00..cd9e346436fc 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh index f3a53738bdcc..92904b01eae9 100755 --- a/tools/testing/selftests/net/forwarding/router_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_nh.sh @@ -1,6 +1,20 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +-------------------------+ +# | H1 | | H2 | +# | $h1 + | | $h2 + | +# | 192.0.2.2/24 | | | 198.51.100.2/24 | | +# | 2001:db8:1::2/64 | | | 2001:db8:2::2/64 | | +# +-------------------|-----+ +-------------------|-----+ +# | | +# +-------------------|----------------------------|-----+ +# | R1 | | | +# | $rp1 + $rp2 + | +# | 192.0.2.1/24 198.51.100.1/24 | +# | 2001:db8:1::1/64 2001:db8:2::1/64 | +# +------------------------------------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index cdf689e99458..f9d26a7911bb 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -199,25 +199,28 @@ ets_set_dwrr_two_bands() ets_test_strict() { ets_set_strict - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_mixed() { ets_set_mixed - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_dwrr() { ets_set_dwrr_uniform - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_varying - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_change_quantum - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_two_bands - ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_01 } diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh index 81f31179ac88..17f28644568e 100755 --- a/tools/testing/selftests/net/forwarding/sch_red.sh +++ b/tools/testing/selftests/net/forwarding/sch_red.sh @@ -451,35 +451,35 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn - do_ecn_test $BACKLOG + xfail_on_slow do_ecn_test $BACKLOG uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop - do_ecn_nodrop_test $BACKLOG + xfail_on_slow do_ecn_nodrop_test $BACKLOG uninstall_qdisc } red_test() { install_qdisc - do_red_test $BACKLOG + xfail_on_slow do_red_test $BACKLOG uninstall_qdisc } red_qevent_test() { install_qdisc qevent early_drop block 10 - do_red_qevent_test $BACKLOG + xfail_on_slow do_red_qevent_test $BACKLOG uninstall_qdisc } ecn_qevent_test() { install_qdisc ecn qevent mark block 10 - do_ecn_qevent_test $BACKLOG + xfail_on_slow do_ecn_qevent_test $BACKLOG uninstall_qdisc } diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh index d1f26cb7cd73..9cd884d4a5de 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -227,7 +227,7 @@ do_tbf_test() local nr=$(rate $t2 $t3 10) local nr_pct=$((100 * (nr - er) / er)) ((-5 <= nr_pct && nr_pct <= 5)) - check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." + xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" } diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index bce8bb8d2b6f..2e3326edfa9a 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -4,7 +4,7 @@ CHECK_TC="yes" # Can be overridden by the configuration file. See lib.sh -TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms +: "${TC_HIT_TIMEOUT:=1000}" # ms tc_check_packets() { diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh index 5a5dd9034819..79775b10b99f 100755 --- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh +++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh @@ -1,7 +1,5 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 ALL_TESTS="tunnel_key_nofrag_test" diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 353e1e867fbb..b2184847e388 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -93,6 +93,7 @@ static bool tx_socket = true; static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; +static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) { @@ -119,6 +120,9 @@ static void setup_sock_filter(int fd) next_off = offsetof(struct ipv6hdr, nexthdr); ipproto_off = ETH_HLEN + next_off; + /* Overridden later if exthdrs are used: */ + opt_ipproto_off = ipproto_off; + if (strcmp(testname, "ip") == 0) { if (proto == PF_INET) optlen = sizeof(struct ip_timestamp); @@ -617,6 +621,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE); } +static void fix_ip4_checksum(struct iphdr *iph) +{ + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); +} + +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +{ + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN]; + bool send_three = false; + struct iphdr *iph1; + struct iphdr *iph2; + struct iphdr *iph3; + + iph1 = (struct iphdr *)(buf1 + ETH_HLEN); + iph2 = (struct iphdr *)(buf2 + ETH_HLEN); + iph3 = (struct iphdr *)(buf3 + ETH_HLEN); + + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0); + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + + switch (tcase) { + case 0: /* DF=1, Incrementing - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + break; + + case 1: /* DF=1, Fixed - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + break; + + case 2: /* DF=0, Incrementing - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(9); + break; + + case 3: /* DF=0, Fixed - should not coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(8); + break; + + case 4: /* DF=1, two packets incrementing, and one fixed - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + + case 5: /* DF=1, two packets fixed, and one incrementing - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + } + + fix_ip4_checksum(iph1); + fix_ip4_checksum(iph2); + write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr); + write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr); + + if (send_three) { + fix_ip4_checksum(iph3); + write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) +{ + for (int i = 0; i < num_flush_id_cases; i++) { + sleep(1); + send_flush_id_case(fd, daddr, i); + sleep(1); + write_packet(fd, fin_pkt, total_hdr_len, daddr); + } +} + static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -935,6 +1046,8 @@ static void gro_sender(void) send_fragment4(txfd, &daddr); sleep(1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + test_flush_id(txfd, &daddr, fin_pkt); } else if (proto == PF_INET6) { sleep(1); send_fragment6(txfd, &daddr); @@ -1061,6 +1174,34 @@ static void gro_receiver(void) printf("fragmented ip4 doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); + + /* is_atomic checks */ + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Fixed - should not coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); } else if (proto == PF_INET6) { /* GRO doesn't check for ipv6 hop limit when flushing. * Hence no corresponding test to the ipv4 case. diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile index 92c1d9d080cd..884cd2cc0681 100644 --- a/tools/testing/selftests/net/hsr/Makefile +++ b/tools/testing/selftests/net/hsr/Makefile @@ -2,6 +2,7 @@ top_srcdir = ../../../../.. -TEST_PROGS := hsr_ping.sh +TEST_PROGS := hsr_ping.sh hsr_redbox.sh +TEST_FILES += hsr_common.sh include ../../lib.mk diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh new file mode 100644 index 000000000000..8e97b1f2e7e5 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_common.sh @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: GPL-2.0 +# Common code for HSR testing scripts + +source ../lib.sh +ret=0 +ksft_skip=4 + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_ping() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 2" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + return 0 +} + +do_ping_long() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 10" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" + if [ $? -ne 0 ] ; then + echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + VAL="$(echo $OUT | cut -d' ' -f1-8)" + SED_VAL="$(echo ${VAL} | sed -r -e 's/([0-9]{2}).*([0-9]{2}).*[[:space:]]([0-9]+%).*/\1 transmitted \2 received \3 loss/')" + if [ "${SED_VAL}" != "10 transmitted 10 received 0% loss" ] + then + echo "$netns -> $connect_addr ping TEST [ FAIL ]" + echo "Expect to send and receive 10 packets and no duplicates." + echo "Full message: ${OUT}." + ret=1 + return 1 + fi + + return 0 +} + +stop_if_error() +{ + local msg="$1" + + if [ ${ret} -ne 0 ]; then + echo "FAIL: ${msg}" 1>&2 + exit ${ret} + fi +} + +check_prerequisites() +{ + ip -Version > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip + fi +} diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 1c6457e54625..790294c8af83 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -1,10 +1,10 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ret=0 -ksft_skip=4 ipv6=true +source ./hsr_common.sh + optstring="h4" usage() { echo "Usage: $0 [OPTION]" @@ -27,88 +27,6 @@ while getopts "$optstring" option;do esac done -sec=$(date +%s) -rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) -ns1="ns1-$rndh" -ns2="ns2-$rndh" -ns3="ns3-$rndh" - -cleanup() -{ - local netns - for netns in "$ns1" "$ns2" "$ns3" ;do - ip netns del $netns - done -} - -# $1: IP address -is_v6() -{ - [ -z "${1##*:*}" ] -} - -do_ping() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 2" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - return 0 -} - -do_ping_long() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 10" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - VAL="$(echo $OUT | cut -d' ' -f1-8)" - if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ] - then - echo "$netns -> $connect_addr ping TEST [ FAIL ]" - echo "Expect to send and receive 10 packets and no duplicates." - echo "Full message: ${OUT}." - ret=1 - return 1 - fi - - return 0 -} - -stop_if_error() -{ - local msg="$1" - - if [ ${ret} -ne 0 ]; then - echo "FAIL: ${msg}" 1>&2 - exit ${ret} - fi -} - do_complete_ping_test() { echo "INFO: Initial validation ping." @@ -248,27 +166,13 @@ setup_hsr_interfaces() ip -net "$ns3" link set hsr3 up } -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -trap cleanup EXIT +check_prerequisites +setup_ns ns1 ns2 ns3 -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done +trap cleanup_all_ns EXIT setup_hsr_interfaces 0 do_complete_ping_test -cleanup - -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done setup_hsr_interfaces 1 do_complete_ping_test diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh new file mode 100755 index 000000000000..1f36785347c0 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ipv6=false + +source ./hsr_common.sh + +do_complete_ping_test() +{ + echo "INFO: Initial validation ping (HSR-SAN/RedBox)." + # Each node has to be able to reach each one. + do_ping "${ns1}" 100.64.0.2 + do_ping "${ns2}" 100.64.0.1 + # Ping between SANs (test bridge) + do_ping "${ns4}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.41 + # Ping from SANs to hsr1 (via hsr2) (and opposite) + do_ping "${ns3}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.3 + do_ping "${ns1}" 100.64.0.41 + do_ping "${ns4}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.1 + stop_if_error "Initial validation failed." + + # Wait for MGNT HSR frames being received and nodes being + # merged. + sleep 5 + + echo "INFO: Longer ping test (HSR-SAN/RedBox)." + # Ping from SAN to hsr1 (via hsr2) + do_ping_long "${ns3}" 100.64.0.1 + # Ping from hsr1 (via hsr2) to SANs (and opposite) + do_ping_long "${ns1}" 100.64.0.3 + do_ping_long "${ns1}" 100.64.0.41 + do_ping_long "${ns4}" 100.64.0.1 + do_ping_long "${ns1}" 100.64.0.51 + do_ping_long "${ns5}" 100.64.0.1 + stop_if_error "Longer ping test failed." + + echo "INFO: All good." +} + +setup_hsr_interfaces() +{ + local HSRv="$1" + + echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)." +# +# IPv4 addresses (100.64.X.Y/24), and [X.Y] is presented on below diagram: +# +# +# |NS1 | |NS4 | +# | [0.1] | | | +# | /-- hsr1 --\ | | [0.41] | +# | ns1eth1 ns1eth2 | | ns4eth1 (SAN) | +# |------------------------| |-------------------| +# | | | +# | | | +# | | | +# |------------------------| |-------------------------------| +# | ns2eth1 ns2eth2 | | ns3eth2 | +# | \-- hsr2 --/ | | / | +# | [0.2] \ | | / | |------------| +# | ns2eth3 |---| ns3eth1 -- ns3br1 -- ns3eth3--|--| ns5eth1 | +# | (interlink)| | [0.3] [0.11] | | [0.51] | +# |NS2 (RedBOX) | |NS3 (BR) | | NS5 (SAN) | +# +# + # Check if iproute2 supports adding interlink port to hsrX device + ip link help hsr | grep -q INTERLINK + [ $? -ne 0 ] && { echo "iproute2: HSR interlink interface not supported!"; exit 0; } + + # Create interfaces for name spaces + ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}" + ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}" + ip link add ns2eth3 netns "${ns2}" type veth peer name ns3eth1 netns "${ns3}" + ip link add ns3eth2 netns "${ns3}" type veth peer name ns4eth1 netns "${ns4}" + ip link add ns3eth3 netns "${ns3}" type veth peer name ns5eth1 netns "${ns5}" + + sleep 1 + + ip -n "${ns1}" link set ns1eth1 up + ip -n "${ns1}" link set ns1eth2 up + + ip -n "${ns2}" link set ns2eth1 up + ip -n "${ns2}" link set ns2eth2 up + ip -n "${ns2}" link set ns2eth3 up + + ip -n "${ns3}" link add name ns3br1 type bridge + ip -n "${ns3}" link set ns3br1 up + ip -n "${ns3}" link set ns3eth1 master ns3br1 up + ip -n "${ns3}" link set ns3eth2 master ns3br1 up + ip -n "${ns3}" link set ns3eth3 master ns3br1 up + + ip -n "${ns4}" link set ns4eth1 up + ip -n "${ns5}" link set ns5eth1 up + + ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0 + ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0 + + ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1 + ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2 + ip -n "${ns3}" addr add 100.64.0.11/24 dev ns3br1 + ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1 + ip -n "${ns4}" addr add 100.64.0.41/24 dev ns4eth1 + ip -n "${ns5}" addr add 100.64.0.51/24 dev ns5eth1 + + ip -n "${ns1}" link set hsr1 up + ip -n "${ns2}" link set hsr2 up +} + +check_prerequisites +setup_ns ns1 ns2 ns3 ns4 ns5 + +trap cleanup_all_ns EXIT + +setup_hsr_interfaces 1 +do_complete_ping_test + +exit $ret diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c index 193b82745fd8..29451d2244b7 100644 --- a/tools/testing/selftests/net/ip_local_port_range.c +++ b/tools/testing/selftests/net/ip_local_port_range.c @@ -359,7 +359,7 @@ TEST_F(ip_local_port_range, late_bind) struct sockaddr_in v4; struct sockaddr_in6 v6; } addr; - socklen_t addr_len; + socklen_t addr_len = 0; const int one = 1; int fd, err; __u32 range; diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index f9fe182dfbd4..72b191e4e064 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -4,19 +4,64 @@ ############################################################################## # Defines -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} +: "${WAIT_TIMEOUT:=20}" + BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms -# Kselftest framework requirement - SKIP code is 4. +# Kselftest framework constants. +ksft_pass=0 +ksft_fail=1 +ksft_xfail=2 ksft_skip=4 + # namespace list created by setup_ns NS_LIST="" ############################################################################## # Helpers -busywait() + +__ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + local -A weights + local weight=0 + + for i in "$@"; do + weights[$i]=$((weight++)) + done + + if [[ ${weights[$a]} > ${weights[$b]} ]]; then + echo "$a" + return 0 + else + echo "$b" + return 1 + fi +} + +ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_pass $ksft_xfail $ksft_skip $ksft_fail +} + +ksft_exit_status_merge() { - local timeout=$1; shift + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_xfail $ksft_pass $ksft_skip $ksft_fail +} + +loopy_wait() +{ + local sleep_cmd=$1; shift + local timeout_ms=$1; shift local start_time="$(date -u +%s%3N)" while true @@ -30,13 +75,22 @@ busywait() fi local current_time="$(date -u +%s%3N)" - if ((current_time - start_time > timeout)); then + if ((current_time - start_time > timeout_ms)); then echo -n "$out" return 1 fi + + $sleep_cmd done } +busywait() +{ + local timeout_ms=$1; shift + + loopy_wait : "$timeout_ms" "$@" +} + cleanup_ns() { local ns="" @@ -73,15 +127,17 @@ setup_ns() local ns="" local ns_name="" local ns_list="" + local ns_exist= for ns_name in "$@"; do # Some test may setup/remove same netns multi times if unset ${ns_name} 2> /dev/null; then ns="${ns_name,,}-$(mktemp -u XXXXXX)" eval readonly ${ns_name}="$ns" + ns_exist=false else eval ns='$'${ns_name} cleanup_ns "$ns" - + ns_exist=true fi if ! ip netns add "$ns"; then @@ -90,7 +146,7 @@ setup_ns() return $ksft_skip fi ip -n "$ns" link set lo up - ns_list="$ns_list $ns" + ! $ns_exist && ns_list="$ns_list $ns" done NS_LIST="$NS_LIST $ns_list" } diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore new file mode 100644 index 000000000000..1ebc6187f421 --- /dev/null +++ b/tools/testing/selftests/net/lib/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +csum diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile new file mode 100644 index 000000000000..82c3264b115e --- /dev/null +++ b/tools/testing/selftests/net/lib/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES) +# Additional include paths needed by kselftest.h +CFLAGS += -I../../ + +TEST_FILES := ../../../../../Documentation/netlink/specs +TEST_FILES += ../../../../net/ynl + +TEST_GEN_FILES += csum + +TEST_INCLUDES := $(wildcard py/*.py) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/lib/csum.c index 90eb06fefa59..b9f3fc3c3426 100644 --- a/tools/testing/selftests/net/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -682,7 +682,7 @@ static int recv_verify_packet_ipv6(void *nh, int len) } /* return whether auxdata includes TP_STATUS_CSUM_VALID */ -static bool recv_verify_packet_csum(struct msghdr *msg) +static uint32_t recv_get_packet_csum_status(struct msghdr *msg) { struct tpacket_auxdata *aux = NULL; struct cmsghdr *cm; @@ -706,7 +706,7 @@ static bool recv_verify_packet_csum(struct msghdr *msg) if (!aux) error(1, 0, "cmsg: no auxdata"); - return aux->tp_status & TP_STATUS_CSUM_VALID; + return aux->tp_status; } static int recv_packet(int fd) @@ -716,6 +716,7 @@ static int recv_packet(int fd) char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; struct pkt *buf = (void *)_buf; struct msghdr msg = {0}; + uint32_t tp_status; struct iovec iov; int len, ret; @@ -737,6 +738,17 @@ static int recv_packet(int fd) if (len == -1) error(1, errno, "recv p"); + tp_status = recv_get_packet_csum_status(&msg); + + /* GRO might coalesce randomized packets. Such GSO packets are + * then reinitialized for csum offload (CHECKSUM_PARTIAL), with + * a pseudo csum. Do not try to validate these checksums. + */ + if (tp_status & TP_STATUS_CSUMNOTREADY) { + fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n"); + continue; + } + if (cfg_family == PF_INET6) ret = recv_verify_packet_ipv6(buf, len); else @@ -753,7 +765,7 @@ static int recv_packet(int fd) * Do not fail if kernel does not validate a good csum: * Absence of validation does not imply invalid. */ - if (recv_verify_packet_csum(&msg) && cfg_bad_csum) { + if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) { fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); bad_validations++; } diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py new file mode 100644 index 000000000000..b6d498d125fe --- /dev/null +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .consts import KSRC +from .ksft import * +from .netns import NetNS +from .nsim import * +from .utils import * +from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py new file mode 100644 index 000000000000..f518ce79d82c --- /dev/null +++ b/tools/testing/selftests/net/lib/py/consts.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../..").resolve() +KSRC = (Path(__file__).parent / "../../../../../..").resolve() + +KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py new file mode 100644 index 000000000000..4769b4eb1ea1 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: GPL-2.0 + +import builtins +import inspect +import sys +import time +import traceback +from .consts import KSFT_MAIN_NAME + +KSFT_RESULT = None +KSFT_RESULT_ALL = True + + +class KsftFailEx(Exception): + pass + + +class KsftSkipEx(Exception): + pass + + +class KsftXfailEx(Exception): + pass + + +def ksft_pr(*objs, **kwargs): + print("#", *objs, **kwargs) + + +def _fail(*args): + global KSFT_RESULT + KSFT_RESULT = False + + frame = inspect.stack()[2] + ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + ksft_pr(*args) + + +def ksft_eq(a, b, comment=""): + global KSFT_RESULT + if a != b: + _fail("Check failed", a, "!=", b, comment) + + +def ksft_true(a, comment=""): + if not a: + _fail("Check failed", a, "does not eval to True", comment) + + +def ksft_in(a, b, comment=""): + if a not in b: + _fail("Check failed", a, "not in", b, comment) + + +def ksft_ge(a, b, comment=""): + if a < b: + _fail("Check failed", a, "<", b, comment) + + +class ksft_raises: + def __init__(self, expected_type): + self.exception = None + self.expected_type = expected_type + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised") + elif self.expected_type != exc_type: + _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}") + self.exception = exc_val + # Suppress the exception if its the expected one + return self.expected_type == exc_type + + +def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): + end = time.monotonic() + deadline + while True: + if cond(): + return + if time.monotonic() > end: + _fail("Waiting for condition timed out", comment) + return + time.sleep(sleep) + + +def ktap_result(ok, cnt=1, case="", comment=""): + global KSFT_RESULT_ALL + KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok + + res = "" + if not ok: + res += "not " + res += "ok " + res += str(cnt) + " " + res += KSFT_MAIN_NAME + if case: + res += "." + str(case.__name__) + if comment: + res += " # " + comment + print(res) + + +def ksft_run(cases=None, globs=None, case_pfx=None, args=()): + cases = cases or [] + + if globs and case_pfx: + for key, value in globs.items(): + if not callable(value): + continue + for prefix in case_pfx: + if key.startswith(prefix): + cases.append(value) + break + + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} + + print("KTAP version 1") + print("1.." + str(len(cases))) + + global KSFT_RESULT + cnt = 0 + for case in cases: + KSFT_RESULT = True + cnt += 1 + try: + case(*args) + except KsftSkipEx as e: + ktap_result(True, cnt, case, comment="SKIP " + str(e)) + totals['skip'] += 1 + continue + except KsftXfailEx as e: + ktap_result(True, cnt, case, comment="XFAIL " + str(e)) + totals['xfail'] += 1 + continue + except Exception as e: + tb = traceback.format_exc() + for line in tb.strip().split('\n'): + ksft_pr("Exception|", line) + ktap_result(False, cnt, case) + totals['fail'] += 1 + continue + + ktap_result(KSFT_RESULT, cnt, case) + if KSFT_RESULT: + totals['pass'] += 1 + else: + totals['fail'] += 1 + + print( + f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" + ) + + +def ksft_exit(): + global KSFT_RESULT_ALL + sys.exit(0 if KSFT_RESULT_ALL else 1) diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py new file mode 100644 index 000000000000..ecff85f9074f --- /dev/null +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .utils import ip +import random +import string + + +class NetNS: + def __init__(self, name=None): + if name: + self.name = name + else: + self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + ip('netns add ' + self.name) + + def __del__(self): + if self.name: + ip('netns del ' + self.name) + self.name = None + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + self.__del__() + + def __str__(self): + return self.name + + def __repr__(self): + return f"NetNS({self.name})" diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py new file mode 100644 index 000000000000..f571a8b3139b --- /dev/null +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json +import os +import random +import re +import time +from .utils import cmd, ip + + +class NetdevSim: + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname, ns=None): + # In case udev renamed the netdev to according to new schema, + # check if the name matches the port_index. + nsimnamere = re.compile(r"eni\d+np(\d+)") + match = nsimnamere.match(ifname) + if match and int(match.groups()[0]) != port_index + 1: + raise Exception("netdevice name mismatches the expected one") + + self.ifname = ifname + self.nsimdev = nsimdev + self.port_index = port_index + self.ns = ns + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) + ret = ip("-j link show dev %s" % ifname, ns=ns) + self.dev = json.loads(ret.stdout)[0] + self.ifindex = self.dev["ifindex"] + + def dfs_write(self, path, val): + self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val) + + +class NetdevSimDev: + """ + Class for netdevsim bus device and its attributes. + """ + @staticmethod + def ctrl_write(path, val): + fullpath = os.path.join("/sys/bus/netdevsim/", path) + with open(fullpath, "w") as f: + f.write(val) + + def dfs_write(self, path, val): + fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path) + with open(fullpath, "w") as f: + f.write(val) + + def __init__(self, port_count=1, queue_count=1, ns=None): + # nsim will spawn in init_net, we'll set to actual ns once we switch it there + self.ns = None + + if not os.path.exists("/sys/bus/netdevsim"): + cmd("modprobe netdevsim") + + addr = random.randrange(1 << 15) + while True: + try: + self.ctrl_write("new_device", "%u %u %u" % (addr, port_count, queue_count)) + except OSError as e: + if e.errno == errno.ENOSPC: + addr = random.randrange(1 << 15) + continue + raise e + break + self.addr = addr + + # As probe of netdevsim device might happen from a workqueue, + # so wait here until all netdevs appear. + self.wait_for_netdevs(port_count) + + if ns: + cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}") + self.ns = ns + + cmd("udevadm settle", ns=self.ns) + ifnames = self.get_ifnames() + + self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr + + self.nsims = [] + for port_index in range(port_count): + self.nsims.append(self._make_port(port_index, ifnames[port_index])) + + self.removed = False + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.remove() + + def _make_port(self, port_index, ifname): + return NetdevSim(self, port_index, ifname, self.ns) + + def get_ifnames(self): + ifnames = [] + listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/", + ns=self.ns).stdout.split() + for ifname in listdir: + ifnames.append(ifname) + ifnames.sort() + return ifnames + + def wait_for_netdevs(self, port_count): + timeout = 5 + timeout_start = time.time() + + while True: + try: + ifnames = self.get_ifnames() + except FileNotFoundError as e: + ifnames = [] + if len(ifnames) == port_count: + break + if time.time() < timeout_start + timeout: + continue + raise Exception("netdevices did not appear within timeout") + + def remove(self): + if not self.removed: + self.ctrl_write("del_device", "%u" % (self.addr, )) + self.removed = True + + def remove_nsim(self, nsim): + self.nsims.remove(nsim) + self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), + "%u" % (nsim.port_index, )) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py new file mode 100644 index 000000000000..0540ea24921d --- /dev/null +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json as _json +import random +import re +import subprocess +import time + + +class cmd: + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None, timeout=5): + if ns: + comm = f'ip netns exec {ns} ' + comm + + self.stdout = None + self.stderr = None + self.ret = None + + self.comm = comm + if host: + self.proc = host.cmd(comm) + else: + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not background: + self.process(terminate=False, fail=fail, timeout=timeout) + + def process(self, terminate=True, fail=None, timeout=5): + if fail is None: + fail = not terminate + + if terminate: + self.proc.terminate() + stdout, stderr = self.proc.communicate(timeout) + self.stdout = stdout.decode("utf-8") + self.stderr = stderr.decode("utf-8") + self.proc.stdout.close() + self.proc.stderr.close() + self.ret = self.proc.returncode + + if self.proc.returncode != 0 and fail: + if len(stderr) > 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % + (self.proc.args, stdout, stderr)) + + +class bkg(cmd): + def __init__(self, comm, shell=True, fail=None, ns=None, host=None, + exit_wait=False): + super().__init__(comm, background=True, + shell=shell, fail=fail, ns=ns, host=host) + self.terminate = not exit_wait + self.check_fail = fail + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + return self.process(terminate=self.terminate, fail=self.check_fail) + + +def tool(name, args, json=None, ns=None, host=None): + cmd_str = name + ' ' + if json: + cmd_str += '--json ' + cmd_str += args + cmd_obj = cmd(cmd_str, ns=ns, host=host) + if json: + return _json.loads(cmd_obj.stdout) + return cmd_obj + + +def ip(args, json=None, ns=None, host=None): + if ns: + args = f'-netns {ns} ' + args + return tool('ip', args, json=json, host=host) + + +def rand_port(): + """ + Get unprivileged port, for now just random, one day we may decide to check if used. + """ + return random.randint(10000, 65535) + + +def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): + end = time.monotonic() + deadline + + pattern = f":{port:04X} .* " + if proto == "tcp": # for tcp protocol additionally check the socket state + pattern += "0A" + pattern = re.compile(pattern) + + while True: + data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout + for row in data.split("\n"): + if pattern.search(row): + return + if time.monotonic() > end: + raise Exception("Waiting for port listen timed out") + time.sleep(sleep) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py new file mode 100644 index 000000000000..1ace58370c06 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path +from .consts import KSRC, KSFT_DIR +from .ksft import ksft_pr, ktap_result + +# Resolve paths +try: + if (KSFT_DIR / "kselftest-list.txt").exists(): + # Running in "installed" selftests + tools_full_path = KSFT_DIR + SPEC_PATH = KSFT_DIR / "net/lib/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.lib.ynl.lib import YnlFamily, NlError + else: + # Running in tree + tools_full_path = KSRC / "tools" + SPEC_PATH = KSRC / "Documentation/netlink/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.ynl.lib import YnlFamily, NlError +except ModuleNotFoundError as e: + ksft_pr("Failed importing `ynl` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +# +# Wrapper classes, loading the right specs +# Set schema='' to avoid jsonschema validation, it's slow +# +class EthtoolFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), + schema='') + + +class RtnlFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + schema='') + + +class NetdevFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), + schema='') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index bc97ab33a00e..776d43a6922d 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -200,6 +200,58 @@ chk_msk_cestab() "${expected}" "${msg}" "" } +msk_info_get_value() +{ + local port="${1}" + local info="${2}" + + ss -N "${ns}" -inHM dport "${port}" | \ + mptcp_lib_get_info_value "${info}" "${info}" +} + +chk_msk_info() +{ + local port="${1}" + local info="${2}" + local cnt="${3}" + local msg="....chk ${info}" + local delta_ms=250 # half what we waited before, just to be sure + local now + + now=$(msk_info_get_value "${port}" "${info}") + + mptcp_lib_print_title "${msg}" + if { [ -z "${cnt}" ] || [ -z "${now}" ]; } && + ! mptcp_lib_expect_all_features; then + mptcp_lib_pr_skip "Feature probably not supported" + mptcp_lib_result_skip "${msg}" + elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then + mptcp_lib_pr_ok + mptcp_lib_result_pass "${msg}" + else + mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \ + "expected at least ${delta_ms}ms" + mptcp_lib_result_fail "${msg}" + ret=${KSFT_FAIL} + fi +} + +chk_last_time_info() +{ + local port="${1}" + local data_sent data_recv ack_recv + + data_sent=$(msk_info_get_value "${port}" "last_data_sent") + data_recv=$(msk_info_get_value "${port}" "last_data_recv") + ack_recv=$(msk_info_get_value "${port}" "last_ack_recv") + + sleep 0.5 # wait to check after if the timestamps difference + + chk_msk_info "${port}" "last_data_sent" "${data_sent}" + chk_msk_info "${port}" "last_data_recv" "${data_recv}" + chk_msk_info "${port}" "last_ack_recv" "${ack_recv}" +} + wait_connected() { local listener_ns="${1}" @@ -233,6 +285,7 @@ echo "b" | \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " +chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" chk_msk_inuse 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4131f3263a48..b77fb7065bfb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -147,7 +147,7 @@ cleanup() mptcp_lib_check_mptcp mptcp_lib_check_kallsyms -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc sin=$(mktemp) sout=$(mktemp) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e4403236f655..fefa9173bdaa 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -125,8 +124,8 @@ init_shapers() { local i for i in $(seq 1 4); do - tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1 - tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1 + tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms + tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms done } @@ -142,7 +141,7 @@ init() { mptcp_lib_check_mptcp mptcp_lib_check_kallsyms - mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}" + mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}" sin=$(mktemp) sout=$(mktemp) @@ -606,173 +605,65 @@ kill_events_pids() pm_nl_set_limits() { - local ns=$1 - local addrs=$2 - local subflows=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows - else - ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows - fi + mptcp_lib_pm_nl_set_limits "${@}" } pm_nl_add_endpoint() { - local ns=$1 - local addr=$2 - local flags _flags - local port _port - local dev _dev - local id _id - local nr=2 - - local p - for p in "${@}" - do - if [ $p = "flags" ]; then - eval _flags=\$"$nr" - [ -n "$_flags" ]; flags="flags $_flags" - fi - if [ $p = "dev" ]; then - eval _dev=\$"$nr" - [ -n "$_dev" ]; dev="dev $_dev" - fi - if [ $p = "id" ]; then - eval _id=\$"$nr" - [ -n "$_id" ]; id="id $_id" - fi - if [ $p = "port" ]; then - eval _port=\$"$nr" - [ -n "$_port" ]; port="port $_port" - fi - - nr=$((nr + 1)) - done - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port - else - ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port - fi + mptcp_lib_pm_nl_add_endpoint "${@}" } pm_nl_del_endpoint() { - local ns=$1 - local id=$2 - local addr=$3 - - if [ $ip_mptcp -eq 1 ]; then - [ $id -ne 0 ] && addr='' - ip -n $ns mptcp endpoint delete id $id $addr - else - ip netns exec $ns ./pm_nl_ctl del $id $addr - fi + mptcp_lib_pm_nl_del_endpoint "${@}" } pm_nl_flush_endpoint() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint flush - else - ip netns exec $ns ./pm_nl_ctl flush - fi + mptcp_lib_pm_nl_flush_endpoint "${@}" } pm_nl_show_endpoints() { - local ns=$1 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint show - else - ip netns exec $ns ./pm_nl_ctl dump - fi + mptcp_lib_pm_nl_show_endpoints "${@}" } pm_nl_change_endpoint() { - local ns=$1 - local id=$2 - local flags=$3 - - if [ $ip_mptcp -eq 1 ]; then - ip -n $ns mptcp endpoint change id $id ${flags//","/" "} - else - ip netns exec $ns ./pm_nl_ctl set id $id flags $flags - fi + mptcp_lib_pm_nl_change_endpoint "${@}" } pm_nl_check_endpoint() { - local line expected_line local msg="$1" local ns=$2 local addr=$3 - local _flags="" - local flags - local _port - local port - local dev - local _id - local id + local flags dev id port print_check "${msg}" shift 3 while [ -n "$1" ]; do - if [ $1 = "flags" ]; then - _flags=$2 - [ -n "$_flags" ]; flags="flags $_flags" - shift - elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $2" + case "${1}" in + "flags" | "dev" | "id" | "port") + eval "${1}"="${2}" shift - elif [ $1 = "id" ]; then - _id=$2 - [ -n "$_id" ]; id="id $_id" - shift - elif [ $1 = "port" ]; then - _port=$2 - [ -n "$_port" ]; port=" port $_port" - shift - fi + ;; + *) + ;; + esac shift done - if [ -z "$id" ]; then + if [ -z "${id}" ]; then test_fail "bad test - missing endpoint id" return fi - if [ $ip_mptcp -eq 1 ]; then - # get line and trim trailing whitespace - line=$(ip -n $ns mptcp endpoint show $id) - line="${line% }" - # the dump order is: address id flags port dev - [ -n "$addr" ] && expected_line="$addr" - expected_line+=" $id" - [ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$port" ] && expected_line+=" $port" - else - line=$(ip netns exec $ns ./pm_nl_ctl get $_id) - # the dump order is: id flags dev address port - expected_line="$id" - [ -n "$flags" ] && expected_line+=" $flags" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$addr" ] && expected_line+=" $addr" - [ -n "$_port" ] && expected_line+=" $_port" - fi - if [ "$line" = "$expected_line" ]; then - print_ok - else - fail_test "expected '$expected_line' found '$line'" - fi + check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \ + "$(mptcp_lib_pm_nl_format_endpoints \ + "${id},${addr},${flags//","/" "},${dev},${port}")" } pm_nl_set_endpoint() @@ -3212,7 +3103,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then - tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5 + tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow @@ -3702,7 +3593,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37af8..ad2ebda5cb64 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -23,6 +23,7 @@ MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -384,6 +385,12 @@ mptcp_lib_check_tools() { exit ${KSFT_SKIP} fi ;; + "tc") + if ! tc -help &> /dev/null; then + mptcp_lib_pr_skip "Could not run test without tc tool" + exit ${KSFT_SKIP} + fi + ;; "ss") if ! ss -h | grep -q MPTCP; then mptcp_lib_pr_skip "ss tool does not support MPTCP" @@ -505,3 +512,131 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +} + +# format: <id>,<ip>,<flags>,<dev> +mptcp_lib_pm_nl_format_endpoints() { + local entry id ip flags dev port + + for entry in "${@}"; do + IFS=, read -r id ip flags dev port <<< "${entry}" + if mptcp_lib_is_ip_mptcp; then + echo -n "${ip}" + [ -n "${port}" ] && echo -n " port ${port}" + echo -n " id ${id}" + [ -n "${flags}" ] && echo -n " ${flags}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo " " # always a space at the end + else + echo -n "id ${id}" + echo -n " flags ${flags//" "/","}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo -n " ${ip}" + [ -n "${port}" ] && echo -n " ${port}" + echo "" + fi + done +} + +mptcp_lib_pm_nl_get_endpoint() { + local ns=${1} + local id=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show id "${id}" + else + ip netns exec "${ns}" ./pm_nl_ctl get "${id}" + fi +} + +mptcp_lib_pm_nl_set_limits() { + local ns=${1} + local addrs=${2} + local subflows=${3} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}" + else + ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}" + fi +} + +mptcp_lib_pm_nl_add_endpoint() { + local ns=${1} + local addr=${2} + local flags dev id port + local nr=2 + + local p + for p in "${@}"; do + case "${p}" in + "flags" | "dev" | "id" | "port") + eval "${p}"=\$"${nr}" + ;; + esac + + nr=$((nr + 1)) + done + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + else + ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + fi +} + +mptcp_lib_pm_nl_del_endpoint() { + local ns=${1} + local id=${2} + local addr=${3} + + if mptcp_lib_is_ip_mptcp; then + [ "${id}" -ne 0 ] && addr='' + ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"} + else + ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}" + fi +} + +mptcp_lib_pm_nl_flush_endpoint() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint flush + else + ip netns exec "${ns}" ./pm_nl_ctl flush + fi +} + +mptcp_lib_pm_nl_show_endpoints() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show + else + ip netns exec "${ns}" ./pm_nl_ctl dump + fi +} + +mptcp_lib_pm_nl_change_endpoint() { + local ns=${1} + local id=${2} + local flags=${3} + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "} + else + ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}" + fi +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index e2d70c18786e..68899a303a1a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -22,6 +22,28 @@ ns1="" ns2="" ns_sbox="" +usage() { + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" +} + +while getopts "hi" option;do + case "$option" in + "h") + usage "$0" + exit ${KSFT_PASS} + ;; + "i") + mptcp_lib_set_ip_mptcp + ;; + "?") + usage "$0" + exit ${KSFT_FAIL} + ;; + esac +done + add_mark_rules() { local ns=$1 @@ -58,15 +80,15 @@ init() # let $ns2 reach any $ns1 address from any interface ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal - ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal - ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal done - ip netns exec $ns1 ./pm_nl_ctl limits 8 8 - ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + mptcp_lib_pm_nl_set_limits "${ns1}" 8 8 + mptcp_lib_pm_nl_set_limits "${ns2}" 8 8 add_mark_rules $ns1 1 add_mark_rules $ns2 2 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 6ab8c5d36340..2757378b1b13 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,28 +1,28 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Double quotes to prevent globbing and word splitting is recommended in new -# code but we accept it, especially because there were too many before having -# address all other issues detected by shellcheck. -#shellcheck disable=SC2086 - . "$(dirname "${0}")/mptcp_lib.sh" ret=0 usage() { - echo "Usage: $0 [ -h ]" + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" } -optstring=h +optstring=hi while getopts "$optstring" option;do case "$option" in "h") - usage $0 + usage "$0" exit ${KSFT_PASS} ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") - usage $0 + usage "$0" exit ${KSFT_FAIL} ;; esac @@ -35,7 +35,7 @@ err=$(mktemp) #shellcheck disable=SC2317 cleanup() { - rm -f $err + rm -f "${err}" mptcp_lib_ns_exit "${ns1}" } @@ -46,6 +46,76 @@ trap cleanup EXIT mptcp_lib_ns_init ns1 +format_limits() { + local accept="${1}" + local subflows="${2}" + + if mptcp_lib_is_ip_mptcp; then + # with a space at the end + printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}" + else + printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}" + fi +} + +get_limits() { + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp limits + else + ip netns exec "${ns1}" ./pm_nl_ctl limits + fi +} + +format_endpoints() { + mptcp_lib_pm_nl_format_endpoints "${@}" +} + +get_endpoint() { + # shellcheck disable=SC2317 # invoked indirectly + mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" +} + +change_address() { + local addr=${1} + local flags=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}" + else + ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}" + fi +} + +set_limits() +{ + mptcp_lib_pm_nl_set_limits "${ns1}" "${@}" +} + +add_endpoint() +{ + mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}" +} + +del_endpoint() +{ + mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}" +} + +flush_endpoint() +{ + mptcp_lib_pm_nl_flush_endpoint "${ns1}" +} + +show_endpoints() +{ + mptcp_lib_pm_nl_show_endpoints "${ns1}" +} + +change_endpoint() +{ + mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}" +} + check() { local cmd="$1" @@ -67,125 +137,126 @@ check() fi } -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "show_endpoints" "" "defaults addr list" -default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)" +default_limits="$(get_limits)" if mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 2" "defaults limits" + check "get_limits" "$(format_limits 0 2)" "defaults limits" fi -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup -check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 flags subflow dev lo +add_endpoint 10.0.1.3 flags signal,backup +check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 2 flags subflow dev lo 10.0.1.2 -id 3 flags signal,backup 10.0.1.3" "dump addrs" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2,subflow,lo" \ + "3,10.0.1.3,signal backup")" "dump addrs" -ip netns exec $ns1 ./pm_nl_ctl del 2 -check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3" "dump addrs after del" +del_endpoint 2 +check "get_endpoint 2" "" "simple del addr" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup")" "dump addrs after del" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" +add_endpoint 10.0.1.3 2>/dev/null +check "get_endpoint 4" "" "duplicate addr" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" +add_endpoint 10.0.1.4 flags signal +check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" for i in $(seq 5 9); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 + add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done -check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" -check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" +check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" -ip netns exec $ns1 ./pm_nl_ctl del 9 +del_endpoint 9 for i in $(seq 10 255); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i - ip netns exec $ns1 ./pm_nl_ctl del $i + add_endpoint 10.0.0.9 id "${i}" + del_endpoint "${i}" done -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3 -id 4 flags signal 10.0.1.4 -id 5 flags signal 10.0.1.5 -id 6 flags signal 10.0.1.6 -id 7 flags signal 10.0.1.7 -id 8 flags signal 10.0.1.8" "id limit" - -ip netns exec $ns1 ./pm_nl_ctl flush -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" - -ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit" - -ip netns exec $ns1 ./pm_nl_ctl limits 8 8 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 -subflows 8" "set limits" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.1.2 -id 3 flags 10.0.1.7 -id 4 flags 10.0.1.8 -id 100 flags 10.0.1.3 -id 101 flags 10.0.1.4 -id 254 flags 10.0.1.5 -id 255 flags 10.0.1.6" "set ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.0.1 -id 2 flags 10.0.0.4 -id 3 flags 10.0.0.6 -id 4 flags 10.0.0.7 -id 5 flags 10.0.0.8 -id 253 flags 10.0.0.5 -id 254 flags 10.0.0.2 -id 255 flags 10.0.0.3" "wrap-around ids" - -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup 10.0.1.1" "set flags (backup)" -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nobackup)" +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup" \ + "4,10.0.1.4,signal" \ + "5,10.0.1.5,signal" \ + "6,10.0.1.6,signal" \ + "7,10.0.1.7,signal" \ + "8,10.0.1.8,signal")" "id limit" + +flush_endpoint +check "show_endpoints" "" "flush addrs" + +set_limits 9 1 2>/dev/null +check "get_limits" "${default_limits}" "rcv addrs above hard limit" + +set_limits 1 9 2>/dev/null +check "get_limits" "${default_limits}" "subflows above hard limit" + +set_limits 8 8 +check "get_limits" "$(format_limits 8 8)" "set limits" + +flush_endpoint +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 +add_endpoint 10.0.1.3 id 100 +add_endpoint 10.0.1.4 +add_endpoint 10.0.1.5 id 254 +add_endpoint 10.0.1.6 +add_endpoint 10.0.1.7 +add_endpoint 10.0.1.8 +check "show_endpoints" \ + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2" \ + "3,10.0.1.7" \ + "4,10.0.1.8" \ + "100,10.0.1.3" \ + "101,10.0.1.4" \ + "254,10.0.1.5" \ + "255,10.0.1.6")" "set ids" + +flush_endpoint +add_endpoint 10.0.0.1 +add_endpoint 10.0.0.2 id 254 +add_endpoint 10.0.0.3 +add_endpoint 10.0.0.4 +add_endpoint 10.0.0.5 id 253 +add_endpoint 10.0.0.6 +add_endpoint 10.0.0.7 +add_endpoint 10.0.0.8 +check "show_endpoints" \ + "$(format_endpoints "1,10.0.0.1" \ + "2,10.0.0.4" \ + "3,10.0.0.6" \ + "4,10.0.0.7" \ + "5,10.0.0.8" \ + "253,10.0.0.5" \ + "254,10.0.0.2" \ + "255,10.0.0.3")" "wrap-around ids" + +flush_endpoint +add_endpoint 10.0.1.1 flags subflow +change_address 10.0.1.1 backup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ + "set flags (backup)" +change_address 10.0.1.1 nobackup +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nobackup)" # fullmesh support has been added later -ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null -if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || +change_endpoint 1 fullmesh 2>/dev/null +if show_endpoints | grep -q "fullmesh" || mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,fullmesh 10.0.1.1" " (fullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nofullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup,fullmesh 10.0.1.1" " (backup,fullmesh)" + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ + " (fullmesh)" + change_endpoint 1 nofullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nofullmesh)" + change_endpoint 1 backup,fullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ + " (backup,fullmesh)" else for st in fullmesh nofullmesh backup,fullmesh; do st=" (${st})" diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7426a2cbd4a0..7ad5a59adff2 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -1276,7 +1276,7 @@ int add_listener(int argc, char *argv[]) struct sockaddr_storage addr; struct sockaddr_in6 *a6; struct sockaddr_in *a4; - u_int16_t family; + u_int16_t family = AF_UNSPEC; int enable = 1; int sock; int err; diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 1b2366220388..4b14b4412166 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -27,10 +27,11 @@ capout="" size=0 usage() { - echo "Usage: $0 [ -b ] [ -c ] [ -d ]" + echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]" echo -e "\t-b: bail out after first error, otherwise runs al testcases" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" echo -e "\t-d: debug this script" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" } # This function is used in the cleanup trap @@ -45,7 +46,7 @@ cleanup() } mptcp_lib_check_mptcp -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc # "$ns1" ns2 ns3 # ns1eth1 ns2eth1 ns2eth3 ns3eth1 @@ -85,8 +86,8 @@ setup() ip -net "$ns1" route add default via 10.0.2.2 metric 101 ip -net "$ns1" route add default via dead:beef:2::2 metric 101 - ip netns exec "$ns1" ./pm_nl_ctl limits 1 1 - ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow + mptcp_lib_pm_nl_set_limits "${ns1}" 1 1 + mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad @@ -108,7 +109,7 @@ setup() ip -net "$ns3" route add default via 10.0.3.2 ip -net "$ns3" route add default via dead:beef:3::2 - ip netns exec "$ns3" ./pm_nl_ctl limits 1 1 + mptcp_lib_pm_nl_set_limits "${ns3}" 1 1 # debug build can slow down measurably the test program # we use quite tight time limit on the run-time, to ensure @@ -216,8 +217,8 @@ run_test() shift 4 local msg=$* - [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1="" - [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2="" + [ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1="" + [ $delay2 -gt 0 ] && delay2="delay ${delay2}ms" || delay2="" for dev in ns1eth1 ns1eth2; do tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1 @@ -259,7 +260,7 @@ run_test() fi } -while getopts "bcdh" option;do +while getopts "bcdhi" option;do case "$option" in "h") usage $0 @@ -274,6 +275,9 @@ while getopts "bcdh" option;do "d") set -x ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") usage $0 exit ${KSFT_FAIL} diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.bpf.c index ac54c36b25fc..ac54c36b25fc 100644 --- a/tools/testing/selftests/net/nat6to4.c +++ b/tools/testing/selftests/net/nat6to4.bpf.c diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore index c2229b3e40d4..0a64d6d0e29a 100644 --- a/tools/testing/selftests/netfilter/.gitignore +++ b/tools/testing/selftests/net/netfilter/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -nf-queue -connect_close audit_logread +connect_close conntrack_dump_flush sctp_collision +nf_queue diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile new file mode 100644 index 000000000000..47945b2b3f92 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 + +top_srcdir = ../../../../.. + +HOSTPKG_CONFIG := pkg-config +MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) +MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) + +TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += conntrack_icmp_related.sh +TEST_PROGS += conntrack_ipip_mtu.sh +TEST_PROGS += conntrack_tcp_unreplied.sh +TEST_PROGS += conntrack_sctp_collision.sh +TEST_PROGS += conntrack_vrf.sh +TEST_PROGS += ipvs.sh +TEST_PROGS += nf_conntrack_packetdrill.sh +TEST_PROGS += nf_nat_edemux.sh +TEST_PROGS += nft_audit.sh +TEST_PROGS += nft_concat_range.sh +TEST_PROGS += nft_conntrack_helper.sh +TEST_PROGS += nft_fib.sh +TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_meta.sh +TEST_PROGS += nft_nat.sh +TEST_PROGS += nft_nat_zones.sh +TEST_PROGS += nft_queue.sh +TEST_PROGS += nft_synproxy.sh +TEST_PROGS += nft_zones_many.sh +TEST_PROGS += rpath.sh +TEST_PROGS += xt_string.sh + +TEST_PROGS_EXTENDED = nft_concat_range_perf.sh + +TEST_GEN_PROGS = conntrack_dump_flush + +TEST_GEN_FILES = audit_logread +TEST_GEN_FILES += connect_close nf_queue +TEST_GEN_FILES += sctp_collision + +include ../../lib.mk + +$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS) + +$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) + +TEST_FILES := lib.sh +TEST_FILES += packetdrill + +TEST_INCLUDES := \ + ../lib.sh diff --git a/tools/testing/selftests/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c index a0a880fc2d9d..a0a880fc2d9d 100644 --- a/tools/testing/selftests/netfilter/audit_logread.c +++ b/tools/testing/selftests/net/netfilter/audit_logread.c diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh new file mode 100755 index 000000000000..c28379a965d8 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test for legacy br_netfilter module combined with connection tracking, +# a combination that doesn't really work. +# Multicast/broadcast packets race for hash table insertion. + +# eth0 br0 eth0 +# setup is: ns1 <->,ns0 <-> ns3 +# ns2 <-' `'-> ns4 + +source lib.sh + +checktool "nft --version" "run test without nft tool" + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns ns0 ns1 ns2 ns3 ns4 + +ret=0 + +do_ping() +{ + fromns="$1" + dstip="$2" + + if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then + echo "ERROR: ping from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + fi +} + +bcast_ping() +{ + fromns="$1" + dstip="$2" + + local packets=500 + + [ "$KSFT_MACHINE_SLOW" = yes ] && packets=100 + + for i in $(seq 1 $packets); do + if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then + echo "ERROR: ping -b from $fromns to $dstip" + ip netns exec "$ns0" nft list ruleset + ret=1 + break + fi + done +} + +ip netns exec "$ns0" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q net.ipv4.conf.default.rp_filter=0 + +if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi + +ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2" +ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3" +ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4" + +for i in $(seq 1 4); do + ip -net "$ns0" link set "veth$i" up +done + +if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +# make veth0,1,2 part of bridge. +for i in $(seq 1 3); do + ip -net "$ns0" link set "veth$i" master br0 +done + +# add a macvlan on top of the bridge. +MACVLAN_ADDR=ba:f3:13:37:42:23 +ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private +ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan0 up +ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0 + +# add a macvlan on top of veth4. +MACVLAN_ADDR=ba:f3:13:37:42:24 +ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru +ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan4 up + +# make the macvlan part of the bridge. +# veth4 is not a bridge port, only the macvlan on top of it. +ip -net "$ns0" link set macvlan4 master br0 + +ip -net "$ns0" link set br0 up +ip -net "$ns0" addr add 10.0.0.1/24 dev br0 + +modprobe -q br_netfilter +if ! ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then + echo "SKIP: bridge netfilter not available" + ret=$ksft_skip +fi + +# for testing, so namespaces will reply to ping -b probes. +ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 + +# enable conntrack in ns0 and drop broadcast packets in forward to +# avoid them from getting confirmed in the postrouting hook before +# the cloned skb is passed up the stack. +ip netns exec "$ns0" nft -f - <<EOF +table ip filter { + chain input { + type filter hook input priority 1; policy accept + iifname br0 counter + ct state new accept + } +} + +table bridge filter { + chain forward { + type filter hook forward priority 0; policy accept + meta pkttype broadcast ip protocol icmp counter drop + } +} +EOF +if [ "$?" -ne 0 ];then + echo "SKIP: could not add nftables ruleset" + exit $ksft_skip +fi + +# place 1, 2 & 3 in same subnet, connected via ns0:br0. +# ns4 is placed in same subnet as well, but its not +# part of the bridge: the corresponding veth4 is not +# part of the bridge, only its macvlan interface. +for i in $(seq 1 4); do + eval ip -net \$ns"$i" link set eth0 up +done +for i in $(seq 1 2); do + eval ip -net \$ns"$i" addr add "10.0.0.1$i/24" dev eth0 +done + +ip -net "$ns3" addr add 10.23.0.13/24 dev eth0 +ip -net "$ns4" addr add 10.23.0.14/24 dev eth0 + +# test basic connectivity +do_ping "$ns1" 10.0.0.12 +do_ping "$ns3" 10.23.0.1 +do_ping "$ns4" 10.23.0.1 + +bcast_ping "$ns1" 10.0.0.255 + +# This should deliver broadcast to macvlan0, which is on top of ns0:br0. +bcast_ping "$ns3" 10.23.0.255 + +# same, this time via veth4:macvlan4. +bcast_ping "$ns4" 10.23.0.255 + +read t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo PASS: kernel not tainted +else + echo ERROR: kernel is tainted + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/bridge_brouter.sh b/tools/testing/selftests/net/netfilter/bridge_brouter.sh new file mode 100755 index 000000000000..2549b6590693 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/bridge_brouter.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# +# This test is for bridge 'brouting', i.e. make some packets being routed +# rather than getting bridged even though they arrive on interface that is +# part of a bridge. + +# eth0 br0 eth0 +# setup is: ns1 <-> nsbr <-> ns2 + +source lib.sh + +if ! ebtables -V > /dev/null 2>&1;then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +cleanup() { + cleanup_all_ns +} + +trap cleanup EXIT + +setup_ns nsbr ns1 ns2 + +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.all.rp_filter=0 +if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi +ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2" + +if ! ip -net "$nsbr" link add br0 type bridge; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +ip -net "$nsbr" link set veth0 up +ip -net "$nsbr" link set veth1 up + +ip -net "$nsbr" link set veth0 master br0 +ip -net "$nsbr" link set veth1 master br0 +ip -net "$nsbr" link set br0 up +ip -net "$nsbr" addr add 10.0.0.1/24 dev br0 + +# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0 +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns1" addr add 10.0.0.11/24 dev eth0 +ip -net "$ns2" addr add 10.0.0.12/24 dev eth0 + +test_ebtables_broute() +{ + # redirect is needed so the dstmac is rewritten to the bridge itself, + # ip stack won't process OTHERHOST (foreign unicast mac) packets. + if ! ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then + echo "SKIP: Could not add ebtables broute redirect rule" + return $ksft_skip + fi + + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0 + + # ping net${ns1}, expected to not work (ip forwarding is off) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed" 1>&2 + return 1 + fi + + # enable forwarding on both interfaces. + # neither needs an ip address, but at least the bridge needs + # an ip address in same network segment as ${ns1} and ${ns2} (${nsbr} + # needs to be able to determine route for to-be-forwarded packet). + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1 + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1 + + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule" + ip netns exec "$nsbr" ebtables -t broute -F + + # ping net${ns1}, expected to work (frames are bridged) + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then + echo "ERROR: ping did not work, but it should (bridged)" 1>&2 + return 1 + fi + + ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP + + # ping net${ns1}, expected to not work (DROP in bridge forward) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then + echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 + return 1 + fi + + # re-activate brouter + ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + + if ! ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then + echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 + return 1 + fi + + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop" + return 0 +} + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then + echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then + echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2 + exit 1 +fi + +test_ebtables_broute +exit $? diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config new file mode 100644 index 000000000000..63ef80ef47a4 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/config @@ -0,0 +1,89 @@ +CONFIG_AUDIT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BRIDGE=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_NETFILTER=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_CGROUP_BPF=y +CONFIG_DUMMY=m +CONFIG_INET_ESP=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP_NF_RAW=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP_SCTP=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_RR=m +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_MACVLAN=m +CONFIG_NAMESPACES=y +CONFIG_NET_CLS_U32=m +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NS=y +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_IPIP=m +CONFIG_NET_VRF=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_CT=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_NAT=m +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_SYNPROXY=m +CONFIG_VETH=m +CONFIG_VLAN_8021Q=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_STATISTICS=y +CONFIG_NET_PKTGEN=m +CONFIG_TUN=m diff --git a/tools/testing/selftests/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c index 1c3b0add54c4..1c3b0add54c4 100644 --- a/tools/testing/selftests/netfilter/connect_close.c +++ b/tools/testing/selftests/net/netfilter/connect_close.c diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c index b11ea8ee6719..bd9317bf5ada 100644 --- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -10,7 +10,7 @@ #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#include "../kselftest_harness.h" +#include "../../kselftest_harness.h" #define TEST_ZONE_ID 123 #define NF_CT_DEFAULT_ZONE_ID 0 @@ -313,13 +313,11 @@ FIXTURE_SETUP(conntrack_dump_flush) self->sock = mnl_socket_open(NETLINK_NETFILTER); if (!self->sock) { perror("mnl_socket_open"); - exit(EXIT_FAILURE); + SKIP(return, "cannot open netlink_netfilter socket"); } - if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } + ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID); + EXPECT_EQ(ret, 0); ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); if (ret < 0 && errno == EPERM) diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh index 76645aaf2b58..c63d840ead61 100755 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh @@ -14,35 +14,32 @@ # check the icmp errors are propagated to the correct host as per # nat of "established" icmp-echo "connection". -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 +source lib.sh -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1;then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - cleanup() { - for i in 1 2;do ip netns del nsclient$i;done - for i in 1 2;do ip netns del nsrouter$i;done + cleanup_all_ns } trap cleanup EXIT -ipv4() { - echo -n 192.168.$1.2 -} +setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2 + +ret=0 + +add_addr() +{ + ns=$1 + dev=$2 + i=$3 -ipv6 () { - echo -n dead:$1::2 + ip -net "$ns" link set "$dev" up + ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev" + ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad } check_counter() @@ -52,10 +49,9 @@ check_counter() expect=$3 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns nft list counter inet filter "$name" 1>&2 + ip netns exec "$ns" nft list counter inet filter "$name" 1>&2 lret=1 fi @@ -65,9 +61,8 @@ check_counter() check_unknown() { expect="packets 0 bytes 0" - for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - check_counter $n "unknown" "$expect" - if [ $? -ne 0 ] ;then + for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do + if ! check_counter "$n" "unknown" "$expect"; then return 1 fi done @@ -75,61 +70,48 @@ check_unknown() return 0 } -for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - ip netns add $n - ip -net $n link set lo up -done - -DEV=veth0 -ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 DEV=veth0 -ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 +ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1" +ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2" +ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2" -DEV=veth0 -ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 +add_addr "$nsclient1" $DEV 1 +add_addr "$nsclient2" $DEV 2 -DEV=veth0 -for i in 1 2; do - ip -net nsclient$i link set $DEV up - ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV - ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV -done - -ip -net nsrouter1 link set eth1 up -ip -net nsrouter1 link set veth0 up +ip -net "$nsrouter1" link set eth1 up +ip -net "$nsrouter1" link set $DEV up -ip -net nsrouter2 link set eth1 up -ip -net nsrouter2 link set eth2 up +ip -net "$nsrouter2" link set eth1 mtu 1280 up +ip -net "$nsrouter2" link set eth2 up -ip -net nsclient1 route add default via 192.168.1.1 -ip -net nsclient1 -6 route add default via dead:1::1 +ip -net "$nsclient1" route add default via 192.168.1.1 +ip -net "$nsclient1" -6 route add default via dead:1::1 -ip -net nsclient2 route add default via 192.168.2.1 -ip -net nsclient2 route add default via dead:2::1 +ip -net "$nsclient2" route add default via 192.168.2.1 +ip -net "$nsclient2" route add default via dead:2::1 +ip -net "$nsclient2" link set veth0 mtu 1280 -i=3 -ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 -ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 -ip -net nsrouter1 addr add dead:1::1/64 dev eth1 -ip -net nsrouter1 addr add dead:3::1/64 dev veth0 -ip -net nsrouter1 route add default via 192.168.3.10 -ip -net nsrouter1 -6 route add default via dead:3::10 +ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1 +ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0 +ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad +ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad +ip -net "$nsrouter1" route add default via 192.168.3.10 +ip -net "$nsrouter1" -6 route add default via dead:3::10 -ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 -ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 -ip -net nsrouter2 addr add dead:2::1/64 dev eth1 -ip -net nsrouter2 addr add dead:3::10/64 dev eth2 -ip -net nsrouter2 route add default via 192.168.3.1 -ip -net nsrouter2 route add default via dead:3::1 +ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1 +ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2 +ip -net "$nsrouter2" addr add dead:2::1/64 dev eth1 nodad +ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad +ip -net "$nsrouter2" route add default via 192.168.3.1 +ip -net "$nsrouter2" route add default via dead:3::1 -sleep 2 for i in 4 6; do - ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 - ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1 done -for netns in nsrouter1 nsrouter2; do -ip netns exec $netns nft -f - <<EOF +for netns in "$nsrouter1" "$nsrouter2"; do +ip netns exec "$netns" nft -f - <<EOF table inet filter { counter unknown { } counter related { } @@ -144,7 +126,7 @@ table inet filter { EOF done -ip netns exec nsclient1 nft -f - <<EOF +ip netns exec "$nsclient1" nft -f - <<EOF table inet filter { counter unknown { } counter related { } @@ -164,7 +146,7 @@ table inet filter { } EOF -ip netns exec nsclient2 nft -f - <<EOF +ip netns exec "$nsclient2" nft -f - <<EOF table inet filter { counter unknown { } counter new { } @@ -189,11 +171,10 @@ table inet filter { } EOF - # make sure NAT core rewrites adress of icmp error if nat is used according to # conntrack nat information (icmp error will be directed at nsrouter1 address, # but it needs to be routed to nsclient1 address). -ip netns exec nsrouter1 nft -f - <<EOF +ip netns exec "$nsrouter1" nft -f - <<EOF table ip nat { chain postrouting { type nat hook postrouting priority 0; policy accept; @@ -208,44 +189,32 @@ table ip6 nat { } EOF -ip netns exec nsrouter2 ip link set eth1 mtu 1280 -ip netns exec nsclient2 ip link set veth0 mtu 1280 -sleep 1 - -ip netns exec nsclient1 ping -c 1 -s 1000 -q -M do 192.168.2.2 >/dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q -M "do" 192.168.2.2 >/dev/null; then echo "ERROR: netns ip routing/connectivity broken" 1>&2 - cleanup exit 1 fi -ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 - cleanup exit 1 fi -check_unknown -if [ $? -ne 0 ]; then +if ! check_unknown; then ret=1 fi expect="packets 0 bytes 0" -for netns in nsrouter1 nsrouter2 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done expect="packets 2 bytes 2076" -check_counter nsclient2 "new" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsclient2" "new" "$expect"; then ret=1 fi -ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then echo "ERROR: ping should have failed with PMTU too big error" 1>&2 ret=1 fi @@ -253,30 +222,26 @@ fi # nsrouter2 should have generated the icmp error, so # related counter should be 0 (its in forward). expect="packets 0 bytes 0" -check_counter "nsrouter2" "related" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsrouter2" "related" "$expect"; then ret=1 fi # but nsrouter1 should have seen it, same for nsclient1. expect="packets 1 bytes 576" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in ${nsrouter1} ${nsclient1};do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done -ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 ret=1 fi expect="packets 2 bytes 1856" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "${nsrouter1}" "${nsclient1}";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done @@ -288,21 +253,19 @@ else fi # add 'bad' route, expect icmp REDIRECT to be generated -ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 -ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 +ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1 +ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1 -ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null expect="packets 1 bytes 112" -check_counter nsclient1 "redir4" "$expect" -if [ $? -ne 0 ];then +if ! check_counter "$nsclient1" "redir4" "$expect"; then ret=1 fi -ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null expect="packets 1 bytes 192" -check_counter nsclient1 "redir6" "$expect" -if [ $? -ne 0 ];then +if ! check_counter "$nsclient1" "redir6" "$expect"; then ret=1 fi diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh index eb9553e4986b..9832a5d0198a 100755 --- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -1,8 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Conntrack needs to reassemble fragments in order to have complete # packets for rule matching. Reassembly can lead to packet loss. @@ -23,56 +22,44 @@ ksft_skip=4 # between Client A and Client B over WAN. Wanrouter has MTU 1400 set # on its interfaces. -rnd=$(mktemp -u XXXXXXXX) rx=$(mktemp) -r_a="ns-ra-$rnd" -r_b="ns-rb-$rnd" -r_w="ns-rw-$rnd" -c_a="ns-ca-$rnd" -c_b="ns-cb-$rnd" - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "iptables --version" "run test without iptables" -checktool "ip -Version" "run test without ip tool" -checktool "which socat" "run test without socat" -checktool "ip netns add ${r_a}" "create net namespace" +checktool "socat -h" "run test without socat" -for n in ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns add ${n} -done +setup_ns r_a r_b r_w c_a c_b cleanup() { - for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns del ${n} - done - rm -f ${rx} + cleanup_all_ns + rm -f "$rx" } trap cleanup EXIT +listener_ready() +{ + ns="$1" + port="$2" + ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port" +} + test_path() { msg="$1" - ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & + ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000 - sleep 1 for i in 1 2 3; do head -c1400 /dev/zero | tr "\000" "a" | \ - ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 + ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000 done wait - bytes=$(wc -c < ${rx}) + bytes=$(wc -c < "$rx") - if [ $bytes -eq 1400 ];then + if [ "$bytes" -eq 1400 ];then echo "OK: PMTU $msg connection tracking" else echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" @@ -91,24 +78,24 @@ test_path() { # 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) # No iptables rules at all. -ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} -ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} +ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w" +ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a" l_addr="10.2.2.1" r_addr="10.4.4.1" -ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip for dev in lo veth0 veth1 ipip0; do - ip -net ${r_a} link set $dev up + ip -net "$r_a" link set "$dev" up done -ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 -ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 +ip -net "$r_a" addr add 10.2.2.1/24 dev veth0 +ip -net "$r_a" addr add 192.168.10.1/24 dev veth1 -ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 -ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 +ip -net "$r_a" route add 192.168.20.0/24 dev ipip0 +ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254 -ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Detailed setup for Router B # --------------------------- @@ -121,49 +108,46 @@ ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) # No iptables rules at all. -ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} -ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} +ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w" +ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b" l_addr="10.4.4.1" r_addr="10.2.2.1" -ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_b} link set $dev up +for dev in veth0 veth1 ipip0; do + ip -net "$r_b" link set $dev up done -ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 -ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 +ip -net "$r_b" addr add 10.4.4.1/24 dev veth0 +ip -net "$r_b" addr add 192.168.20.1/24 dev veth1 -ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 -ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 -ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip -net "$r_b" route add 192.168.10.0/24 dev ipip0 +ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Client A -ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev lo up -ip -net ${c_a} link set dev veth0 up -ip -net ${c_a} route add default via 192.168.10.1 +ip -net "$c_a" addr add 192.168.10.2/24 dev veth0 +ip -net "$c_a" link set dev veth0 up +ip -net "$c_a" route add default via 192.168.10.1 # Client A -ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 -ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} link set dev lo up -ip -net ${c_b} route add default via 192.168.20.1 +ip -net "$c_b" addr add 192.168.20.2/24 dev veth0 +ip -net "$c_b" link set dev veth0 up +ip -net "$c_b" route add default via 192.168.20.1 # Wan -ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 -ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 +ip -net "$r_w" addr add 10.2.2.254/24 dev veth0 +ip -net "$r_w" addr add 10.4.4.254/24 dev veth1 -ip -net ${r_w} link set dev lo up -ip -net ${r_w} link set dev veth0 up mtu 1400 -ip -net ${r_w} link set dev veth1 up mtu 1400 +ip -net "$r_w" link set dev veth0 up mtu 1400 +ip -net "$r_w" link set dev veth1 up mtu 1400 -ip -net ${r_a} link set dev veth0 mtu 1400 -ip -net ${r_b} link set dev veth0 mtu 1400 +ip -net "$r_a" link set dev veth0 mtu 1400 +ip -net "$r_b" link set dev veth0 mtu 1400 -ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Path MTU discovery # ------------------ @@ -203,5 +187,5 @@ test_path "without" #packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is #dropped on Router A before sending. -ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW +ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW test_path "with" diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh new file mode 100755 index 000000000000..d860f7d9744b --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing For SCTP COLLISION SCENARIO as Below: +# +# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] +# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] +# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] +# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] +# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] +# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] +# +# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS + +source lib.sh + +CLIENT_IP="198.51.200.1" +CLIENT_PORT=1234 + +SERVER_IP="198.51.100.1" +SERVER_PORT=1234 + +CLIENT_GW="198.51.200.2" +SERVER_GW="198.51.100.2" + +# setup the topo +setup() { + setup_ns CLIENT_NS SERVER_NS ROUTER_NS + ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS" + ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS" + + ip -n "$SERVER_NS" link set link0 up + ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0 + ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW + + ip -n "$ROUTER_NS" link set link1 up + ip -n "$ROUTER_NS" link set link2 up + ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1 + ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2 + ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1 + + ip -n "$CLIENT_NS" link set link3 up + ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3 + ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW + + # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with + # tc on $SERVER_NS side + tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64 + tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit + tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ + 0xff match u8 2 0xff at 32 flowid 1:1 + if ! tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then + echo "SKIP: Cannot add netem qdisc" + exit $ksft_skip + fi + + # simulate the ctstate check on OVS nf_conntrack + ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP + ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP + + # use a smaller number for assoc's max_retrans to reproduce the issue + modprobe -q sctp + ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 +} + +cleanup() { + ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1 + ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1 + cleanup_all_ns +} + +do_test() { + ip net exec "$SERVER_NS" ./sctp_collision server \ + $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & + ip net exec "$CLIENT_NS" ./sctp_collision client \ + $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT +} + +# NOTE: one way to work around the issue is set a smaller hb_interval +# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 + +# run the test case +trap cleanup EXIT +setup && \ +echo "Test for SCTP Collision in nf_conntrack:" && \ +do_test && echo "PASS!" +exit $? diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh new file mode 100755 index 000000000000..121ea93c0178 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that UNREPLIED tcp conntrack will eventually timeout. +# + +source lib.sh + +if ! nft --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +if ! conntrack --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +ret=0 + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +ipv4() { + echo -n 192.168."$1".2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then + echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 + ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +trap cleanup EXIT + +# Create test namespaces +setup_ns ns1 ns2 + +# Connect the namespace to the host using a veth pair +ip -net "$ns1" link add name veth1 type veth peer name veth2 +ip -net "$ns1" link set netns "$ns2" dev veth2 + +ip -net "$ns1" link set up dev lo +ip -net "$ns2" link set up dev lo +ip -net "$ns1" link set up dev veth1 +ip -net "$ns2" link set up dev veth2 + +ip -net "$ns2" addr add 10.11.11.2/24 dev veth2 +ip -net "$ns2" route add default via 10.11.11.1 + +ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1 + +# add a rule inside NS so we enable conntrack +ip netns exec "$ns1" nft -f - <<EOF +table inet filter { + chain input { + type filter hook input priority 0; policy accept; + ct state established accept + } +} +EOF + +ip -net "$ns1" addr add 10.11.11.1/24 dev veth1 +ip -net "$ns1" route add 10.99.99.99 via 10.11.11.2 + +# Check connectivity works +ip netns exec "$ns1" ping -q -c 2 10.11.11.2 >/dev/null || exit 1 + +ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT & + +ip netns exec "$ns2" nft -f - <<EOF +table inet filter { + counter connreq { } + counter redir { } + chain input { + type filter hook input priority 0; policy accept; + ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept + ct state new ct status dnat tcp dport 8080 counter name "redir" accept + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nft rules" + exit 1 +fi + +ip netns exec "$ns2" sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10 + +echo "INFO: connect $ns1 -> $ns2 to the virtual ip" +ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do + socat -u STDIN TCP:10.99.99.99:80 < /dev/null + sleep 0.1 + done' & + +wait_for_attempt() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} + +# wait for conntrack to pick the new connection request up before loading +# the nat redirect rule. +if ! busywait "$BUSYWAIT_TIMEOUT" wait_for_attempt; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + +ip netns exec "$ns2" nft -f - <<EOF +table inet nat { + chain prerouting { + type nat hook prerouting priority 0; policy accept; + ip daddr 10.99.99.99 tcp dport 80 redirect to :8080 + } +} +EOF +if [ $? -ne 0 ]; then + echo "ERROR: Could not load nat redirect" + exit 1 +fi + +wait_for_redirect() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} +echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect" + +busywait "$BUSYWAIT_TIMEOUT" wait_for_redirect +ret=$? + +expect="packets 1 bytes 60" +if ! check_counter "$ns2" "redir" "$expect"; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: redirection counter has expected values" +else + echo "ERROR: no tcp connection was redirected" +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index 8b5ea9234588..073e8e62d350 100755 --- a/tools/testing/selftests/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # This script demonstrates interaction of conntrack and vrf. # The vrf driver calls the netfilter hooks again, with oif/iif @@ -28,84 +28,67 @@ # that was supposed to be fixed by the commit mentioned above to make sure # that any fix to test case 1 won't break masquerade again. -ksft_skip=4 +source lib.sh IP0=172.30.30.1 IP1=172.30.30.2 PFXL=30 ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" - cleanup() { ip netns pids $ns0 | xargs kill 2>/dev/null ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns del $ns0 $ns1 + cleanup_all_ns } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi -ip netns add "$ns1" +checktool "nft --version" "run test without nft" +checktool "conntrack --version" "run test without conntrack" +checktool "socat -h" "run test without socat" trap cleanup EXIT -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 +setup_ns ns0 ns1 + +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: Could not add veth device" exit $ksft_skip fi -ip -net $ns0 li add tvrf type vrf table 9876 -if [ $? -ne 0 ];then +if ! ip -net "$ns0" li add tvrf type vrf table 9876; then echo "SKIP: Could not add vrf device" exit $ksft_skip fi -ip -net $ns0 li set lo up +ip -net "$ns0" li set veth0 master tvrf +ip -net "$ns0" li set tvrf up +ip -net "$ns0" li set veth0 up +ip -net "$ns1" li set veth0 up -ip -net $ns0 li set veth0 master tvrf -ip -net $ns0 li set tvrf up -ip -net $ns0 li set veth0 up -ip -net $ns1 li set veth0 up +ip -net "$ns0" addr add $IP0/$PFXL dev veth0 +ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip -net $ns0 addr add $IP0/$PFXL dev veth0 -ip -net $ns1 addr add $IP1/$PFXL dev veth0 +listener_ready() +{ + local ns="$1" -ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& -if [ $? -ne 0 ];then - echo "SKIP: Could not start iperf3" - exit $ksft_skip -fi + ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555" +} + +ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null & +busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" # test vrf ingress handling. # The incoming connection should be placed in conntrack zone 1, # as decided by the first iteration of the ruleset. test_ct_zone_in() { -ip netns exec $ns0 nft -f - <<EOF +ip netns exec "$ns0" nft -f - <<EOF table testct { chain rawpre { type filter hook prerouting priority raw; @@ -126,21 +109,21 @@ table testct { } } EOF - ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null + ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null # should be in zone 1, not zone 2 - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "PASS: entry found in conntrack zone 1" else echo "FAIL: entry not found in conntrack zone 1" - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "FAIL: entry found in zone 2 instead" else echo "FAIL: entry not in zone 1 or 2, dumping table" - ip netns exec $ns0 conntrack -L - ip netns exec $ns0 nft list ruleset + ip netns exec "$ns0" conntrack -L + ip netns exec "$ns0" nft list ruleset fi fi } @@ -153,12 +136,12 @@ test_masquerade_vrf() local qdisc=$1 if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc add dev tvrf root $qdisc + tc -net "$ns0" qdisc add dev tvrf root "$qdisc" fi - ip netns exec $ns0 conntrack -F 2>/dev/null + ip netns exec "$ns0" conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - <<EOF +ip netns exec "$ns0" nft -f - <<EOF flush ruleset table ip nat { chain rawout { @@ -179,25 +162,23 @@ table ip nat { } } EOF - ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" + if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on vrf device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && - ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' && + ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then + echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" else echo "FAIL: vrf rules have unexpected counter value" ret=1 fi if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc del dev tvrf root + tc -net "$ns0" qdisc del dev tvrf root fi } @@ -206,8 +187,8 @@ EOF # oifname is the lower device (veth0 in this case). test_masquerade_veth() { - ip netns exec $ns0 conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - <<EOF + ip netns exec "$ns0" conntrack -F 2>/dev/null +ip netns exec "$ns0" nft -f - <<EOF flush ruleset table ip nat { chain postrouting { @@ -216,17 +197,15 @@ table ip nat { } } EOF - ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" + if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on veth device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then + echo "PASS: connect with masquerade + sport rewrite on veth device" else echo "FAIL: vrf masq rule has unexpected counter value" ret=1 diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh new file mode 100755 index 000000000000..4ceee9fb3949 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# End-to-end ipvs test suite +# Topology: +#--------------------------------------------------------------+ +# | | +# ns0 | ns1 | +# ----------- | ----------- ----------- | +# | veth01 | --------- | veth10 | | veth12 | | +# ----------- peer ----------- ----------- | +# | | | | +# ----------- | | | +# | br0 | |----------------- peer |--------------| +# ----------- | | | +# | | | | +# ---------- peer ---------- ----------- | +# | veth02 | --------- | veth20 | | veth21 | | +# ---------- | ---------- ----------- | +# | ns2 | +# | | +#--------------------------------------------------------------+ +# +# We assume that all network driver are loaded +# + +source lib.sh + +ret=0 +GREEN='\033[0;92m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +readonly port=8080 + +readonly vip_v4=207.175.44.110 +readonly cip_v4=10.0.0.2 +readonly gip_v4=10.0.0.1 +readonly dip_v4=172.16.0.1 +readonly rip_v4=172.16.0.2 +readonly sip_v4=10.0.0.3 + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 + +sysipvsnet="/proc/sys/net/ipv4/vs/" +if [ ! -d $sysipvsnet ]; then + if ! modprobe -q ip_vs; then + echo "skip: could not run test without ipvs module" + exit $ksft_skip + fi +fi + +checktool "ipvsadm -v" "run test without ipvsadm" +checktool "socat -h" "run test without socat" + +setup() { + setup_ns ns0 ns1 ns2 + + ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}" + ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}" + ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}" + + ip netns exec "${ns0}" ip link set veth01 up + ip netns exec "${ns0}" ip link set veth02 up + ip netns exec "${ns0}" ip link add br0 type bridge + ip netns exec "${ns0}" ip link set veth01 master br0 + ip netns exec "${ns0}" ip link set veth02 master br0 + ip netns exec "${ns0}" ip link set br0 up + ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0 + + ip netns exec "${ns1}" ip link set veth10 up + ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10 + ip netns exec "${ns1}" ip link set veth12 up + ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12 + + ip netns exec "${ns2}" ip link set veth21 up + ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21 + ip netns exec "${ns2}" ip link set veth20 up + ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20 + + sleep 1 + + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none +} + +cleanup() { + cleanup_all_ns + + if [ -f "${outfile}" ]; then + rm "${outfile}" + fi + if [ -f "${infile}" ]; then + rm "${infile}" + fi +} + +server_listen() { + ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & + server_pid=$! + sleep 0.2 +} + +client_connect() { + ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}" +} + +verify_data() { + wait "${server_pid}" + cmp "$infile" "$outfile" 2>/dev/null +} + +test_service() { + server_listen + client_connect + verify_data +} + + +test_dr() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 + + # avoid incorrect arp response + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 + # avoid reverse route lookup + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 + + test_service +} + +test_nat() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 + + ip netns exec "${ns2}" ip link del veth20 + ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21 + + test_service +} + +test_tun() { + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" modprobe -q ipip + ip netns exec "${ns1}" ip link set tunl0 up + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r ${rip_v4}:${port} + ip netns exec "${ns1}" ip addr add ${vip_v4}/32 dev lo:1 + + ip netns exec "${ns2}" modprobe -q ipip + ip netns exec "${ns2}" ip link set tunl0 up + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 + + test_service +} + +run_tests() { + local errors= + + echo "Testing DR mode..." + cleanup + setup + test_dr + errors=$(( $errors + $? )) + + echo "Testing NAT mode..." + cleanup + setup + test_nat + errors=$(( $errors + $? )) + + echo "Testing Tunnel mode..." + cleanup + setup + test_tun + errors=$(( $errors + $? )) + + return $errors +} + +trap cleanup EXIT + +run_tests + +if [ $? -ne 0 ]; then + echo -e "$(basename $0): ${RED}FAIL${NC}" + exit 1 +fi +echo -e "$(basename $0): ${GREEN}PASS${NC}" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh new file mode 100644 index 000000000000..bedd35298e15 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/lib.sh @@ -0,0 +1,10 @@ +net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "$net_netfilter_dir/../lib.sh" + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh new file mode 100755 index 000000000000..c6fdd2079f4d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +checktool "conntrack --version" "run test without conntrack" +checktool "iptables --version" "run test without iptables" +checktool "ip6tables --version" "run test without ip6tables" + +modprobe -q tun +modprobe -q nf_conntrack +# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns + +PDRILL_TIMEOUT=10 + +files=" +conntrack_ack_loss_stall.pkt +conntrack_inexact_rst.pkt +conntrack_syn_challenge_ack.pkt +conntrack_synack_old.pkt +conntrack_synack_reuse.pkt +conntrack_rst_invalid.pkt +" + +if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then + echo "SKIP: packetdrill not installed" + exit ${ksft_skip} +fi + +ret=0 + +run_packetdrill() +{ + filename="$1" + ipver="$2" + local mtu=1500 + + export NFCT_IP_VERSION="$ipver" + + if [ "$ipver" = "ipv4" ];then + export xtables="iptables" + elif [ "$ipver" = "ipv6" ];then + export xtables="ip6tables" + mtu=1520 + fi + + timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \ + --tolerance_usecs=1000000 --non_fatal packet "$filename" +} + +run_one_test_file() +{ + filename="$1" + + for v in ipv4 ipv6;do + printf "%-50s(%s)%-20s" "$filename" "$v" "" + if run_packetdrill packetdrill/"$f" "$v";then + echo OK + else + echo FAIL + ret=1 + fi + done +} + +echo "Replaying packetdrill test cases:" +for f in $files;do + run_one_test_file packetdrill/"$f" +done + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..1014551dd769 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +source lib.sh +ret=0 +socatpid=0 + +cleanup() +{ + [ "$socatpid" -gt 0 ] && kill "$socatpid" + + cleanup_all_ns +} + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "run test without iptables" + +trap cleanup EXIT + +setup_ns ns1 ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns "$ns1" dev veth1 +ip link set netns "$ns2" dev veth2 + +ip netns exec "$ns1" ip link set up dev lo +ip netns exec "$ns1" ip link set up dev veth1 +ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec "$ns2" ip link set up dev lo +ip netns exec "$ns2" ip link set up dev veth2 +ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & +socatpid=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... and route it to the other namespace +ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1 + +# add a persistent connection from the other namespace +ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null +ret=$? + +# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). +if [ $ret -eq 0 ]; then + echo "PASS: socat can connect via NAT'd address" +else + echo "FAIL: socat cannot connect via NAT'd address" +fi + +# check sport clashres. +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 + +sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & + +# if connect succeeds, client closes instantly due to EOF on stdin. +# if connect hangs, it will time out after 5s. +echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & +cpid2=$! + +time_then=$(date +%s) +wait $cpid2 +rv=$? +time_now=$(date +%s) + +# Check how much time has elapsed, expectation is for +# 'cpid2' to connect and then exit (and no connect delay). +delta=$((time_now - time_then)) + +if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then + echo "PASS: could connect to service via redirected ports" +else + echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c index 9e56b9d47037..9e56b9d47037 100644 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ b/tools/testing/selftests/net/netfilter/nf_queue.c diff --git a/tools/testing/selftests/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh index 99ed5bd6e840..902f8114bc80 100755 --- a/tools/testing/selftests/netfilter/nft_audit.sh +++ b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -6,11 +6,34 @@ SKIP_RC=4 RC=0 +if [ -r /var/run/auditd.pid ];then + read pid < /var/run/auditd.pid + p=$(pgrep ^auditd$) + + if [ "$pid" -eq "$p" ]; then + echo "SKIP: auditd is running" + exit $SKIP_RC + fi +fi + nft --version >/dev/null 2>&1 || { echo "SKIP: missing nft tool" exit $SKIP_RC } +# nft must be recent enough to support "reset" keyword. +nft --check -f /dev/stdin >/dev/null 2>&1 <<EOF +add table t +add chain t c +reset rules t c +EOF + +if [ "$?" -ne 0 ];then + echo -n "SKIP: nft reset feature test failed: " + nft --version + exit $SKIP_RC +fi + # Run everything in a separate network namespace [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } @@ -73,7 +96,7 @@ done for ((i = 0; i < 500; i++)); do echo "add rule t2 c3 counter accept comment \"rule $i\"" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=500 op=nft_register_rule' @@ -101,7 +124,7 @@ do_test 'nft add counter t2 c1; add counter t2 c2' \ for ((i = 3; i <= 500; i++)); do echo "add counter t2 c$i" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -115,7 +138,7 @@ do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ for ((i = 3; i <= 500; i++)); do echo "add quota t2 q$i { 10 bytes }" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -157,7 +180,7 @@ table=t2 family=2 entries=135 op=nft_reset_rule' # resetting sets and elements -elem=(22 ,80 ,443) +elem=(22 ",80" ",443") relem="" for i in {1..3}; do relem+="${elem[((i - 1))]}" diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index e908009576c7..6d66240e149c 100755 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # nft_concat_range.sh - Tests for sets with concatenation of ranged fields @@ -7,10 +7,10 @@ # # Author: Stefano Brivio <sbrivio@redhat.com> # -# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317 # ^ Configuration and templates sourced with eval, counters reused in subshells -KSELFTEST_SKIP=4 +source lib.sh # Available test groups: # - reported_issues: check for issues that were reported in the past @@ -19,7 +19,7 @@ KSELFTEST_SKIP=4 # - timeout: check that packets match entries until they expire # - performance: estimate matching rate, compare with rbtree and hash baselines TESTS="reported_issues correctness concurrency timeout" -[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" +[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}" # Set types, defined by TYPE_ variables below TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto @@ -31,7 +31,7 @@ BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" - ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + ../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh pktgen/pktgen_bench_xmit_mode_netif_receive.sh" # Definition of set types: @@ -66,7 +66,7 @@ src start 1 count 5 src_delta 2000 -tools sendip nc bash +tools sendip bash proto udp race_repeat 3 @@ -91,7 +91,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -116,7 +116,7 @@ src start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 3 @@ -141,7 +141,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -163,7 +163,7 @@ src mac start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -185,7 +185,7 @@ src mac proto start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -207,7 +207,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -227,7 +227,7 @@ src addr6 port start 10 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp6 race_repeat 3 @@ -247,7 +247,7 @@ src mac proto addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -264,7 +264,7 @@ src mac start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -286,7 +286,7 @@ src mac addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -337,7 +337,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp race_repeat 3 @@ -363,7 +363,7 @@ src mac start 1 count 1 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -473,8 +473,6 @@ setup_veth() { B() { ip netns exec B "$@" >/dev/null 2>&1 } - - sleep 2 } # Fill in set template and initialise set @@ -488,12 +486,6 @@ check_tools() { __tools= for tool in ${tools}; do - if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ - ! nc -u -w0 1.1.1.1 1 2>/dev/null; then - # Some GNU netcat builds might not support IPv6 - __tools="${__tools} netcat-openbsd" - continue - fi __tools="${__tools} ${tool}" command -v "${tool}" >/dev/null && return 0 @@ -554,30 +546,7 @@ setup_send_udp() { ip addr add "${dst_addr4}" dev veth_a 2>/dev/null [ -z "${dst_port}" ] && dst_port=12345 - echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" - - src_addr4= - src_port= - } - elif command -v nc >/dev/null; then - if nc -u -w0 1.1.1.1 1 2>/dev/null; then - # OpenBSD netcat - nc_opt="-w0" - else - # GNU netcat - nc_opt="-q0" - fi - - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __src_addr4="-s ${src_addr4}" - fi - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ - "${src_port}" "${dst_addr4}" "${dst_port}" + echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}" src_addr4= src_port= @@ -632,11 +601,7 @@ setup_send_udp6() { __socatbind6= if [ -n "${src_addr6}" ]; then - if [ -n "${src_addr6} != "${src_addr6_added} ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - - src_addr6_added=${src_addr6} - fi + B ip addr add "${src_addr6}" dev veth_b nodad __socatbind6=",bind=[${src_addr6}]" @@ -645,26 +610,7 @@ setup_send_udp6() { fi fi - echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" - } - elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then - # GNU netcat might not work with IPv6, try next tool - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - # shellcheck disable=SC2086 # this needs split options - echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ - ${dst_addr6} ${dst_port} - - src_addr6= - src_port= + echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}" } elif [ -z "$(bash -c 'type -p')" ]; then send_udp6() { @@ -679,10 +625,17 @@ setup_send_udp6() { fi } +listener_ready() +{ + port="$1" + ss -lnt -o "sport = :$port" | grep -q "$port" +} + # Set up function to send TCP traffic on IPv4 setup_flood_tcp() { if command -v iperf3 >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -699,7 +652,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ @@ -711,6 +664,7 @@ setup_flood_tcp() { } elif command -v iperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -727,7 +681,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ @@ -739,6 +693,7 @@ setup_flood_tcp() { } elif command -v netperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -755,7 +710,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -774,6 +729,7 @@ setup_flood_tcp() { setup_flood_tcp6() { if command -v iperf3 >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -790,7 +746,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr6}" ${dst_port} \ @@ -802,6 +758,7 @@ setup_flood_tcp6() { } elif command -v iperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -818,7 +775,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr6}" -V ${dst_port} \ @@ -830,6 +787,7 @@ setup_flood_tcp6() { } elif command -v netperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -846,7 +804,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options netserver -6 ${dst_port} -L "${dst_addr6}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -6 -H "${dst_addr6}" ${dst_port} \ @@ -865,6 +823,7 @@ setup_flood_tcp6() { setup_flood_udp() { if command -v iperf3 >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -881,7 +840,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ @@ -893,6 +852,7 @@ setup_flood_udp() { } elif command -v iperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -909,7 +869,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ @@ -921,6 +881,7 @@ setup_flood_udp() { } elif command -v netperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -937,7 +898,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -982,6 +943,7 @@ cleanup() { ip link del dummy0 2>/dev/null ip route del default 2>/dev/null ip -6 route del default 2>/dev/null + ip netns pids B 2>/dev/null | xargs kill 2>/dev/null ip netns del B 2>/dev/null ip link del veth_a 2>/dev/null timeout= @@ -989,15 +951,18 @@ cleanup() { killall iperf 2>/dev/null killall netperf 2>/dev/null killall netserver 2>/dev/null - rm -f ${tmp} - sleep 2 +} + +cleanup_exit() { + cleanup + rm -f "$tmp" } # Entry point for setup functions setup() { if [ "$(id -u)" -ne 0 ]; then echo " need to run as root" - exit ${KSELFTEST_SKIP} + exit ${ksft_skip} fi cleanup @@ -1258,7 +1223,7 @@ send_nomatch() { # - check that packets outside range don't match it # - remove some elements, check that packets don't match anymore test_correctness() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} range_size=1 for i in $(seq "${start}" $((start + count))); do @@ -1273,7 +1238,7 @@ test_correctness() { srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 @@ -1281,7 +1246,7 @@ test_correctness() { # Delete elements now and then if [ $((i % 3)) -eq 0 ]; then del "$(format)" || return 1 - for j in $(seq ${start} \ + for j in $(seq "$start" \ $((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) \ || return 1 @@ -1307,12 +1272,12 @@ test_concurrency() { proto=${flood_proto} tools=${flood_tools} chain_spec=${flood_spec} - setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth flood_"${proto}" set || return ${ksft_skip} range_size=1 cstart=${start} flood_pids= - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1325,7 +1290,7 @@ test_concurrency() { start=$((end + range_size)) done - sleep 10 + sleep $((RANDOM%10)) pids= for c in $(seq 1 "$(nproc)"); do ( @@ -1335,7 +1300,7 @@ test_concurrency() { # $start needs to be local to this subshell # shellcheck disable=SC2030 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1350,7 +1315,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1366,7 +1331,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1379,7 +1344,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1407,31 +1372,34 @@ test_concurrency() { # - add all the elements with 3s timeout while checking that packets match # - wait 3s after the last insertion, check that packets don't match any entry test_timeout() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} timeout=3 + + [ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8 + range_size=1 - for i in $(seq "${start}" $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done range_size=$((range_size + 1)) start=$((end + range_size)) done - sleep 3 - for i in $(seq ${start} $((start + count))); do + sleep $timeout + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) || return 1 done @@ -1450,13 +1418,13 @@ test_performance() { chain_spec=${perf_spec} dst="${perf_dst}" src="${perf_src}" - setup veth perf set || return ${KSELFTEST_SKIP} + setup veth perf set || return ${ksft_skip} first=${start} range_size=1 for set in test norange noconcat; do start=${first} - for i in $(seq ${start} $((start + perf_entries))); do + for i in $(seq "$start" $((start + perf_entries))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1464,7 +1432,7 @@ test_performance() { if [ $((end / 65534)) -gt $((start / 65534)) ]; then start=${end} end=$((end + 1)) - elif [ ${start} -eq ${end} ]; then + elif [ "$start" -eq "$end" ]; then end=$((start + 1)) fi @@ -1475,7 +1443,7 @@ test_performance() { nft -f "${tmp}" done - perf $((end - 1)) ${srcstart} + perf $((end - 1)) "$srcstart" sleep 2 @@ -1519,14 +1487,17 @@ test_performance() { } test_bug_flush_remove_add() { + rounds=100 + [ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10 + set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} - nft add element t s ${elem1} 2>/dev/null || return 1 + for i in $(seq 1 $rounds); do + nft add table t "$set_cmd" || return ${ksft_skip} + nft add element t s "$elem1" 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 - nft add element t s ${elem2} 2>/dev/null || return 1 + nft add element t s "$elem2" 2>/dev/null || return 1 done nft flush ruleset } @@ -1534,7 +1505,7 @@ test_bug_flush_remove_add() { # - add ranged element, check that packets match it # - reload the set, check packets still match test_bug_reload() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} rstart=${start} range_size=1 @@ -1573,7 +1544,7 @@ test_bug_reload() { srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done @@ -1591,12 +1562,12 @@ test_reported_issues() { # Run everything in a separate network namespace [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } tmp="$(mktemp)" -trap cleanup EXIT +trap cleanup_exit EXIT # Entry point for test runs passed=0 for name in ${TESTS}; do - printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" + printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')" if [ "${name}" = "reported_issues" ]; then SUBTESTS="${BUGS}" else @@ -1623,10 +1594,16 @@ for name in ${TESTS}; do continue fi - printf " %-60s " "${display}" + [ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1 + + printf " %-32s " "${display}" + tthen=$(date +%s) eval test_"${name}" ret=$? + tnow=$(date +%s) + printf "%5ds%-30s" $((tnow-tthen)) + if [ $ret -eq 0 ]; then printf "[ OK ]\n" info_flush @@ -1635,11 +1612,11 @@ for name in ${TESTS}; do printf "[FAIL]\n" err_flush exit 1 - elif [ $ret -eq ${KSELFTEST_SKIP} ]; then + elif [ $ret -eq ${ksft_skip} ]; then printf "[SKIP]\n" err_flush fi done done -[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 +[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh new file mode 100755 index 000000000000..5d276995a5c5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +source lib.sh + +[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip} + +NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh new file mode 100755 index 000000000000..abcaa7337197 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# This tests connection tracking helper assignment: +# 1. can attach ftp helper to a connection from nft ruleset. +# 2. auto-assign still works. +# +# Kselftest framework requirement - SKIP code is 4. + +source lib.sh + +ret=0 + +testipv6=1 + +checktool "socat -h" "run test without socat" +checktool "conntrack --version" "run test without conntrack" +checktool "nft --version" "run test without nft" + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + + ip netns del "$ns1" + ip netns del "$ns2" +} + +trap cleanup EXIT + +setup_ns ns1 ns2 + +if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net "$ns1" link set veth0 up +ip -net "$ns2" link set veth0 up + +ip -net "$ns1" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$ns2" addr add 10.0.1.2/24 dev veth0 +ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad + +load_ruleset_family() { + local family=$1 + local ns=$2 + +ip netns exec "$ns" nft -f - <<EOF +table $family raw { + ct helper ftp { + type "ftp" protocol tcp + } + chain pre { + type filter hook prerouting priority 0; policy accept; + tcp dport 2121 ct helper set "ftp" + } + chain output { + type filter hook output priority 0; policy accept; + tcp dport 2121 ct helper set "ftp" + } +} +EOF + return $? +} + +check_for_helper() +{ + local netns=$1 + local message=$2 + local port=$3 + + if echo "$message" |grep -q 'ipv6';then + local family="ipv6" + else + local family="ipv4" + fi + + if ! ip netns exec "$netns" conntrack -L -f $family -p tcp --dport "$port" 2> /dev/null |grep -q 'helper=ftp';then + if [ "$autoassign" -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ "$autoassign" -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi + fi + + return 0 +} + +listener_ready() +{ + ns="$1" + port="$2" + proto="$3" + ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port" +} + +test_helper() +{ + local port=$1 + local autoassign=$2 + + if [ "$autoassign" -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi + + ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4" + + ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ip $msg" "$port" "$autoassign" + check_for_helper "$ns2" "ip $msg" "$port" "$autoassign" + + if [ $testipv6 -eq 0 ] ;then + return 0 + fi + + ip netns exec "$ns1" conntrack -F 2> /dev/null + ip netns exec "$ns2" conntrack -F 2> /dev/null + + ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6" + + ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null + + check_for_helper "$ns1" "ipv6 $msg" "$port" + check_for_helper "$ns2" "ipv6 $msg" "$port" +} + +if ! load_ruleset_family ip "$ns1"; then + echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 + exit 1 +fi + +if ! load_ruleset_family ip6 "$ns1"; then + echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 + testipv6=0 +fi + +if ! load_ruleset_family inet "${ns2}"; then + echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 + if ! load_ruleset_family ip "${ns2}"; then + echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 + exit 1 + fi + + if [ "$testipv6" -eq 1 ] ;then + if ! load_ruleset_family ip6 "$ns2"; then + echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 + exit 1 + fi + fi +fi + +test_helper 2121 0 +ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh new file mode 100755 index 000000000000..ce1451c275fd --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. + +source lib.sh + +ret=0 + +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + cleanup_all_ns + + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +checktool "nft --version" "run test without nft" + +setup_ns nsrouter ns1 ns2 + +trap cleanup EXIT + +if dmesg | grep -q ' nft_rpfilter: ';then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 + +load_ruleset() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority 0; policy accept; + fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop + } +} +EOF +} + +load_pbr_ruleset() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain forward { + type filter hook forward priority raw; + fib saddr . iif oif gt 0 accept + log drop + } +} +EOF +} + +load_ruleset_count() { + local netns=$1 + +ip netns exec "$netns" nft -f /dev/stdin <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority 0; policy accept; + ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop + ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop + } +} +EOF +} + +check_drops() { + if dmesg | grep -q ' nft_rpfilter: ';then + dmesg | grep ' nft_rpfilter: ' + echo "FAIL: rpfilter did drop packets" + return 1 + fi + + return 0 +} + +check_fib_counter() { + local want=$1 + local ns=$2 + local address=$3 + + if ! ip netns exec "$ns" nft list table inet filter | grep 'fib saddr . iif' | grep "$address" | grep -q "packets $want";then + echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2 + ip netns exec "$ns" nft list table inet filter + return 1 + fi + + if [ "$want" -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset "$nsrouter" +load_ruleset "$ns1" +load_ruleset "$ns2" + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec "$nsrouter" nft flush table inet filter + +ip -net "$ns1" route del default +ip -net "$ns1" -6 route del default + +ip -net "$ns1" addr del 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr del dead:1::99/64 dev eth0 + +ip -net "$ns1" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad + +ip -net "$ns1" route add default via 10.0.2.1 +ip -net "$ns1" -6 route add default via dead:2::1 + +ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count "$nsrouter" + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. +check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1 +check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1 + +ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1 + +# delete all rules +ip netns exec "$ns1" nft flush ruleset +ip netns exec "$ns2" nft flush ruleset +ip netns exec "$nsrouter" nft flush ruleset + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad + +ip -net "$ns1" addr del 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr del dead:2::99/64 dev eth0 + +ip -net "$nsrouter" addr del dead:2::1/64 dev veth0 + +# ... pbr ruleset for the router, check iif+oif. +if ! load_pbr_ruleset "$nsrouter";then + echo "SKIP: Could not load fib forward ruleset" + exit $ksft_skip +fi + +ip -net "$nsrouter" rule add from all table 128 +ip -net "$nsrouter" rule add from all iif veth0 table 129 +ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1 + +# drop main ipv4 table +ip -net "$nsrouter" -4 rule delete table main + +if ! test_ping 10.0.2.99 dead:2::99;then + ip -net "$nsrouter" nft list ruleset + echo "FAIL: fib mismatch in pbr setup" + exit 1 +fi + +echo "PASS: fib expression forward check with policy based routing" +exit 0 diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a32f490f7539..b3995550856a 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -14,15 +14,10 @@ # nft_flowtable.sh -o8000 -l1500 -r2000 # -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsr1="nsr1-$sfx" -nsr2="nsr2-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh + ret=0 +SOCAT_TIMEOUT=60 nsin="" ns1out="" @@ -30,52 +25,41 @@ ns2out="" log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add $nsr1" "create net namespace $nsr1" +checktool "socat -h" "run test without socat" -ip netns add $ns1 -ip netns add $ns2 -ip netns add $nsr2 +setup_ns ns1 ns2 nsr1 nsr2 cleanup() { - ip netns del $ns1 - ip netns del $ns2 - ip netns del $nsr1 - ip netns del $nsr2 + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns rm -f "$nsin" "$ns1out" "$ns2out" - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" } trap cleanup EXIT sysctl -q net.netfilter.nf_log_all_netns=1 -ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 +ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2" -ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 +ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2" -for dev in lo veth0 veth1; do - ip -net $nsr1 link set $dev up - ip -net $nsr2 link set $dev up +for dev in veth0 veth1; do + ip -net "$nsr1" link set "$dev" up + ip -net "$nsr2" link set "$dev" up done -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad -ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net $nsr2 addr add dead:2::1/64 dev veth1 +ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad # set different MTUs so we need to push packets coming from ns1 (large MTU) # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), @@ -107,56 +91,63 @@ do esac done -if ! ip -net $nsr1 link set veth0 mtu $omtu; then +if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then + exit 1 +fi + +ip -net "$ns1" link set eth0 mtu "$omtu" + +if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then exit 1 fi -ip -net $ns1 link set eth0 mtu $omtu +if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then + exit 1 +fi -if ! ip -net $nsr2 link set veth1 mtu $rmtu; then +if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then exit 1 fi -ip -net $ns2 link set eth0 mtu $rmtu +ip -net "$ns2" link set eth0 mtu "$rmtu" # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. -ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net $nsr1 addr add fee1:2::1/64 dev veth1 +ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1 +ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad -ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net $nsr2 addr add fee1:2::2/64 dev veth0 +ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 +ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad for i in 0 1; do - ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null - ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null done -for ns in $ns1 $ns2;do - ip -net $ns link set lo up - ip -net $ns link set eth0 up +for ns in "$ns1" "$ns2";do + ip -net "$ns" link set eth0 up - if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then echo "ERROR: Check Originator/Responder values (problem during address addition)" exit 1 fi # don't set ip DF bit for first two tests - ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null + ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns2 addr add dead:2::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$ns2" route add default via dead:2::1 -ip -net $nsr1 route add default via 192.168.10.2 -ip -net $nsr2 route add default via 192.168.10.1 +ip -net "$nsr1" route add default via 192.168.10.2 +ip -net "$nsr2" route add default via 192.168.10.1 -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table inet filter { flowtable f1 { hook ingress priority 0 @@ -188,7 +179,7 @@ if [ $? -ne 0 ]; then exit $ksft_skip fi -ip netns exec $ns2 nft -f - <<EOF +ip netns exec "$ns2" nft -f - <<EOF table inet filter { counter ip4dscp0 { } counter ip4dscp3 { } @@ -204,25 +195,22 @@ table inet filter { EOF if [ $? -ne 0 ]; then - echo "SKIP: Could not load nft ruleset" + echo -n "SKIP: Could not load ruleset: " + nft --version exit $ksft_skip fi # test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: $ns1 cannot reach ns2" 1>&2 exit 1 fi -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then echo "ERROR: $ns2 cannot reach $ns1" 1>&2 exit 1 fi -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" -fi - nsin=$(mktemp) ns1out=$(mktemp) ns2out=$(mktemp) @@ -248,23 +236,27 @@ check_counters() local what=$1 local ok=1 - local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) - local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) + local orig repl + orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets) + repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets) local orig_cnt=${orig#*bytes} local repl_cnt=${repl#*bytes} - local fs=$(du -sb $nsin) + local fs + fs=$(du -sb "$nsin") local max_orig=${fs%%/*} local max_repl=$((max_orig/4)) - if [ $orig_cnt -gt $max_orig ];then + # flowtable fastpath should bypass normal routing one, i.e. the counters in forward hook + # should always be lower than the size of the transmitted file (max_orig). + if [ "$orig_cnt" -gt "$max_orig" ];then echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 ret=1 ok=0 fi - if [ $repl_cnt -gt $max_repl ];then + if [ "$repl_cnt" -gt $max_repl ];then echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 ret=1 ok=0 @@ -280,39 +272,40 @@ check_dscp() local what=$1 local ok=1 - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) + local counter + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets) local pc4=${counter%*bytes*} local pc4=${pc4#*packets} - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets) local pc4z=${counter%*bytes*} local pc4z=${pc4z#*packets} case "$what" in "dscp_none") - if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_fwd") - if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_ingress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_egress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 @@ -324,7 +317,7 @@ check_dscp() ok=0 esac - if [ $ok -eq 1 ] ;then + if [ "$ok" -eq 1 ] ;then echo "PASS: $what: dscp packet counters match" fi } @@ -345,6 +338,11 @@ check_transfer() return 0 } +listener_ready() +{ + ss -N "$nsb" -lnt -o "sport = :12345" | grep -q 12345 +} + test_tcp_forwarding_ip() { local nsa=$1 @@ -353,40 +351,23 @@ test_tcp_forwarding_ip() local dstport=$4 local lret=0 - ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & lpid=$! - sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & - cpid=$! - - sleep 1 - - prev="$(ls -l $ns1out $ns2out)" - sleep 1 - - while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do - sleep 1; - prev="$(ls -l $ns1out $ns2out)" - done + busywait 1000 listener_ready - if test -d /proc/"$lpid"/; then - kill $lpid - fi - - if test -d /proc/"$cpid"/; then - kill $cpid - fi + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" wait $lpid - wait $cpid if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 + ret=1 fi if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then lret=1 + ret=1 fi return $lret @@ -403,7 +384,7 @@ test_tcp_forwarding_set_dscp() { check_dscp "dscp_none" -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { chain setdscp0 { type filter hook ingress device "veth0" priority 0; policy accept @@ -415,12 +396,12 @@ if [ $? -eq 0 ]; then test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_ingress" - ip netns exec $nsr1 nft delete table netdev dscpmangle + ip netns exec "$nsr1" nft delete table netdev dscpmangle else echo "SKIP: Could not load netdev:ingress for veth0" fi -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { chain setdscp0 { type filter hook egress device "veth1" priority 0; policy accept @@ -432,14 +413,14 @@ if [ $? -eq 0 ]; then test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_egress" - ip netns exec $nsr1 nft flush table netdev dscpmangle + ip netns exec "$nsr1" nft flush table netdev dscpmangle else echo "SKIP: Could not load netdev:egress for veth1" fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). - ip netns exec $nsr1 nft -a insert rule inet filter forward ip dscp set cs3 + ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 check_dscp "dscp_fwd" } @@ -455,8 +436,8 @@ test_tcp_forwarding_nat() pmtu=$3 what=$4 - if [ $lret -eq 0 ] ; then - if [ $pmtu -eq 1 ] ;then + if [ "$lret" -eq 0 ] ; then + if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what" else echo "PASS: flow offload for ns1/ns2 with masquerade $what" @@ -464,9 +445,9 @@ test_tcp_forwarding_nat() test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 lret=$? - if [ $pmtu -eq 1 ] ;then + if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what" - elif [ $lret -eq 0 ] ; then + elif [ "$lret" -eq 0 ] ; then echo "PASS: flow offload for ns1/ns2 with dnat $what" fi fi @@ -481,25 +462,25 @@ make_file "$nsin" # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding $ns1 $ns2; then +if test_tcp_forwarding "$ns1" "$ns2"; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. -ip -net $ns2 route del default via 10.0.2.1 -ip -net $ns2 route del default via dead:2::1 -ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route del default via 10.0.2.1 +ip -net "$ns2" route del default via dead:2::1 +ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1 # Second test: # Same, but with NAT enabled. Same as in first test: we expect normal forward path # to handle most packets. -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF table ip nat { chain prerouting { type nat hook prerouting priority 0; policy accept; @@ -513,14 +494,14 @@ table ip nat { } EOF -if ! test_tcp_forwarding_set_dscp $ns1 $ns2 0 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2 exit 0 fi -if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi @@ -528,35 +509,40 @@ fi # Same as second test, but with PMTU discovery enabled. This # means that we expect the fastpath to handle packets as soon # as the endpoints adjust the packet size. -ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # reset counters. # With pmtu in-place we'll also check that nft counters # are lower than file size and packets were forwarded via flowtable layer. # For earlier tests (large mtus), packets cannot be handled via flowtable # (except pure acks and other small packets). -ip netns exec $nsr1 nft reset counters table inet filter >/dev/null +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset fi # Another test: # Add bridge interface br0 to Router1, with NAT enabled. -ip -net $nsr1 link add name br0 type bridge -ip -net $nsr1 addr flush dev veth0 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set veth0 master br0 -ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 -ip -net $nsr1 link set up dev br0 +test_bridge() { +if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then + echo "SKIP: could not add bridge br0" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" addr flush dev veth0 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set veth0 master br0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev br0 +ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad +ip -net "$nsr1" link set up dev br0 -ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null +ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null # br0 with NAT enabled. -ip netns exec $nsr1 nft -f - <<EOF +ip netns exec "$nsr1" nft -f - <<EOF flush table ip nat table ip nat { chain prerouting { @@ -571,56 +557,59 @@ table ip nat { } EOF -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # Another test: # Add bridge interface br0 to Router1, with NAT and VLAN. -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set down dev veth0 -ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set up dev veth0.10 -ip -net $nsr1 link set veth0.10 master br0 - -ip -net $ns1 addr flush dev eth0 -ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net $ns1 link set eth0 up -ip -net $ns1 link set eth0.10 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set down dev veth0 +ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set up dev veth0.10 +ip -net "$nsr1" link set veth0.10 master br0 + +ip -net "$ns1" addr flush dev eth0 +ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" link set eth0.10 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # restore test topology (remove bridge and VLAN) -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set veth0 down -ip -net $nsr1 link set veth0.10 down -ip -net $nsr1 link delete veth0.10 type vlan -ip -net $nsr1 link delete br0 type bridge -ip -net $ns1 addr flush dev eth0.10 -ip -net $ns1 link set eth0.10 down -ip -net $ns1 link set eth0 down -ip -net $ns1 link delete eth0.10 type vlan +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set veth0 down +ip -net "$nsr1" link set veth0.10 down +ip -net "$nsr1" link delete veth0.10 type vlan +ip -net "$nsr1" link delete br0 type bridge +ip -net "$ns1" addr flush dev eth0.10 +ip -net "$ns1" link set eth0.10 down +ip -net "$ns1" link set eth0 down +ip -net "$ns1" link delete eth0.10 type vlan # restore address in ns1 and nsr1 -ip -net $ns1 link set eth0 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 -ip -net $nsr1 link set up dev veth0 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsr1" link set up dev veth0 +} + +test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) @@ -640,33 +629,43 @@ do_esp() { local spi_out=$6 local spi_in=$7 - ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet - ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet" + ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet" # to encrypt packets as they go out (includes forwarded packets that need encapsulation) - ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow + ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow # to fwd decrypted packets after esp processing: - ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow - + ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow } -do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2" -do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1" -ip netns exec $nsr1 nft delete table ip nat +ip netns exec "$nsr1" nft delete table ip nat # restore default routes -ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding $ns1 $ns2; then +if test_tcp_forwarding "$ns1" "$ns2"; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec $nsr1 nft list ruleset 1>&2 - ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 +fi + +if [ "$1" = "" ]; then + low=1280 + mtu=$((65536 - low)) + o=$(((RANDOM%mtu) + low)) + l=$(((RANDOM%mtu) + low)) + r=$(((RANDOM%mtu) + low)) + + echo "re-run with random mtus: -o $o -l $l -r $r" + $0 -o "$o" -l "$l" -r "$r" fi exit $ret diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh index f33154c04d34..71505b6cb252 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/net/netfilter/nft_meta.sh @@ -91,10 +91,10 @@ check_one_counter() local want="packets $2" local verbose="$3" - if ! ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then + if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $cname + ip netns exec "$ns0" nft list counter inet filter "$cname" fi } diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh index dd40d9f6f259..9e39de26455f 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat.sh @@ -3,77 +3,60 @@ # This test is for basic NAT functionality: snat, dnat, redirect, masquerade. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh + ret=0 test_inet_nat=true -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" +checktool "nft --version" "run test without nft tool" +checktool "socat -h" "run test without socat" cleanup() { - for i in 0 1 2; do ip netns del ns$i-"$sfx";done -} + ip netns pids "$ns0" | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi + rm -f "$INFILE" "$OUTFILE" -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi + cleanup_all_ns +} trap cleanup EXIT -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi +INFILE=$(mktemp) +OUTFILE=$(mktemp) -ip netns add "$ns2" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns2" - exit $ksft_skip -fi +setup_ns ns0 ns1 ns2 -ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" -ip -net "$ns0" link set lo up ip -net "$ns0" link set veth0 up ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 -ip -net "$ns0" addr add dead:1::1/64 dev veth0 +ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad ip -net "$ns0" link set veth1 up ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 -ip -net "$ns0" addr add dead:2::1/64 dev veth1 - -for i in 1 2; do - ip -net ns$i-$sfx link set lo up - ip -net ns$i-$sfx link set eth0 up - ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i-$sfx route add default via 10.0.$i.1 - ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 - ip -net ns$i-$sfx route add default via dead:$i::1 -done +ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad + +do_config() +{ + ns="$1" + subnet="$2" + + ip -net "$ns" link set eth0 up + ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0 + ip -net "$ns" route add default via "10.0.$subnet.1" + ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad + ip -net "$ns" route add default via "dead:$subnet::1" +} + +do_config "$ns1" 1 +do_config "$ns2" 2 bad_counter() { @@ -83,7 +66,7 @@ bad_counter() local tag=$4 echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 - ip netns exec $ns nft list counter inet filter $counter 1>&2 + ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2 } check_counters() @@ -91,26 +74,23 @@ check_counters() ns=$1 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" + if ! ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" + + if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in6 "$expect" "check_counters 3" + if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then + bad_counter "$ns" ns0in6 "$expect" "check_counters 3" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out6 "$expect" "check_counters 4" + if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then + bad_counter "$ns" ns0out6 "$expect" "check_counters 4" lret=1 fi @@ -122,41 +102,35 @@ check_ns0_counters() local ns=$1 local lret=0 - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in6 "packets 0 bytes 0" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " lret=1 fi for dir in "in" "out" ; do expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5" lret=1 fi done @@ -166,8 +140,8 @@ check_ns0_counters() reset_counters() { - for i in 0 1 2;do - ip netns exec ns$i-$sfx nft reset counters inet > /dev/null + for i in "$ns0" "$ns1" "$ns2" ;do + ip netns exec "$i" nft reset counters inet > /dev/null done } @@ -177,7 +151,7 @@ test_local_dnat6() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip6" fi @@ -195,8 +169,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null;then lret=1 echo "ERROR: ping6 failed" return $lret @@ -204,8 +177,7 @@ EOF expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" lret=1 fi @@ -213,8 +185,7 @@ EOF expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" lret=1 fi @@ -223,8 +194,7 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" lret=1 fi @@ -233,8 +203,7 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" lret=1 fi @@ -252,7 +221,7 @@ test_local_dnat() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip" fi @@ -265,7 +234,7 @@ table $family nat { } EOF if [ $? -ne 0 ]; then - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then echo "SKIP: inet nat tests" test_inet_nat=false return $ksft_skip @@ -275,8 +244,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -284,18 +252,16 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1" lret=1 fi done expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2" lret=1 fi done @@ -303,9 +269,8 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" + if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3" lret=1 fi done @@ -313,20 +278,18 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" + if ! ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4" lret=1 fi done test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" - ip netns exec "$ns0" nft flush chain $family nat output + ip netns exec "$ns0" nft flush chain "$family" nat output reset_counters - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -334,16 +297,14 @@ EOF expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" lret=1 fi done expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" lret=1 fi @@ -352,8 +313,7 @@ EOF # expect 1 count in ns1 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" lret=1 fi @@ -362,8 +322,7 @@ EOF # expect 0 packet in ns2 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" lret=1 fi @@ -374,13 +333,19 @@ EOF return $lret } +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + test_local_dnat_portonly() { local family=$1 local daddr=$2 local lret=0 - local sr_s - local sr_r ip netns exec "$ns0" nft -f /dev/stdin <<EOF table $family nat { @@ -392,7 +357,7 @@ table $family nat { } EOF if [ $? -ne 0 ]; then - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then echo "SKIP: inet port test" test_inet_nat=false return @@ -401,17 +366,16 @@ EOF return fi - echo SERVER-$family | ip netns exec "$ns1" timeout 5 socat -u STDIN TCP-LISTEN:2000 & - sc_s=$! + echo "SERVER-$family" | ip netns exec "$ns1" timeout 3 socat -u STDIN TCP-LISTEN:2000 & - sleep 1 + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 2000 "-t" - result=$(ip netns exec "$ns0" timeout 1 socat TCP:$daddr:2000 STDOUT) + result=$(ip netns exec "$ns0" timeout 1 socat -u TCP:"$daddr":2000 STDOUT) if [ "$result" = "SERVER-inet" ];then echo "PASS: inet port rewrite without l3 address" else - echo "ERROR: inet port rewrite" + echo "ERROR: inet port rewrite without l3 address, got $result" ret=1 fi } @@ -424,24 +388,20 @@ test_masquerade6() ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" return 1 - lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2" lret=1 fi done @@ -462,8 +422,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -471,14 +430,12 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" lret=1 fi @@ -487,27 +444,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -526,23 +479,20 @@ test_masquerade() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 $natflags" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2" lret=1 fi done @@ -563,8 +513,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -572,15 +521,13 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4" lret=1 fi done @@ -588,27 +535,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -625,22 +568,19 @@ test_redirect6() ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" lret=1 fi @@ -662,8 +602,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" lret=1 fi @@ -671,8 +610,7 @@ EOF # ns1 should have seen no packets from ns2, due to redirection expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" lret=1 fi @@ -681,15 +619,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -707,22 +643,19 @@ test_redirect() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" lret=1 fi @@ -744,8 +677,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" lret=1 fi @@ -754,8 +686,7 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" lret=1 fi @@ -764,15 +695,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -803,13 +732,13 @@ test_port_shadow() # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 - echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & - sc_r=$! + echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null & + local sc_r=$! + echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null & + local sc_c=$! - echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & - sc_c=$! - - sleep 0.3 + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u" + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u" # ns1 tries to connect to ns0:1405. With default settings this should connect # to client, it matches the conntrack entry created above. @@ -846,7 +775,7 @@ table $family filter { EOF test_port_shadow "port-filter" "ROUTER" - ip netns exec "$ns0" nft delete table $family filter + ip netns exec "$ns0" nft delete table "$family" filter } # This prevents port shadow of router service via notrack. @@ -868,7 +797,7 @@ table $family raw { EOF test_port_shadow "port-notrack" "ROUTER" - ip netns exec "$ns0" nft delete table $family raw + ip netns exec "$ns0" nft delete table "$family" raw } # This prevents port shadow of router service via sport remap. @@ -886,21 +815,19 @@ table $family pat { EOF test_port_shadow "pat" "ROUTER" - ip netns exec "$ns0" nft delete table $family pat + ip netns exec "$ns0" nft delete table "$family" pat } test_port_shadowing() { local family="ip" - conntrack -h >/dev/null 2>&1 - if [ $? -ne 0 ];then + if ! conntrack -h >/dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without conntrack tool" return fi - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! socat -h > /dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without socat tool" return fi @@ -946,8 +873,7 @@ test_stateless_nat_ip() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" return 1 fi @@ -981,23 +907,20 @@ EOF reset_counters - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null; then echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" lret=1 fi # ns1 should have seen packets from .2.2, due to stateless rewrite. expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" lret=1 fi for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" lret=1 fi @@ -1006,14 +929,12 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" lret=1 fi @@ -1021,8 +942,7 @@ EOF reset_counters - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! socat -h > /dev/null 2>&1;then echo "SKIP: Could not run stateless nat frag test without socat tool" if [ $lret -eq 0 ]; then return $ksft_skip @@ -1032,42 +952,36 @@ EOF return $lret fi - local tmpfile=$(mktemp) - dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null - local outfile=$(mktemp) - ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & - sc_r=$! + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u" - sleep 1 # re-do with large ping -> ip fragmentation - ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 lret=1 fi wait - cmp "$tmpfile" "$outfile" - if [ $? -ne 0 ]; then - ls -l "$tmpfile" "$outfile" + if ! cmp "$INFILE" "$OUTFILE";then + ls -l "$INFILE" "$OUTFILE" echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 lret=1 fi - rm -f "$tmpfile" "$outfile" + :> "$OUTFILE" # ns1 should have seen packets from 2.2, due to stateless rewrite. expect="packets 3 bytes 4164" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" lret=1 fi - ip netns exec "$ns0" nft delete table ip stateless - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table ip stateless; then echo "ERROR: Could not delete table ip stateless" 1>&2 lret=1 fi @@ -1078,8 +992,8 @@ EOF } # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 -for i in 0 1 2; do -ip netns exec ns$i-$sfx nft -f /dev/stdin <<EOF +for i in "$ns0" "$ns1" "$ns2" ;do +ip netns exec "$i" nft -f /dev/stdin <<EOF table inet filter { counter ns0in {} counter ns1in {} @@ -1145,7 +1059,7 @@ done # special case for stateless nat check, counter needs to # be done before (input) ip defragmentation -ip netns exec ns1-$sfx nft -f /dev/stdin <<EOF +ip netns exec "$ns1" nft -f /dev/stdin <<EOF table inet filter { counter ns0insl {} @@ -1156,31 +1070,49 @@ table inet filter { } EOF -sleep 3 -# test basic connectivity -for i in 1 2; do - ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s)" 1>&2 - ret=1 - fi - - ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 - ret=1 - fi - check_counters ns$i-$sfx - if [ $? -ne 0 ]; then - ret=1 - fi - - check_ns0_counters ns$i - if [ $? -ne 0 ]; then - ret=1 - fi - reset_counters -done +ping_basic() +{ + i="$1" + if ! ip netns exec "$ns0" ping -c 1 -q 10.0."$i".99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s)" 1>&2 + ret=1 + fi + + if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 + ret=1 + fi +} + +test_basic_conn() +{ + local nsexec + name="$1" + + nsexec=$(eval echo \$"$1") + + ping_basic 1 + ping_basic 2 + + if ! check_counters "$nsexec";then + return 1 + fi + + if ! check_ns0_counters "$name";then + return 1 + fi + + reset_counters + return 0 +} + +if ! test_basic_conn "ns1" ; then + echo "ERROR: basic test for ns1 failed" 1>&2 + exit 1 +fi +if ! test_basic_conn "ns2"; then + echo "ERROR: basic test for ns1 failed" 1>&2 +fi if [ $ret -eq 0 ];then echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh index b9ab37380f33..3b81d88bdde3 100755 --- a/tools/testing/selftests/netfilter/nft_nat_zones.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -3,17 +3,17 @@ # Test connection tracking zone and NAT source port reallocation support. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Don't increase too much, 2000 clients should work # just fine but script can then take several minutes with # KASAN/debug builds. maxclients=100 -have_iperf=1 +have_socat=0 ret=0 +[ "$KSFT_MACHINE_SLOW" = yes ] && maxclients=40 # client1---. # veth1-. # | @@ -31,12 +31,6 @@ ret=0 # NAT Gateway is supposed to do port reallocation for each of the # connections. -sfx=$(mktemp -u "XXXXXXXX") -gw="ns-gw-$sfx" -cl1="ns-cl1-$sfx" -cl2="ns-cl2-$sfx" -srv="ns-srv-$sfx" - v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) @@ -46,61 +40,29 @@ v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) cleanup() { - ip netns del $gw - ip netns del $srv - for i in $(seq 1 $maxclients); do - ip netns del ns-cl$i-$sfx 2>/dev/null - done - - sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null + cleanup_all_ns + + sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" echo "run test without nft tool" +checktool "conntrack -V" "run test without conntrack tool" -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip +if socat -h >/dev/null 2>&1; then + have_socat=1 fi -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -iperf3 -v >/dev/null 2>&1 -if [ $? -ne 0 ];then - have_iperf=0 -fi - -ip netns add "$gw" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $gw" - exit $ksft_skip -fi -ip -net "$gw" link set lo up +setup_ns gw srv trap cleanup EXIT -ip netns add "$srv" -if [ $? -ne 0 ];then - echo "SKIP: Could not create server netns $srv" - exit $ksft_skip -fi - ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" ip -net "$gw" link set veth0 up -ip -net "$srv" link set lo up ip -net "$srv" link set eth0 up sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null @@ -110,55 +72,49 @@ sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" +for i in $(seq 1 "$maxclients");do + setup_ns "cl$i" - ip netns add "$cl" - if [ $? -ne 0 ];then - echo "SKIP: Could not create client netns $cl" - exit $ksft_skip - fi - ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 - if [ $? -ne 0 ];then + cl=$(eval echo \$cl"$i") + if ! ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi done -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - echo netns exec "$cl" ip link set lo up +for i in $(seq 1 "$maxclients");do + cl=$(eval echo \$cl"$i") echo netns exec "$cl" ip link set eth0 up echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 - echo netns exec "$gw" ip link set veth$i up - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + echo netns exec "$gw" ip link set "veth$i" up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".rp_filter=0 # clients have same IP addresses. echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 - echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 # NB: same addresses on client-facing interfaces. - echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i - echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i" + echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad # gw: policy routing - echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i)) echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) + echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i)) done | ip -batch /dev/stdin ip -net "$gw" addr add 10.3.0.1/24 dev veth0 -ip -net "$gw" addr add dead:3::1/64 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad ip -net "$srv" addr add 10.3.0.99/24 dev eth0 -ip -net "$srv" addr add dead:3::99/64 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad -ip netns exec $gw nft -f /dev/stdin<<EOF +ip netns exec "$gw" nft -f /dev/stdin<<EOF table inet raw { map iiftomark { type ifname : mark @@ -203,18 +159,22 @@ table inet raw { } } EOF +if [ "$?" -ne 0 ];then + echo "SKIP: Could not add nftables rules" + exit $ksft_skip +fi ( echo add element inet raw iiftomark \{ for i in $(seq 1 $((maxclients-1))); do - echo \"veth$i\" : $i, + echo \"veth"$i"\" : "$i", done - echo \"veth$maxclients\" : $maxclients \} + echo \"veth"$maxclients"\" : "$maxclients" \} echo add element inet raw iiftozone \{ for i in $(seq 1 $((maxclients-1))); do - echo \"veth$i\" : $i, + echo \"veth"$i"\" : "$i", done echo \"veth$maxclients\" : $maxclients \} -) | ip netns exec $gw nft -f /dev/stdin +) | ip netns exec "$gw" nft -f /dev/stdin ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null @@ -224,73 +184,72 @@ ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null -for i in $(seq 1 $maxclients); do - cl="ns-cl$i-$sfx" - ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & - if [ $? -ne 0 ]; then - echo FAIL: Ping failure from $cl 1>&2 - ret=1 - break - fi +for i in $(seq 1 "$maxclients"); do + cl=$(eval echo \$cl"$i") + ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & done -wait +wait || ret=1 -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" - if [ $? -ne 0 ];then +[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2 + +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then ret=1 echo "FAIL: counter icmp mismatch for veth$i" 1>&2 - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 break fi done -ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then ret=1 - echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" - ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }" + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 fi -if [ $ret -eq 0 ]; then +if [ $ret -eq 0 ]; then echo "PASS: ping test from all $maxclients namespaces" fi -if [ $have_iperf -eq 0 ];then - echo "SKIP: iperf3 not installed" +if [ $have_socat -eq 0 ];then + echo "SKIP: socat not installed" if [ $ret -ne 0 ];then exit $ret fi exit $ksft_skip fi -ip netns exec $srv iperf3 -s > /dev/null 2>&1 & -iperfpid=$! -sleep 1 +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201 +} + +ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null & +socatpid=$! + +busywait 1000 listener_ready "$srv" -for i in $(seq 1 $maxclients); do +for i in $(seq 1 "$maxclients"); do if [ $ret -ne 0 ]; then break fi - cl="ns-cl$i-$sfx" - ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null - if [ $? -ne 0 ]; then - echo FAIL: Failure to connect for $cl 1>&2 - ip netns exec $gw conntrack -S 1>&2 + cl=$(eval echo \$cl"$i") + if ! ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then + echo "FAIL: Failure to connect for $cl" 1>&2 + ip netns exec "$gw" conntrack -S 1>&2 ret=1 fi done if [ $ret -eq 0 ];then - echo "PASS: iperf3 connections for all $maxclients net namespaces" + echo "PASS: socat connections for all $maxclients net namespaces" fi -kill $iperfpid +kill $socatpid wait -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null - if [ $? -ne 0 ];then +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then ret=1 echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 break @@ -300,8 +259,7 @@ if [ $ret -eq 0 ];then echo "PASS: Found client connection for all $maxclients net namespaces" fi -ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then ret=1 echo "FAIL: cannot find return entry on veth0" 1>&2 fi diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh new file mode 100755 index 000000000000..8538f08c64c2 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -0,0 +1,417 @@ +#!/bin/bash +# +# This tests nf_queue: +# 1. can process packets from all hooks +# 2. support running nfqueue from more than one base chain +# +# shellcheck disable=SC2162,SC2317 + +source lib.sh +ret=0 +timeout=2 + +cleanup() +{ + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null + + cleanup_all_ns + + rm -f "$TMPINPUT" + rm -f "$TMPFILE0" + rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" +} + +checktool "nft --version" "test without nft tool" + +trap cleanup EXIT + +setup_ns ns1 ns2 nsrouter + +TMPFILE0=$(mktemp) +TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) + +TMPINPUT=$(mktemp) +dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" + +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad + +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad + +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up + +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 + +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 + +load_ruleset() { + local name=$1 + local prio=$2 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table inet $name { + chain nfq { + ip protocol icmp queue bypass + icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass + } + chain pre { + type filter hook prerouting priority $prio; policy accept; + jump nfq + } + chain input { + type filter hook input priority $prio; policy accept; + jump nfq + } + chain forward { + type filter hook forward priority $prio; policy accept; + tcp dport 12345 queue num 2 + jump nfq + } + chain output { + type filter hook output priority $prio; policy accept; + tcp dport 12345 queue num 3 + tcp sport 23456 queue num 3 + jump nfq + } + chain post { + type filter hook postrouting priority $prio; policy accept; + jump nfq + } +} +EOF +} + +load_counter_ruleset() { + local prio=$1 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table inet countrules { + chain pre { + type filter hook prerouting priority $prio; policy accept; + counter + } + chain input { + type filter hook input priority $prio; policy accept; + counter + } + chain forward { + type filter hook forward priority $prio; policy accept; + counter + } + chain output { + type filter hook output priority $prio; policy accept; + counter + } + chain post { + type filter hook postrouting priority $prio; policy accept; + counter + } +} +EOF +} + +test_ping() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + return 1 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then + return 2 + fi + + return 0 +} + +test_ping_router() { + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then + return 3 + fi + + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then + return 4 + fi + + return 0 +} + +test_queue_blackhole() { + local proto=$1 + +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +table $proto blackh { + chain forward { + type filter hook forward priority 0; policy accept; + queue num 600 + } +} +EOF + if [ "$proto" = "ip" ] ;then + ip netns exec "$ns1" ping -W 2 -c 1 -q 10.0.2.99 > /dev/null + lret=$? + elif [ "$proto" = "ip6" ]; then + ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null + lret=$? + else + lret=111 + fi + + # queue without bypass keyword should drop traffic if no listener exists. + if [ "$lret" -eq 0 ];then + echo "FAIL: $proto expected failure, got $lret" 1>&2 + exit 1 + fi + + if ! ip netns exec "$nsrouter" nft delete table "$proto" blackh; then + echo "FAIL: $proto: Could not delete blackh table" + exit 1 + fi + + echo "PASS: $proto: statement with no listener results in packet drop" +} + +nf_queue_wait() +{ + local procfile="/proc/self/net/netfilter/nfnetlink_queue" + local netns id + + netns="$1" + id="$2" + + # if this file doesn't exist, nfnetlink_module isn't loaded. + # rather than loading it ourselves, wait for kernel module autoload + # completion, nfnetlink should do so automatically because nf_queue + # helper program, spawned in the background, asked for this functionality. + test -f "$procfile" && + ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id " +} + +test_queue() +{ + local expected="$1" + local last="" + + # spawn nf_queue listeners + ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1 + + if ! test_ping;then + echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2 + exit $ret + fi + + if ! test_ping_router;then + echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2 + exit $ret + fi + + wait + ret=$? + + for file in $TMPFILE0 $TMPFILE1; do + last=$(tail -n1 "$file") + if [ x"$last" != x"$expected packets total" ]; then + echo "FAIL: Expected $expected packets total, but got $last" 1>&2 + ip netns exec "$nsrouter" nft list ruleset + exit 1 + fi + done + + echo "PASS: Expected and received $last" +} + +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345 +} + +test_tcp_forward() +{ + ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + local nfqpid=$! + + timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + + ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" +} + +test_tcp_localhost() +{ + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + local nfqpid=$! + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null + + wait "$rpid" && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_connectclose() +{ + ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + + wait && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } + chain post { + type filter hook postrouting priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } +} +EOF + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" & + + # nfqueue 1 will be called via output hook. But this time, + # re-queue the packet to nfqueue program on queue 2. + ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" & + + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" +} + +test_icmp_vrf() { + if ! ip -net "$ns1" link add tvrf type vrf table 9876;then + echo "SKIP: Could not add vrf device" + return + fi + + ip -net "$ns1" li set eth0 master tvrf + ip -net "$ns1" li set tvrf up + + ip -net "$ns1" route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 +ip netns exec "$ns1" nft -f /dev/stdin <<EOF +flush ruleset +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + meta oifname "tvrf" icmp type echo-request counter queue num 1 + meta oifname "eth0" icmp type echo-request counter queue num 1 + } + chain post { + type filter hook postrouting priority 0; policy accept; + meta oifname "tvrf" icmp type echo-request counter queue num 1 + meta oifname "eth0" icmp type echo-request counter queue num 1 + } +} +EOF + ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" & + local nfqpid=$! + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 + + ip netns exec "$ns1" ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null + + for n in output post; do + for d in tvrf eth0; do + if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 + ip netns exec "$ns1" nft list ruleset + ret=1 + return + fi + done + done + + wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" + wait 2>/dev/null +} + +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +load_ruleset "filter" 0 + +if test_ping; then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_queue_blackhole ip +test_queue_blackhole ip6 + +# dummy ruleset to add base chains between the +# queueing rules. We don't want the second reinject +# to re-execute the old hooks. +load_counter_ruleset 10 + +# we are hooking all: prerouting/input/forward/output/postrouting. +# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: +# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). +# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. +# so we expect that userspace program receives 10 packets. +test_queue 10 + +# same. We queue to a second program as well. +load_ruleset "filter2" 20 +test_queue 20 + +test_tcp_forward +test_tcp_localhost +test_tcp_localhost_connectclose +test_tcp_localhost_requeue +test_icmp_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh new file mode 100755 index 000000000000..293f667a6aec --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ret=0 + +checktool "nft --version" "run test without nft tool" +checktool "iperf3 --version" "run test without iperf3" + +setup_ns nsr ns1 ns2 + +modprobe -q nf_conntrack + +cleanup() { + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns +} + +trap cleanup EXIT + +ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2" + +for dev in veth0 veth1; do + ip -net "$nsr" link set "$dev" up +done + +ip -net "$nsr" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr" addr add 10.0.2.1/24 dev veth1 + +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1 +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1 +ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 + +for n in $ns1 $ns2; do + ip -net "$n" link set eth0 up +done +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 + +# test basic connectivity +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then + echo "ERROR: $ns1 cannot reach $ns2" 1>&2 + exit 1 +fi + +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 & +# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & + +sleep 1 + +ip netns exec "$nsr" nft -f - <<EOF +table inet filter { + chain prerouting { + type filter hook prerouting priority -300; policy accept; + meta iif veth0 tcp flags syn counter notrack + } + + chain forward { + type filter hook forward priority 0; policy accept; + + ct state new,established counter accept + + meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp + + ct state invalid counter drop + + # make ns2 unreachable w.o. tcp synproxy + tcp flags syn counter drop + } +} +EOF +if [ $? -ne 0 ]; then + echo "SKIP: Cannot add nft synproxy" + exit $ksft_skip +fi + +if ! ip netns exec "$ns1" timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null; then + echo "FAIL: iperf3 returned an error" 1>&2 + ret=1 + ip netns exec "$nsr" nft list ruleset +else + echo "PASS: synproxy connection successful" +fi + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh index 5a8db0b48928..7db9982ba5a6 100755 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -3,47 +3,34 @@ # Test insertion speed for packets with identical addresses/ports # that are all placed in distinct conntrack zones. -sfx=$(mktemp -u "XXXXXXXX") -ns="ns-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh zones=2000 +[ "$KSFT_MACHINE_SLOW" = yes ] && zones=500 + have_ct_tool=0 ret=0 cleanup() { - ip netns del $ns -} - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi + cleanup_all_ns } checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" checktool "socat -V" "run test without socat tool" -checktool "ip netns add $ns" "create net namespace" + +setup_ns ns1 trap cleanup EXIT -conntrack -V > /dev/null 2>&1 -if [ $? -eq 0 ];then +if conntrack -V > /dev/null 2>&1; then have_ct_tool=1 fi -ip -net "$ns" link set lo up - test_zones() { local max_zones=$1 -ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 -ip netns exec $ns nft -f /dev/stdin<<EOF +ip netns exec "$ns1" nft -f /dev/stdin<<EOF flush ruleset table inet raw { map rndzone { @@ -56,29 +43,39 @@ table inet raw { } } EOF +if [ "$?" -ne 0 ];then + echo "SKIP: Cannot add nftables rules" + exit $ksft_skip +fi + + ip netns exec "$ns1" sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 + ( echo "add element inet raw rndzone {" - for i in $(seq 1 $max_zones);do + for i in $(seq 1 "$max_zones");do echo -n "$i : $i" - if [ $i -lt $max_zones ]; then + if [ "$i" -lt "$max_zones" ]; then echo "," else echo "}" fi done - ) | ip netns exec $ns nft -f /dev/stdin + ) | ip netns exec "$ns1" nft -f /dev/stdin local i=0 local j=0 - local outerstart=$(date +%s%3N) - local stop=$outerstart - - while [ $i -lt $max_zones ]; do - local start=$(date +%s%3N) + local outerstart + local stop + outerstart=$(date +%s%3N) + stop=$outerstart + + while [ "$i" -lt "$max_zones" ]; do + local start + start=$(date +%s%3N) i=$((i + 1000)) j=$((j + 1)) # nft rule in output places each packet in a different zone. - dd if=/dev/zero of=/dev/stdout bs=8k count=1000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + dd if=/dev/zero bs=8k count=1000 2>/dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345 if [ $? -ne 0 ] ;then ret=1 break @@ -89,14 +86,15 @@ EOF echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" done - if [ $have_ct_tool -eq 1 ]; then - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + if [ "$have_ct_tool" -eq 1 ]; then + local count duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -ge "$max_zones" ]; then echo "PASS: inserted $count entries from packet path in $duration ms total" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" ret=1 fi @@ -110,18 +108,19 @@ EOF test_conntrack_tool() { local max_zones=$1 - ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null - local outerstart=$(date +%s%3N) - local start=$(date +%s%3N) - local stop=$start - local i=0 - while [ $i -lt $max_zones ]; do + local outerstart start stop i + outerstart=$(date +%s%3N) + start=$(date +%s%3N) + stop="$start" + i=0 + while [ "$i" -lt "$max_zones" ]; do i=$((i + 1)) - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 if [ $? -ne 0 ];then - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null echo "FAIL: conntrack -I returned an error" ret=1 @@ -137,13 +136,15 @@ test_conntrack_tool() { fi done - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + local count + local duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -eq "$max_zones" ]; then echo "PASS: inserted $count entries via ctnetlink in $duration ms" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" ret=1 fi @@ -151,7 +152,7 @@ test_conntrack_tool() { test_zones $zones -if [ $have_ct_tool -eq 1 ];then +if [ "$have_ct_tool" -eq 1 ];then test_conntrack_tool $zones else echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh new file mode 100755 index 000000000000..ed36d535196d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# for debugging set net.netfilter.nf_log_all_netns=1 in init_net +# or do not use net namespaces. +modprobe -q nf_conntrack +sysctl -q net.netfilter.nf_conntrack_log_invalid=6 + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + +# Enable conntrack +$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt new file mode 100644 index 000000000000..d755bd64c54f --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt @@ -0,0 +1,118 @@ +// check that already-acked (retransmitted) packet is let through rather +// than tagged as INVALID. + +`packetdrill/common.sh` + +// should set -P DROP but it disconnects VM w.o. extra netns ++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 <mss 1000> ++0 > S. 0:0(0) ack 1 <mss 1460> ++.01 < . 1:1(0) ack 1 win 65535 ++0 accept(3, ..., ...) = 4 + ++0.0001 < P. 1:1461(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 1461 win 65535 ++0.0001 < P. 1461:2921(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 2921 win 65535 ++0.0001 < P. 2921:4381(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 4381 win 65535 ++0.0001 < P. 4381:5841(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 5841 win 65535 ++0.0001 < P. 5841:7301(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 7301 win 65535 ++0.0001 < P. 7301:8761(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 8761 win 65535 ++0.0001 < P. 8761:10221(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 10221 win 65535 ++0.0001 < P. 10221:11681(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 11681 win 65535 ++0.0001 < P. 11681:13141(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 13141 win 65535 ++0.0001 < P. 13141:14601(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 14601 win 65535 ++0.0001 < P. 14601:16061(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 16061 win 65535 ++0.0001 < P. 16061:17521(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 17521 win 65535 ++0.0001 < P. 17521:18981(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 18981 win 65535 ++0.0001 < P. 18981:20441(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 20441 win 65535 ++0.0001 < P. 20441:21901(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 21901 win 65535 ++0.0001 < P. 21901:23361(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 23361 win 65535 ++0.0001 < P. 23361:24821(1460) ack 1 win 257 +0.055 > . 1:1(0) ack 24821 win 65535 ++0.0001 < P. 24821:26281(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 26281 win 65535 ++0.0001 < P. 26281:27741(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 27741 win 65535 ++0.0001 < P. 27741:29201(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 29201 win 65535 ++0.0001 < P. 29201:30661(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 30661 win 65535 ++0.0001 < P. 30661:32121(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 32121 win 65535 ++0.0001 < P. 32121:33581(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 33581 win 65535 ++0.0001 < P. 33581:35041(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 35041 win 65535 ++0.0001 < P. 35041:36501(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 36501 win 65535 ++0.0001 < P. 36501:37961(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 37961 win 65535 ++0.0001 < P. 37961:39421(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 39421 win 65535 ++0.0001 < P. 39421:40881(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 40881 win 65535 ++0.0001 < P. 40881:42341(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 42341 win 65535 ++0.0001 < P. 42341:43801(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 43801 win 65535 ++0.0001 < P. 43801:45261(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 45261 win 65535 ++0.0001 < P. 45261:46721(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 46721 win 65535 ++0.0001 < P. 46721:48181(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 48181 win 65535 ++0.0001 < P. 48181:49641(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 49641 win 65535 ++0.0001 < P. 49641:51101(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 51101 win 65535 ++0.0001 < P. 51101:52561(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 52561 win 65535 ++0.0001 < P. 52561:54021(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 54021 win 65535 ++0.0001 < P. 54021:55481(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 55481 win 65535 ++0.0001 < P. 55481:56941(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 56941 win 65535 ++0.0001 < P. 56941:58401(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 58401 win 65535 ++0.0001 < P. 58401:59861(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 59861 win 65535 ++0.0001 < P. 59861:61321(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 61321 win 65535 ++0.0001 < P. 61321:62781(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 62781 win 65535 ++0.0001 < P. 62781:64241(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 64241 win 65535 ++0.0001 < P. 64241:65701(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 65701 win 65535 ++0.0001 < P. 65701:67161(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 67161 win 65535 + +// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0 ++0.0001 < P. 1:1461(1460) ack 1 win 257 + +// only sent if above packet isn't flagged as invalid ++.0 > . 1:1(0) ack 67161 win 65535 + ++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt new file mode 100644 index 000000000000..dccdd4c009c6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt @@ -0,0 +1,62 @@ +// check RST packet that doesn't exactly match expected next sequence +// number still transitions conntrack state to CLOSE iff its already in +// FIN/CLOSE_WAIT. + +`packetdrill/common.sh` + +// 5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334 +// 5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture] +// 5.788126 server_ip > client_ip TLSv1.2 90 Application Data +// 5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872 +// 5.788447 client_ip > server_ip TLSv1.2 90 Application Data +// 5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350 +// 5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0 + ++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + ++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460> + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 + +// Conntrack should move to FIN_WAIT, then CLOSE_WAIT. ++0 < F. 3001:3001(0) ack 1001 win 65535 ++0 > . 1001:1001(0) ack 3002 win 65535 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT` + ++1 close(3) = 0 +// RST: unread data. FIN was seen, hence ack + 1 ++0 > R. 1001:1001(0) ack 3002 win 65535 +// ... and then, CLOSE. ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` + +// Spurious RST from peer -- no sk state. Should NOT get +// marked INVALID, because conntrack is already closing. ++0.1 < R 2001:2001(0) win 0 + +// No packets should have been marked INVALID ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt new file mode 100644 index 000000000000..686f18a3d9ef --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt @@ -0,0 +1,59 @@ +// check that out of window resets are marked as INVALID and conntrack remains +// in ESTABLISHED state. + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + ++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460> + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + +// out of window ++0.0 < R 0:0(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// out of window ++0.0 < R 1000000:1000000(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// in-window but not exact match ++0.0 < R 42:42(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0 < . 3001:3001(0) ack 1001 win 65535 + ++0.0 < R. 3000:3000(0) ack 1001 win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// exact next sequence ++0.0 < R. 3001:3001(0) ack 1001 win 0 +// Conntrack should move to CLOSE + +// Expect four invalid RSTs ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 4 "` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt new file mode 100644 index 000000000000..3442cd29bc93 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt @@ -0,0 +1,44 @@ +// Check connection re-use, i.e. peer that receives the SYN answers with +// a challenge-ACK. +// Check that conntrack lets all packets pass, including the challenge ack, +// and that a new connection is established. + +`packetdrill/common.sh` + +// S > +// . < (challnge-ack) +// R. > +// S > +// S. < +// Expected outcome: established connection. + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// Challenge ACK, old incarnation. +0.1 < . 145824453:145824453(0) ack 643160523 win 240 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> + ++0.01 > R 643160523:643160523(0) win 0 + ++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// Must go through. ++0.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// correct synack ++0.1 < S. 0:0(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> + +// 3whs completes. ++0.01 > . 1:1(0) ack 1 win 256 <nop,nop,TS val 1 ecr 1> + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + +// No packets should have been marked INVALID ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt new file mode 100644 index 000000000000..3047160c4bf3 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt @@ -0,0 +1,51 @@ +// Check conntrack copes with syn/ack reply for a previous, old incarnation. + +// tcpdump with buggy sequence +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0 +// OLD synack, for old/previous S +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0 +// This reset never makes it to the endpoint, elided in the packetdrill script +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0 +// Syn retransmit, no change +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0 +// CORRECT synack, should be accepted, but conntrack classified this as INVALID: +// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 .. +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0 + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// bogus/outdated synack, invalid ack value +0.1 < S. 145824453:145824453(0) ack 643160523 win 240 <mss 1440,nop,nop,TS val 1 ecr 1,nop,wscale 0> + +// syn retransmitted +1.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8> ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// correct synack ++0 < S. 145758918:145758918(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> ++0 write(3, ..., 1) = 1 + +// with buggy conntrack above packet is dropped, so SYN rtx is seen: +// script packet: 1.054007 . 1:1(0) ack 16777958 win 256 <nop,nop,TS val 1033 ecr 1> +// actual packet: 3.010000 S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8> ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + ++0 > P. 1:2(1) ack 4294901762 win 256 <nop,nop,TS val 1067 ecr 1> + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED` + +// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 1 "` + ++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt new file mode 100644 index 000000000000..842242f8ccf7 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt @@ -0,0 +1,34 @@ +// Check reception of another SYN while we have an established conntrack state. +// Challenge ACK is supposed to pass through, RST reply should clear conntrack +// state and SYN retransmit should give us new 'SYN_RECV' connection state. + +`packetdrill/common.sh` + +// should show a match if bug is present: ++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7, TS val 1 ecr 0,nop,nop> ++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,TS val 100 ecr 1,nop,wscale 8> ++.01 < . 1:1(0) ack 1 win 257 <TS val 1 ecr 100,nop,nop> ++0 accept(3, ..., ...) = 4 + ++0 < P. 1:101(100) ack 1 win 257 <TS val 2 ecr 100,nop,nop> ++.001 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 110 ecr 2> ++0 read(4, ..., 101) = 100 + +1.0 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 7, TS val 233 ecr 0,nop,nop> +// Won't expect this: challenge ack. + ++0 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 112 ecr 2> ++0 < R. 101:101(0) ack 1 win 257 ++0 close(4) = 0 + +1.5 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 0, TS val 233 ecr 0,nop,nop> + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh index 5289c8447a41..4485fd7675ed 100755 --- a/tools/testing/selftests/netfilter/rpath.sh +++ b/tools/testing/selftests/net/netfilter/rpath.sh @@ -64,12 +64,18 @@ ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad # firewall matches to test [ -n "$iptables" ] && { common='-t raw -A PREROUTING -s 192.168.0.0/16' - ip netns exec "$ns2" "$iptables" $common -m rpfilter + if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert } [ -n "$ip6tables" ] && { common='-t raw -A PREROUTING -s fec0::/16' - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter + if ! ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert } [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - <<EOF diff --git a/tools/testing/selftests/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c index 21bb1cfd8a85..21bb1cfd8a85 100644 --- a/tools/testing/selftests/netfilter/sctp_collision.c +++ b/tools/testing/selftests/net/netfilter/sctp_collision.c diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings new file mode 100644 index 000000000000..abc5648b59ab --- /dev/null +++ b/tools/testing/selftests/net/netfilter/settings @@ -0,0 +1 @@ +timeout=1800 diff --git a/tools/testing/selftests/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh index 1802653a4728..8d401c69e317 100755 --- a/tools/testing/selftests/netfilter/xt_string.sh +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -5,53 +5,57 @@ ksft_skip=4 rc=0 -if ! iptables --version >/dev/null 2>&1; then - echo "SKIP: Test needs iptables" - exit $ksft_skip -fi -if ! ip -V >/dev/null 2>&1; then - echo "SKIP: Test needs iproute2" - exit $ksft_skip -fi -if ! nc -h >/dev/null 2>&1; then - echo "SKIP: Test needs netcat" - exit $ksft_skip -fi +source lib.sh + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "test needs iptables" + +infile=$(mktemp) + +cleanup() +{ + ip netns del "$netns" + rm -f "$infile" +} + +trap cleanup EXIT + +setup_ns netns + +ip -net "$netns" link add d0 type dummy +ip -net "$netns" link set d0 up +ip -net "$netns" addr add 10.1.2.1/24 dev d0 pattern="foo bar baz" patlen=11 hdrlen=$((20 + 8)) # IPv4 + UDP -ns="ns-$(mktemp -u XXXXXXXX)" -trap 'ip netns del $ns' EXIT -ip netns add "$ns" -ip -net "$ns" link add d0 type dummy -ip -net "$ns" link set d0 up -ip -net "$ns" addr add 10.1.2.1/24 dev d0 - -#ip netns exec "$ns" tcpdump -npXi d0 & + +#ip netns exec "$netns" tcpdump -npXi d0 & #tcpdump_pid=$! -#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT +#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT add_rule() { # (alg, from, to) - ip netns exec "$ns" \ + ip netns exec "$netns" \ iptables -A OUTPUT -o d0 -m string \ - --string "$pattern" --algo $1 --from $2 --to $3 + --string "$pattern" --algo "$1" --from "$2" --to "$3" } showrules() { # () - ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' + ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A' } zerorules() { - ip netns exec "$ns" iptables -Z OUTPUT + ip netns exec "$netns" iptables -Z OUTPUT } countrule() { # (pattern) showrules | grep -c -- "$*" } send() { # (offset) - ( for ((i = 0; i < $1 - $hdrlen; i++)); do - printf " " + ( for ((i = 0; i < $1 - hdrlen; i++)); do + echo -n " " done - printf "$pattern" - ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 + echo -n "$pattern" + ) > "$infile" + + ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile" } add_rule bm 1000 1500 @@ -61,8 +65,8 @@ add_rule kmp 1400 1600 zerorules send 0 -send $((1000 - $patlen)) -if [ $(countrule -c 0 0) -ne 4 ]; then +send $((1000 - patlen)) +if [ "$(countrule -c 0 0)" -ne 4 ]; then echo "FAIL: rules match data before --from" showrules ((rc--)) @@ -70,16 +74,16 @@ fi zerorules send 1000 -send $((1400 - $patlen)) -if [ $(countrule -c 2) -ne 2 ]; then +send $((1400 - patlen)) +if [ "$(countrule -c 2)" -ne 2 ]; then echo "FAIL: only two rules should match at low offset" showrules ((rc--)) fi zerorules -send $((1500 - $patlen)) -if [ $(countrule -c 1) -ne 4 ]; then +send $((1500 - patlen)) +if [ "$(countrule -c 1)" -ne 4 ]; then echo "FAIL: all rules should match at end of packet" showrules ((rc--)) @@ -87,7 +91,7 @@ fi zerorules send 1495 -if [ $(countrule -c 1) -ne 1 ]; then +if [ "$(countrule -c 1)" -ne 1 ]; then echo "FAIL: only kmp with proper --to should match pattern spanning fragments" showrules ((rc--)) @@ -95,23 +99,23 @@ fi zerorules send 1500 -if [ $(countrule -c 1) -ne 2 ]; then +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at start of second fragment" showrules ((rc--)) fi zerorules -send $((1600 - $patlen)) -if [ $(countrule -c 1) -ne 2 ]; then +send $((1600 - patlen)) +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at end of largest --to" showrules ((rc--)) fi zerorules -send $((1600 - $patlen + 1)) -if [ $(countrule -c 1) -ne 0 ]; then +send $((1600 - patlen + 1)) +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rules should match pattern extending largest --to" showrules ((rc--)) @@ -119,10 +123,11 @@ fi zerorules send 1600 -if [ $(countrule -c 1) -ne 0 ]; then +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rule should match pattern past largest --to" showrules ((rc--)) fi +[ $rc -eq 0 ] && echo "PASS: string match tests" exit $rc diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py new file mode 100755 index 000000000000..93d9d914529b --- /dev/null +++ b/tools/testing/selftests/net/nl_netdev.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_eq, ksft_ge, ksft_busy_wait +from lib.py import NetdevFamily, NetdevSimDev, ip + + +def empty_check(nf) -> None: + devs = nf.dev_get({}, dump=True) + ksft_ge(len(devs), 1) + + +def lo_check(nf) -> None: + lo_info = nf.dev_get({"ifindex": 1}) + ksft_eq(len(lo_info['xdp-features']), 0) + ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) + + +def page_pool_check(nf) -> None: + with NetdevSimDev() as nsimdev: + nsim = nsimdev.nsims[0] + + def up(): + ip(f"link set dev {nsim.ifname} up") + + def down(): + ip(f"link set dev {nsim.ifname} down") + + def get_pp(): + pp_list = nf.page_pool_get({}, dump=True) + return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex] + + # No page pools when down + down() + ksft_eq(len(get_pp()), 0) + + # Up, empty page pool appears + up() + pp_list = get_pp() + ksft_ge(len(pp_list), 0) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 0) + + # Down, it disappears, again + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 0) + + # Up, allocate a page + up() + nsim.dfs_write("pp_hold", "y") + pp_list = nf.page_pool_get({}, dump=True) + refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex]) + ksft_ge(refs, 1) + + # Now let's leak a page + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 1) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 1) + attached = [pp for pp in pp_list if "detach-time" not in pp] + ksft_eq(len(attached), 0) + + # New pp can get created, and we'll have two + up() + pp_list = get_pp() + attached = [pp for pp in pp_list if "detach-time" not in pp] + detached = [pp for pp in pp_list if "detach-time" in pp] + ksft_eq(len(attached), 1) + ksft_eq(len(detached), 1) + + # Free the old page and the old pp is gone + nsim.dfs_write("pp_hold", "n") + # Freeing check is once a second so we may need to retry + ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2) + + # And down... + down() + ksft_eq(len(get_pp()), 0) + + # Last, leave the page hanging for destroy, nothing to check + # we're trying to exercise the orphaning path in the kernel + up() + nsim.dfs_write("pp_hold", "y") + + +def main() -> None: + nf = NetdevFamily() + ksft_run([empty_check, lo_check, page_pool_check], + args=(nf, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 5e0e539a323d..1dd057afd3fb 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -489,7 +489,7 @@ class ovsactions(nla): actstr, reason = parse_extract_field( actstr, "drop(", - "([0-9]+)", + r"([0-9]+)", lambda x: int(x, 0), False, None, @@ -502,9 +502,9 @@ class ovsactions(nla): actstr = actstr[len("drop"): ] return (totallen - len(actstr)) - elif parse_starts_block(actstr, "^(\d+)", False, True): + elif parse_starts_block(actstr, r"^(\d+)", False, True): actstr, output = parse_extract_field( - actstr, None, "(\d+)", lambda x: int(x), False, "0" + actstr, None, r"(\d+)", lambda x: int(x), False, "0" ) self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output]) parsed = True @@ -512,7 +512,7 @@ class ovsactions(nla): actstr, recircid = parse_extract_field( actstr, "recirc(", - "([0-9a-fA-Fx]+)", + r"([0-9a-fA-Fx]+)", lambda x: int(x, 0), False, 0, @@ -588,17 +588,17 @@ class ovsactions(nla): actstr = actstr[3:] actstr, ip_block_min = parse_extract_field( - actstr, "=", "([0-9a-fA-F\.]+)", str, False + actstr, "=", r"([0-9a-fA-F\.]+)", str, False ) actstr, ip_block_max = parse_extract_field( - actstr, "-", "([0-9a-fA-F\.]+)", str, False + actstr, "-", r"([0-9a-fA-F\.]+)", str, False ) actstr, proto_min = parse_extract_field( - actstr, ":", "(\d+)", int, False + actstr, ":", r"(\d+)", int, False ) actstr, proto_max = parse_extract_field( - actstr, "-", "(\d+)", int, False + actstr, "-", r"(\d+)", int, False ) if t is not None: diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c index 495990d355ef..43ca92594926 100644 --- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c +++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c @@ -17,7 +17,7 @@ struct { } array SEC(".maps"); /* Sample program which should always load for testing control paths. */ -SEC(".text") int func() +SEC("xdp") int func() { __u64 key64 = 0; __u32 key = 0; diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/net/sample_ret0.bpf.c index fec99750d6ea..1df5ca98bb65 100644 --- a/tools/testing/selftests/bpf/progs/sample_ret0.c +++ b/tools/testing/selftests/net/sample_ret0.bpf.c @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#define SEC(name) __attribute__((section(name), used)) + /* Sample program which should always load for testing control paths. */ +SEC("xdp") int func() { return 0; diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh index 8533393a4f18..02b986c9c247 100755 --- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh +++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh @@ -154,17 +154,9 @@ setup_topo() setup_topo_ns $ns done - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $h1 name eth0 - ip link set dev veth1 netns $sw1 name swp1 - - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $sw1 name veth0 - ip link set dev veth1 netns $sw2 name veth0 - - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $h2 name eth0 - ip link set dev veth1 netns $sw2 name swp1 + ip -n $h1 link add name eth0 type veth peer name swp1 netns $sw1 + ip -n $sw1 link add name veth0 type veth peer name veth0 netns $sw2 + ip -n $h2 link add name eth0 type veth peer name swp1 netns $sw2 } setup_host_common() diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 8802604148dd..11a1ebda564f 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" # set global exit status, but never reset nonzero one. check_err() diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 7080eae5312b..c51ea90a1395 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index e1ff645bd3d1..17404f49cdb6 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" @@ -42,8 +42,8 @@ run_one() { ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact - tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6 direct-action - tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action + tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.bpf.o section schedcls/ingress6/nat_6 direct-action + tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action echo ${rx_args} ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & @@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then exit -1 fi -if [ ! -f nat6to4.o ]; then +if [ ! -f nat6to4.bpf.o ]; then echo "Missing nat6to4 helper. Run 'make' first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 83ed987cff34..550d8eb3e224 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -3,7 +3,7 @@ source net_helper.sh -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly BASE="ns-$(mktemp -u XXXXXX)" readonly SRC=2 readonly DST=1 diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 3a394b43e274..4f1edbafb946 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" readonly BASE=`basename $STATS` readonly SRC=2 diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.bpf.c index d988b2e0cee8..d988b2e0cee8 100644 --- a/tools/testing/selftests/net/xdp_dummy.c +++ b/tools/testing/selftests/net/xdp_dummy.bpf.c diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 457789530645..3eeeeffb4005 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -293,7 +293,7 @@ check_random_order() local ns=$1 local log=$2 - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow @@ -306,7 +306,7 @@ check_random_order() done done - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do local addr=$(printf "e000:0000:%02x00::/56" $j) diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile deleted file mode 100644 index 936c3085bb83..000000000000 --- a/tools/testing/selftests/netfilter/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for netfilter selftests - -TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ - ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \ - conntrack_sctp_collision.sh xt_string.sh \ - bridge_netfilter.sh - -HOSTPKG_CONFIG := pkg-config - -CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) -LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) - -TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision \ - conntrack_dump_flush - -include ../lib.mk diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh deleted file mode 100755 index 29f3955b9af7..000000000000 --- a/tools/testing/selftests/netfilter/bridge_brouter.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# -# This test is for bridge 'brouting', i.e. make some packets being routed -# rather than getting bridged even though they arrive on interface that is -# part of a bridge. - -# eth0 br0 eth0 -# setup is: ns1 <-> ns0 <-> ns2 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ns0 -ip netns add ns1 -ip netns add ns2 - -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi -ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 - -ip -net ns0 link set lo up -ip -net ns0 link set veth0 up -ip -net ns0 link set veth1 up - -ip -net ns0 link add br0 type bridge -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -ip -net ns0 link set veth0 master br0 -ip -net ns0 link set veth1 master br0 -ip -net ns0 link set br0 up -ip -net ns0 addr add 10.0.0.1/24 dev br0 - -# place both in same subnet, ns1 and ns2 connected via ns0:br0 -for i in 1 2; do - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -test_ebtables_broute() -{ - local cipt - - # redirect is needed so the dstmac is rewritten to the bridge itself, - # ip stack won't process OTHERHOST (foreign unicast mac) packets. - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - if [ $? -ne 0 ]; then - echo "SKIP: Could not add ebtables broute redirect rule" - return $ksft_skip - fi - - # ping netns1, expected to not work (ip forwarding is off) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed" 1>&2 - return 1 - fi - - # enable forwarding on both interfaces. - # neither needs an ip address, but at least the bridge needs - # an ip address in same network segment as ns1 and ns2 (ns0 - # needs to be able to determine route for to-be-forwarded packet). - ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 - ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 - - sleep 1 - - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule" - ip netns exec ns0 ebtables -t broute -F - - # ping netns1, expected to work (frames are bridged) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (bridged)" 1>&2 - return 1 - fi - - ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP - - # ping netns1, expected to not work (DROP in bridge forward) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 - return 1 - fi - - # re-activate brouter - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - - ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" - return 0 -} - -# test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns2 from ns1" 1>&2 - ret=1 -fi - -ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns1 from ns2" 1>&2 - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 and ns2 can reach each other" -fi - -test_ebtables_broute -ret=$? -for i in 0 1 2; do ip netns del ns$i;done - -exit $ret diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh deleted file mode 100644 index 659b3ab02c8b..000000000000 --- a/tools/testing/selftests/netfilter/bridge_netfilter.sh +++ /dev/null @@ -1,188 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bridge netfilter + conntrack, a combination that doesn't really work, -# with multicast/broadcast packets racing for hash table insertion. - -# eth0 br0 eth0 -# setup is: ns1 <->,ns0 <-> ns3 -# ns2 <-' `'-> ns4 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" -ns3="ns3-$sfx" -ns4="ns4-$sfx" - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -for i in $(seq 0 4); do - eval ip netns add \$ns$i -done - -cleanup() { - for i in $(seq 0 4); do eval ip netns del \$ns$i;done -} - -trap cleanup EXIT - -do_ping() -{ - fromns="$1" - dstip="$2" - - ip netns exec $fromns ping -c 1 -q $dstip > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - ret=1 - fi -} - -bcast_ping() -{ - fromns="$1" - dstip="$2" - - for i in $(seq 1 1000); do - ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "ERROR: ping -b from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - fi - done -} - -ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi - -ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 -ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 -ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 - -ip -net ${ns0} link set lo up - -for i in $(seq 1 4); do - ip -net ${ns0} link set veth$i up -done - -ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -# make veth0,1,2 part of bridge. -for i in $(seq 1 3); do - ip -net ${ns0} link set veth$i master br0 -done - -# add a macvlan on top of the bridge. -MACVLAN_ADDR=ba:f3:13:37:42:23 -ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private -ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan0 up -ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 - -# add a macvlan on top of veth4. -MACVLAN_ADDR=ba:f3:13:37:42:24 -ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa -ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan4 up - -# make the macvlan part of the bridge. -# veth4 is not a bridge port, only the macvlan on top of it. -ip -net ${ns0} link set macvlan4 master br0 - -ip -net ${ns0} link set br0 up -ip -net ${ns0} addr add 10.0.0.1/24 dev br0 -ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 -ret=$? -if [ $ret -ne 0 ] ; then - echo "SKIP: bridge netfilter not available" - ret=$ksft_skip -fi - -# for testing, so namespaces will reply to ping -b probes. -ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 - -# enable conntrack in ns0 and drop broadcast packets in forward to -# avoid them from getting confirmed in the postrouting hook before -# the cloned skb is passed up the stack. -ip netns exec ${ns0} nft -f - <<EOF -table ip filter { - chain input { - type filter hook input priority 1; policy accept - iifname br0 counter - ct state new accept - } -} - -table bridge filter { - chain forward { - type filter hook forward priority 0; policy accept - meta pkttype broadcast ip protocol icmp counter drop - } -} -EOF - -# place 1, 2 & 3 in same subnet, connected via ns0:br0. -# ns4 is placed in same subnet as well, but its not -# part of the bridge: the corresponding veth4 is not -# part of the bridge, only its macvlan interface. -for i in $(seq 1 4); do - eval ip -net \$ns$i link set lo up - eval ip -net \$ns$i link set eth0 up -done -for i in $(seq 1 2); do - eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -ip -net ${ns3} addr add 10.23.0.13/24 dev eth0 -ip -net ${ns4} addr add 10.23.0.14/24 dev eth0 - -# test basic connectivity -do_ping ${ns1} 10.0.0.12 -do_ping ${ns3} 10.23.0.1 -do_ping ${ns4} 10.23.0.1 - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0" -fi - -bcast_ping ${ns1} 10.0.0.255 - -# This should deliver broadcast to macvlan0, which is on top of ns0:br0. -bcast_ping ${ns3} 10.23.0.255 - -# same, this time via veth4:macvlan4. -bcast_ping ${ns4} 10.23.0.255 - -read t < /proc/sys/kernel/tainted - -if [ $t -eq 0 ];then - echo PASS: kernel not tainted -else - echo ERROR: kernel is tainted - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config deleted file mode 100644 index 7c42b1b2c69b..000000000000 --- a/tools/testing/selftests/netfilter/config +++ /dev/null @@ -1,9 +0,0 @@ -CONFIG_NET_NS=y -CONFIG_NF_TABLES_INET=y -CONFIG_NFT_QUEUE=m -CONFIG_NFT_NAT=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NF_CT_NETLINK=m -CONFIG_AUDIT=y diff --git a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh deleted file mode 100755 index a924e595cfd8..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Testing For SCTP COLLISION SCENARIO as Below: -# -# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] -# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] -# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] -# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] -# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] -# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] -# -# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS - -CLIENT_NS=$(mktemp -u client-XXXXXXXX) -CLIENT_IP="198.51.200.1" -CLIENT_PORT=1234 - -SERVER_NS=$(mktemp -u server-XXXXXXXX) -SERVER_IP="198.51.100.1" -SERVER_PORT=1234 - -ROUTER_NS=$(mktemp -u router-XXXXXXXX) -CLIENT_GW="198.51.200.2" -SERVER_GW="198.51.100.2" - -# setup the topo -setup() { - ip net add $CLIENT_NS - ip net add $SERVER_NS - ip net add $ROUTER_NS - ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS - ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS - - ip -n $SERVER_NS link set link0 up - ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 - ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW - - ip -n $ROUTER_NS link set link1 up - ip -n $ROUTER_NS link set link2 up - ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 - ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 - ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 - - ip -n $CLIENT_NS link set link3 up - ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 - ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW - - # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with - # tc on $SERVER_NS side - tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb - tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ - 0xff match u8 2 0xff at 32 flowid 1:1 - tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms - - # simulate the ctstate check on OVS nf_conntrack - ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP - - # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe sctp - ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 -} - -cleanup() { - ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null - ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null - ip net del "$CLIENT_NS" - ip net del "$SERVER_NS" - ip net del "$ROUTER_NS" -} - -do_test() { - ip net exec $SERVER_NS ./sctp_collision server \ - $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & - ip net exec $CLIENT_NS ./sctp_collision client \ - $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT -} - -# NOTE: one way to work around the issue is set a smaller hb_interval -# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 - -# run the test case -trap cleanup EXIT -setup && \ -echo "Test for SCTP Collision in nf_conntrack:" && \ -do_test && echo "PASS!" -exit $? diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh deleted file mode 100755 index e7d7bf13cff5..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that UNREPLIED tcp conntrack will eventually timeout. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -waittime=20 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - - ip netns del $ns1 - ip netns del $ns2 -} - -ipv4() { - echo -n 192.168.$1.2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then - echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -# Create test namespaces -ip netns add $ns1 || exit 1 - -trap cleanup EXIT - -ip netns add $ns2 || exit 1 - -# Connect the namespace to the host using a veth pair -ip -net $ns1 link add name veth1 type veth peer name veth2 -ip -net $ns1 link set netns $ns2 dev veth2 - -ip -net $ns1 link set up dev lo -ip -net $ns2 link set up dev lo -ip -net $ns1 link set up dev veth1 -ip -net $ns2 link set up dev veth2 - -ip -net $ns2 addr add 10.11.11.2/24 dev veth2 -ip -net $ns2 route add default via 10.11.11.1 - -ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 - -# add a rule inside NS so we enable conntrack -ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT - -ip -net $ns1 addr add 10.11.11.1/24 dev veth1 -ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 - -# Check connectivity works -ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 - -ip netns exec $ns2 nc -l -p 8080 < /dev/null & - -# however, conntrack entries are there - -ip netns exec $ns2 nft -f - <<EOF -table inet filter { - counter connreq { } - counter redir { } - chain input { - type filter hook input priority 0; policy accept; - ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept - ct state new ct status dnat tcp dport 8080 counter name "redir" accept - } -} -EOF -if [ $? -ne 0 ]; then - echo "ERROR: Could not load nft rules" - exit 1 -fi - -ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10 - -echo "INFO: connect $ns1 -> $ns2 to the virtual ip" -ip netns exec $ns1 bash -c 'while true ; do - nc -p 60000 10.99.99.99 80 - sleep 1 - done' & - -sleep 1 - -ip netns exec $ns2 nft -f - <<EOF -table inet nat { - chain prerouting { - type nat hook prerouting priority 0; policy accept; - ip daddr 10.99.99.99 tcp dport 80 redirect to :8080 - } -} -EOF -if [ $? -ne 0 ]; then - echo "ERROR: Could not load nat redirect" - exit 1 -fi - -count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) -if [ $count -eq 0 ]; then - echo "ERROR: $ns2 did not pick up tcp connection from peer" - exit 1 -fi - -echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" -for i in $(seq 1 $waittime); do - echo -n "." - - sleep 1 - - count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) - if [ $count -gt 0 ]; then - echo - echo "PASS: redirection took effect after $i seconds" - break - fi - - m=$((i%20)) - if [ $m -eq 0 ]; then - echo " waited for $i seconds" - fi -done - -expect="packets 1 bytes 60" -check_counter "$ns2" "redir" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: redirection counter has expected values" -else - echo "ERROR: no tcp connection was redirected" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh deleted file mode 100755 index c3b8f90c497e..000000000000 --- a/tools/testing/selftests/netfilter/ipvs.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# End-to-end ipvs test suite -# Topology: -#--------------------------------------------------------------+ -# | | -# ns0 | ns1 | -# ----------- | ----------- ----------- | -# | veth01 | --------- | veth10 | | veth12 | | -# ----------- peer ----------- ----------- | -# | | | | -# ----------- | | | -# | br0 | |----------------- peer |--------------| -# ----------- | | | -# | | | | -# ---------- peer ---------- ----------- | -# | veth02 | --------- | veth20 | | veth21 | | -# ---------- | ---------- ----------- | -# | ns2 | -# | | -#--------------------------------------------------------------+ -# -# We assume that all network driver are loaded -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -readonly port=8080 - -readonly vip_v4=207.175.44.110 -readonly cip_v4=10.0.0.2 -readonly gip_v4=10.0.0.1 -readonly dip_v4=172.16.0.1 -readonly rip_v4=172.16.0.2 -readonly sip_v4=10.0.0.3 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" -readonly datalen=32 - -sysipvsnet="/proc/sys/net/ipv4/vs/" -if [ ! -d $sysipvsnet ]; then - modprobe -q ip_vs - if [ $? -ne 0 ]; then - echo "skip: could not run test without ipvs module" - exit $ksft_skip - fi -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ipvsadm -v > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ipvsadm" - exit $ksft_skip -fi - -setup() { - ip netns add ns0 - ip netns add ns1 - ip netns add ns2 - - ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 - ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 - ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 - - ip netns exec ns0 ip link set veth01 up - ip netns exec ns0 ip link set veth02 up - ip netns exec ns0 ip link add br0 type bridge - ip netns exec ns0 ip link set veth01 master br0 - ip netns exec ns0 ip link set veth02 master br0 - ip netns exec ns0 ip link set br0 up - ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 - - ip netns exec ns1 ip link set lo up - ip netns exec ns1 ip link set veth10 up - ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 - ip netns exec ns1 ip link set veth12 up - ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 - - ip netns exec ns2 ip link set lo up - ip netns exec ns2 ip link set veth21 up - ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 - ip netns exec ns2 ip link set veth20 up - ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - for i in 0 1 2 - do - ip netns del ns$i > /dev/null 2>&1 - done - - if [ -f "${outfile}" ]; then - rm "${outfile}" - fi - if [ -f "${infile}" ]; then - rm "${infile}" - fi -} - -server_listen() { - ip netns exec ns2 nc -l -p 8080 > "${outfile}" & - server_pid=$! - sleep 0.2 -} - -client_connect() { - ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" -} - -verify_data() { - wait "${server_pid}" - cmp "$infile" "$outfile" 2>/dev/null -} - -test_service() { - server_listen - client_connect - verify_data -} - - -test_dr() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - # avoid incorrect arp response - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - # avoid reverse route lookup - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -test_nat() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 ip link del veth20 - ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 - - test_service -} - -test_tun() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 modprobe ipip - ip netns exec ns1 ip link set tunl0 up - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 modprobe ipip - ip netns exec ns2 ip link set tunl0 up - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -run_tests() { - local errors= - - echo "Testing DR mode..." - cleanup - setup - test_dr - errors=$(( $errors + $? )) - - echo "Testing NAT mode..." - cleanup - setup - test_nat - errors=$(( $errors + $? )) - - echo "Testing Tunnel mode..." - cleanup - setup - test_tun - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup EXIT - -run_tests - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh deleted file mode 100755 index a1aa8f4a5828..000000000000 --- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test NAT source port clash resolution -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -socatpid=0 - -cleanup() -{ - [ $socatpid -gt 0 ] && kill $socatpid - ip netns del $ns1 - ip netns del $ns2 -} - -socat -h > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without socat" - exit $ksft_skip -fi - -iptables --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without iptables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add $ns2 - -# Connect the namespaces using a veth pair -ip link add name veth2 type veth peer name veth1 -ip link set netns $ns1 dev veth1 -ip link set netns $ns2 dev veth2 - -ip netns exec $ns1 ip link set up dev lo -ip netns exec $ns1 ip link set up dev veth1 -ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 - -ip netns exec $ns2 ip link set up dev lo -ip netns exec $ns2 ip link set up dev veth2 -ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 - -# Create a server in one namespace -ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & -socatpid=$! - -# Restrict source port to just one so we don't have to exhaust -# all others. -ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" - -# add a virtual IP using DNAT -ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 - -# ... and route it to the other namespace -ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 - -sleep 1 - -# add a persistent connection from the other namespace -ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & - -sleep 1 - -# ip daddr:dport will be rewritten to 192.168.1.1 5201 -# NAT must reallocate source port 10000 because -# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use -echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null -ret=$? - -# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). -if [ $ret -eq 0 ]; then - echo "PASS: socat can connect via NAT'd address" -else - echo "FAIL: socat cannot connect via NAT'd address" -fi - -# check sport clashres. -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 - -sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & -cpid1=$! -sleep 1 - -# if connect succeeds, client closes instantly due to EOF on stdin. -# if connect hangs, it will time out after 5s. -echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & -cpid2=$! - -time_then=$(date +%s) -wait $cpid2 -rv=$? -time_now=$(date +%s) - -# Check how much time has elapsed, expectation is for -# 'cpid2' to connect and then exit (and no connect delay). -delta=$((time_now - time_then)) - -if [ $delta -lt 2 -a $rv -eq 0 ]; then - echo "PASS: could connect to service via redirected ports" -else - echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh deleted file mode 100755 index faa7778d7bd1..000000000000 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# This tests connection tracking helper assignment: -# 1. can attach ftp helper to a connection from nft ruleset. -# 2. auto-assign still works. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -testipv6=1 - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -which nc >/dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without netcat tool" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set veth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set veth0 up - -ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 -ip -net ${ns1} addr add dead:1::1/64 dev veth0 - -ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 -ip -net ${ns2} addr add dead:1::2/64 dev veth0 - -load_ruleset_family() { - local family=$1 - local ns=$2 - -ip netns exec ${ns} nft -f - <<EOF -table $family raw { - ct helper ftp { - type "ftp" protocol tcp - } - chain pre { - type filter hook prerouting priority 0; policy accept; - tcp dport 2121 ct helper set "ftp" - } - chain output { - type filter hook output priority 0; policy accept; - tcp dport 2121 ct helper set "ftp" - } -} -EOF - return $? -} - -check_for_helper() -{ - local netns=$1 - local message=$2 - local port=$3 - - if echo $message |grep -q 'ipv6';then - local family="ipv6" - else - local family="ipv4" - fi - - ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp' - if [ $? -ne 0 ] ; then - if [ $autoassign -eq 0 ] ;then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 - else - echo "PASS: ${netns} did not show attached helper $message" 1>&2 - fi - else - if [ $autoassign -eq 0 ] ;then - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 - else - echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 - ret=1 - fi - fi - - return 0 -} - -test_helper() -{ - local port=$1 - local autoassign=$2 - - if [ $autoassign -eq 0 ] ;then - msg="set via ruleset" - else - msg="auto-assign" - fi - - sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ip $msg" $port $autoassign - check_for_helper "$ns2" "ip $msg" $port $autoassign - - wait - - if [ $testipv6 -eq 0 ] ;then - return 0 - fi - - ip netns exec ${ns1} conntrack -F 2> /dev/null - ip netns exec ${ns2} conntrack -F 2> /dev/null - - sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ipv6 $msg" $port - check_for_helper "$ns2" "ipv6 $msg" $port - - wait -} - -load_ruleset_family ip ${ns1} -if [ $? -ne 0 ];then - echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 - exit 1 -fi - -load_ruleset_family ip6 ${ns1} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 - testipv6=0 -fi - -load_ruleset_family inet ${ns2} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 - load_ruleset_family ip ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 - exit 1 - fi - - if [ $testipv6 -eq 1 ] ;then - load_ruleset_family ip6 ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 - exit 1 - fi - fi -fi - -test_helper 2121 0 -ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 1 - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh deleted file mode 100755 index dff476e45e77..000000000000 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ /dev/null @@ -1,273 +0,0 @@ -#!/bin/bash -# -# This tests the fib expression. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -trap cleanup EXIT - -dmesg | grep -q ' nft_rpfilter: ' -if [ $? -eq 0 ]; then - dmesg -c | grep ' nft_rpfilter: ' - echo "WARN: a previous test run has failed" 1>&2 -fi - -sysctl -q net.netfilter.nf_log_all_netns=1 -ip netns add ${ns1} -ip netns add ${ns2} - -load_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority 0; policy accept; - fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop - } -} -EOF -} - -load_pbr_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain forward { - type filter hook forward priority raw; - fib saddr . iif oif gt 0 accept - log drop - } -} -EOF -} - -load_ruleset_count() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority 0; policy accept; - ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop - ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop - } -} -EOF -} - -check_drops() { - dmesg | grep -q ' nft_rpfilter: ' - if [ $? -eq 0 ]; then - dmesg | grep ' nft_rpfilter: ' - echo "FAIL: rpfilter did drop packets" - return 1 - fi - - return 0 -} - -check_fib_counter() { - local want=$1 - local ns=$2 - local address=$3 - - line=$(ip netns exec ${ns} nft list table inet filter | grep 'fib saddr . iif' | grep $address | grep "packets $want" ) - ret=$? - - if [ $ret -ne 0 ];then - echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2 - ip netns exec ${ns} nft list table inet filter - return 1 - fi - - if [ $want -gt 0 ]; then - echo "PASS: fib expression did drop packets for $address" - fi - - return 0 -} - -load_ruleset ${nsrouter} -load_ruleset ${ns1} -load_ruleset ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -test_ping() { - local daddr4=$1 - local daddr6=$2 - - ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 - return 1 - fi - - ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 - return 1 - fi - - return 0 -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null - -sleep 3 - -test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 - -test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 - -echo "PASS: fib expression did not cause unwanted packet drops" - -ip netns exec ${nsrouter} nft flush table inet filter - -ip -net ${ns1} route del default -ip -net ${ns1} -6 route del default - -ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr del dead:1::99/64 dev eth0 - -ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr add dead:2::99/64 dev eth0 - -ip -net ${ns1} route add default via 10.0.2.1 -ip -net ${ns1} -6 route add default via dead:2::1 - -ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 - -# switch to ruleset that doesn't log, this time -# its expected that this does drop the packets. -load_ruleset_count ${nsrouter} - -# ns1 has a default route, but nsrouter does not. -# must not check return value, ping to 1.1.1.1 will -# fail. -check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 -check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 - -ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null -check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 - -sleep 2 -ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null -check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 - -# delete all rules -ip netns exec ${ns1} nft flush ruleset -ip netns exec ${ns2} nft flush ruleset -ip netns exec ${nsrouter} nft flush ruleset - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 - -ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr del dead:2::99/64 dev eth0 - -ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 - -# ... pbr ruleset for the router, check iif+oif. -load_pbr_ruleset ${nsrouter} -if [ $? -ne 0 ] ; then - echo "SKIP: Could not load fib forward ruleset" - exit $ksft_skip -fi - -ip -net ${nsrouter} rule add from all table 128 -ip -net ${nsrouter} rule add from all iif veth0 table 129 -ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 -ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 - -# drop main ipv4 table -ip -net ${nsrouter} -4 rule delete table main - -test_ping 10.0.2.99 dead:2::99 -if [ $? -ne 0 ] ; then - ip -net ${nsrouter} nft list ruleset - echo "FAIL: fib mismatch in pbr setup" - exit 1 -fi - -echo "PASS: fib expression forward check with policy based routing" -exit 0 diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh deleted file mode 100755 index e12729753351..000000000000 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ /dev/null @@ -1,449 +0,0 @@ -#!/bin/bash -# -# This tests nf_queue: -# 1. can process packets from all hooks -# 2. support running nfqueue from more than one base chain -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -cleanup() -{ - ip netns pids ${ns1} | xargs kill 2>/dev/null - ip netns pids ${ns2} | xargs kill 2>/dev/null - ip netns pids ${nsrouter} | xargs kill 2>/dev/null - - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - rm -f "$TMPFILE0" - rm -f "$TMPFILE1" - rm -f "$TMPFILE2" "$TMPFILE3" -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -TMPFILE0=$(mktemp) -TMPFILE1=$(mktemp) -TMPFILE2=$(mktemp) -TMPFILE3=$(mktemp) -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -load_ruleset() { - local name=$1 - local prio=$2 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table inet $name { - chain nfq { - ip protocol icmp queue bypass - icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass - } - chain pre { - type filter hook prerouting priority $prio; policy accept; - jump nfq - } - chain input { - type filter hook input priority $prio; policy accept; - jump nfq - } - chain forward { - type filter hook forward priority $prio; policy accept; - tcp dport 12345 queue num 2 - jump nfq - } - chain output { - type filter hook output priority $prio; policy accept; - tcp dport 12345 queue num 3 - tcp sport 23456 queue num 3 - jump nfq - } - chain post { - type filter hook postrouting priority $prio; policy accept; - jump nfq - } -} -EOF -} - -load_counter_ruleset() { - local prio=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table inet countrules { - chain pre { - type filter hook prerouting priority $prio; policy accept; - counter - } - chain input { - type filter hook input priority $prio; policy accept; - counter - } - chain forward { - type filter hook forward priority $prio; policy accept; - counter - } - chain output { - type filter hook output priority $prio; policy accept; - counter - } - chain post { - type filter hook postrouting priority $prio; policy accept; - counter - } -} -EOF -} - -test_ping() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_ping_router() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_queue_blackhole() { - local proto=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -table $proto blackh { - chain forward { - type filter hook forward priority 0; policy accept; - queue num 600 - } -} -EOF - if [ $proto = "ip" ] ;then - ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null - lret=$? - elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null - lret=$? - else - lret=111 - fi - - # queue without bypass keyword should drop traffic if no listener exists. - if [ $lret -eq 0 ];then - echo "FAIL: $proto expected failure, got $lret" 1>&2 - exit 1 - fi - - ip netns exec ${nsrouter} nft delete table $proto blackh - if [ $? -ne 0 ] ;then - echo "FAIL: $proto: Could not delete blackh table" - exit 1 - fi - - echo "PASS: $proto: statement with no listener results in packet drop" -} - -test_queue() -{ - local expected=$1 - local last="" - - # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & - sleep 1 - test_ping - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - test_ping_router - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - wait - ret=$? - - for file in $TMPFILE0 $TMPFILE1; do - last=$(tail -n1 "$file") - if [ x"$last" != x"$expected packets total" ]; then - echo "FAIL: Expected $expected packets total, but got $last" 1>&2 - cat "$file" 1>&2 - - ip netns exec ${nsrouter} nft list ruleset - exit 1 - fi - done - - echo "PASS: Expected and received $last" -} - -test_tcp_forward() -{ - ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & - local nfqpid=$! - - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - sleep 1 - ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & - - rm -f "$tmpfile" - - wait $rpid - wait $lpid - [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" -} - -test_tcp_localhost() -{ - tmpfile=$(mktemp) || exit 1 - - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback" - wait 2>/dev/null -} - -test_tcp_localhost_connectclose() -{ - tmpfile=$(mktemp) || exit 1 - - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null -} - -test_tcp_localhost_requeue() -{ -ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF -flush ruleset -table inet filter { - chain output { - type filter hook output priority 0; policy accept; - tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 - } - chain post { - type filter hook postrouting priority 0; policy accept; - tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 - } -} -EOF - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & - - # nfqueue 1 will be called via output hook. But this time, - # re-queue the packet to nfqueue program on queue 2. - ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait - - if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then - echo "FAIL: lost packets during requeue?!" 1>&2 - return - fi - - echo "PASS: tcp via loopback and re-queueing" -} - -test_icmp_vrf() { - ip -net $ns1 link add tvrf type vrf table 9876 - if [ $? -ne 0 ];then - echo "SKIP: Could not add vrf device" - return - fi - - ip -net $ns1 li set eth0 master tvrf - ip -net $ns1 li set tvrf up - - ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 -ip netns exec ${ns1} nft -f /dev/stdin <<EOF -flush ruleset -table inet filter { - chain output { - type filter hook output priority 0; policy accept; - meta oifname "tvrf" icmp type echo-request counter queue num 1 - meta oifname "eth0" icmp type echo-request counter queue num 1 - } - chain post { - type filter hook postrouting priority 0; policy accept; - meta oifname "tvrf" icmp type echo-request counter queue num 1 - meta oifname "eth0" icmp type echo-request counter queue num 1 - } -} -EOF - ip netns exec ${ns1} ./nf-queue -q 1 -t $timeout & - local nfqpid=$! - - sleep 1 - ip netns exec ${ns1} ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null - - for n in output post; do - for d in tvrf eth0; do - ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" - if [ $? -ne 0 ] ; then - echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 - ip netns exec ${ns1} nft list ruleset - ret=1 - return - fi - done - done - - wait $nfqpid - [ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - -load_ruleset "filter" 0 - -sleep 3 - -test_ping -ret=$? -if [ $ret -eq 0 ];then - # queue bypass works (rules were skipped, no listener) - echo "PASS: ${ns1} can reach ${ns2}" -else - echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 - exit $ret -fi - -test_queue_blackhole ip -test_queue_blackhole ip6 - -# dummy ruleset to add base chains between the -# queueing rules. We don't want the second reinject -# to re-execute the old hooks. -load_counter_ruleset 10 - -# we are hooking all: prerouting/input/forward/output/postrouting. -# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: -# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). -# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. -# so we expect that userspace program receives 10 packets. -test_queue 10 - -# same. We queue to a second program as well. -load_ruleset "filter2" 20 -test_queue 20 - -test_tcp_forward -test_tcp_localhost -test_tcp_localhost_connectclose -test_tcp_localhost_requeue -test_icmp_vrf - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_synproxy.sh b/tools/testing/selftests/netfilter/nft_synproxy.sh deleted file mode 100755 index b62933b680d6..000000000000 --- a/tools/testing/selftests/netfilter/nft_synproxy.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -rnd=$(mktemp -u XXXXXXXX) -nsr="nsr-$rnd" # synproxy machine -ns1="ns1-$rnd" # iperf client -ns2="ns2-$rnd" # iperf server - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "iperf3 --version" "run test without iperf3" -checktool "ip netns add $nsr" "create net namespace" - -modprobe -q nf_conntrack - -ip netns add $ns1 -ip netns add $ns2 - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 - - ip netns del $nsr -} - -trap cleanup EXIT - -ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do -ip -net $nsr link set $dev up -done - -ip -net $nsr addr add 10.0.1.1/24 dev veth0 -ip -net $nsr addr add 10.0.2.1/24 dev veth1 - -ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 -ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 - -for n in $ns1 $ns2; do - ip -net $n link set lo up - ip -net $n link set eth0 up -done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 - -# test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then - echo "ERROR: $ns1 cannot reach $ns2" 1>&2 - exit 1 -fi - -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & -# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & - -sleep 1 - -ip netns exec $nsr nft -f - <<EOF -table inet filter { - chain prerouting { - type filter hook prerouting priority -300; policy accept; - meta iif veth0 tcp flags syn counter notrack - } - - chain forward { - type filter hook forward priority 0; policy accept; - - ct state new,established counter accept - - meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp - - ct state invalid counter drop - - # make ns2 unreachable w.o. tcp synproxy - tcp flags syn counter drop - } -} -EOF -if [ $? -ne 0 ]; then - echo "SKIP: Cannot add nft synproxy" - exit $ksft_skip -fi - -ip netns exec $ns1 timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null - -if [ $? -ne 0 ]; then - echo "FAIL: iperf3 returned an error" 1>&2 - ret=$? - ip netns exec $nsr nft list ruleset -else - echo "PASS: synproxy connection successful" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh deleted file mode 100755 index 2ffba45a78bf..000000000000 --- a/tools/testing/selftests/netfilter/nft_trans_stress.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -# This test is for stress-testing the nf_tables config plane path vs. -# packet path processing: Make sure we never release rules that are -# still visible to other cpus. -# -# set -e - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -testns=testns-$(mktemp -u "XXXXXXXX") -tmp="" - -tables="foo bar baz quux" -global_ret=0 -eret=0 -lret=0 - -cleanup() { - ip netns pids "$testns" | xargs kill 2>/dev/null - ip netns del "$testns" - - rm -f "$tmp" -} - -check_result() -{ - local r=$1 - local OK="PASS" - - if [ $r -ne 0 ] ;then - OK="FAIL" - global_ret=$r - fi - - echo "$OK: nft $2 test returned $r" - - eret=0 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -trap cleanup EXIT -tmp=$(mktemp) - -for table in $tables; do - echo add table inet "$table" >> "$tmp" - echo flush table inet "$table" >> "$tmp" - - echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp" - echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp" - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - echo "add chain inet $table $chain" >> "$tmp" - done - - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - for BASE in INPUT OUTPUT; do - echo "add rule inet $table $BASE counter jump $chain" >> "$tmp" - done - echo "add rule inet $table $chain counter return" >> "$tmp" - done -done - -ip netns add "$testns" -ip -netns "$testns" link set lo up - -lscpu | grep ^CPU\(s\): | ( read cpu cpunum ; -cpunum=$((cpunum-1)) -for i in $(seq 0 $cpunum);do - mask=$(printf 0x%x $((1<<$i))) - ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null & - ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null & -done) - -sleep 1 - -ip netns exec "$testns" nft -f "$tmp" -for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done - -for table in $tables;do - randsleep=$((RANDOM%2)) - sleep $randsleep - ip netns exec "$testns" nft delete table inet $table - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "reload" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp" - echo "insert rule inet foo INPUT meta nftrace set 1" - echo "insert rule inet foo OUTPUT meta nftrace set 1" - ) | ip netns exec "$testns" nft -f /dev/stdin - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi - - (echo "flush ruleset"; cat "$tmp" - ) | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete with nftrace enabled" - -echo "insert rule inet foo INPUT meta nftrace set 1" >> $tmp -echo "insert rule inet foo OUTPUT meta nftrace set 1" >> $tmp - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=1 - fi -done - -check_result $lret "add/delete with nftrace enabled" - -exit $global_ret diff --git a/tools/testing/selftests/netfilter/settings b/tools/testing/selftests/netfilter/settings deleted file mode 100644 index 6091b45d226b..000000000000 --- a/tools/testing/selftests/netfilter/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=120 diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6ba4f8275ac4..94bb6e11c16f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -27,6 +27,7 @@ #include <sys/syscall.h> #include <sys/sysmacros.h> #include <sys/time.h> +#include <sys/utsname.h> #include <sys/wait.h> #include <dirent.h> #include <errno.h> @@ -600,6 +601,25 @@ int expect_strne(const char *expr, int llen, const char *cmp) return ret; } +#define EXPECT_STRBUFEQ(cond, expr, buf, val, cmp) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_str_buf_eq(expr, buf, val, llen, cmp); } while (0) + +static __attribute__((unused)) +int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const char *cmp) +{ + llen += printf(" = %lu <%s> ", expr, buf); + if (strcmp(buf, cmp) != 0) { + result(llen, FAIL); + return 1; + } + if (expr != val) { + result(llen, FAIL); + return 1; + } + + result(llen, OK); + return 0; +} /* declare tests based on line numbers. There must be exactly one test per line. */ #define CASE_TEST(name) \ @@ -761,6 +781,45 @@ int test_stat_timestamps(void) return 0; } +int test_uname(void) +{ + struct utsname buf; + char osrelease[sizeof(buf.release)]; + ssize_t r; + int fd; + + memset(&buf.domainname, 'P', sizeof(buf.domainname)); + + if (uname(&buf)) + return 1; + + if (strncmp("Linux", buf.sysname, sizeof(buf.sysname))) + return 1; + + fd = open("/proc/sys/kernel/osrelease", O_RDONLY); + if (fd == -1) + return 1; + + r = read(fd, osrelease, sizeof(osrelease)); + if (r == -1) + return 1; + + close(fd); + + if (osrelease[r - 1] == '\n') + r--; + + /* Validate one of the later fields to ensure field sizes are correct */ + if (strncmp(osrelease, buf.release, r)) + return 1; + + /* Ensure the field domainname is set, it is missing from struct old_utsname */ + if (strnlen(buf.domainname, sizeof(buf.domainname)) == sizeof(buf.domainname)) + return 1; + + return 0; +} + int test_mmap_munmap(void) { int ret, fd, i, page_size; @@ -966,6 +1025,8 @@ int run_syscall(int min, int max) CASE_TEST(stat_fault); EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break; CASE_TEST(stat_timestamps); EXPECT_SYSZR(1, test_stat_timestamps()); break; CASE_TEST(symlink_root); EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break; + CASE_TEST(uname); EXPECT_SYSZR(proc, test_uname()); break; + CASE_TEST(uname_fault); EXPECT_SYSER(1, uname(NULL), -1, EFAULT); break; CASE_TEST(unlink_root); EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break; CASE_TEST(unlink_blah); EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(wait_child); EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break; @@ -991,6 +1052,14 @@ int run_stdlib(int min, int max) for (test = min; test >= 0 && test <= max; test++) { int llen = 0; /* line length */ + /* For functions that take a long buffer, like strlcat() + * Add some more chars after the \0, to test functions that overwrite the buffer set + * the \0 at the exact right position. + */ + char buf[10] = "test123456"; + buf[4] = '\0'; + + /* avoid leaving empty lines below, this will insert holes into * test numbers. */ @@ -1007,6 +1076,19 @@ int run_stdlib(int min, int max) CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; +#ifdef NOLIBC + CASE_TEST(strlcat_0); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcat_1); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 1), buf, 4, "test"); break; + CASE_TEST(strlcat_5); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 5), buf, 7, "test"); break; + CASE_TEST(strlcat_6); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 6), buf, 7, "testb"); break; + CASE_TEST(strlcat_7); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 7), buf, 7, "testba"); break; + CASE_TEST(strlcat_8); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 8), buf, 7, "testbar"); break; + CASE_TEST(strlcpy_0); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcpy_1); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 1), buf, 3, ""); break; + CASE_TEST(strlcpy_2); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 2), buf, 3, "b"); break; + CASE_TEST(strlcpy_3); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 3), buf, 3, "ba"); break; + CASE_TEST(strlcpy_4); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 4), buf, 3, "bar"); break; +#endif CASE_TEST(memcmp_20_20); EXPECT_EQ(1, memcmp("aaa\x20", "aaa\x20", 4), 0); break; CASE_TEST(memcmp_20_60); EXPECT_LT(1, memcmp("aaa\x20", "aaa\x60", 4), 0); break; CASE_TEST(memcmp_60_20); EXPECT_GT(1, memcmp("aaa\x60", "aaa\x20", 4), 0); break; diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore index 790c47001e77..ee93dc4969b8 100644 --- a/tools/testing/selftests/perf_events/.gitignore +++ b/tools/testing/selftests/perf_events/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only sigtrap_threads remove_on_exec +watermark_signal diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile index db93c4ff081a..70e3ff211278 100644 --- a/tools/testing/selftests/perf_events/Makefile +++ b/tools/testing/selftests/perf_events/Makefile @@ -2,5 +2,5 @@ CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) LDFLAGS += -lpthread -TEST_GEN_PROGS := sigtrap_threads remove_on_exec +TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal include ../lib.mk diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c new file mode 100644 index 000000000000..49dc1e831174 --- /dev/null +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <errno.h> +#include <fcntl.h> +#include <linux/perf_event.h> +#include <stddef.h> +#include <sched.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +#define __maybe_unused __attribute__((__unused__)) + +static int sigio_count; + +static void handle_sigio(int signum __maybe_unused, + siginfo_t *oh __maybe_unused, + void *uc __maybe_unused) +{ + ++sigio_count; +} + +static void do_child(void) +{ + raise(SIGSTOP); + + for (int i = 0; i < 20; ++i) + sleep(1); + + raise(SIGSTOP); + + exit(0); +} + +TEST(watermark_signal) +{ + struct perf_event_attr attr; + struct perf_event_mmap_page *p = NULL; + struct sigaction previous_sigio, sigio = { 0 }; + pid_t child = -1; + int child_status; + int fd = -1; + long page_size = sysconf(_SC_PAGE_SIZE); + + sigio.sa_sigaction = handle_sigio; + EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0); + + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_DUMMY; + attr.sample_period = 1; + attr.disabled = 1; + attr.watermark = 1; + attr.context_switch = 1; + attr.wakeup_watermark = 1; + + child = fork(); + EXPECT_GE(child, 0); + if (child == 0) + do_child(); + else if (child < 0) { + perror("fork()"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != child || + !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) { + fprintf(stderr, + "failed to sycnhronize with child errno=%d status=%x\n", + errno, + child_status); + goto cleanup; + } + + fd = syscall(__NR_perf_event_open, &attr, child, -1, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "failed opening event %llx\n", attr.config); + goto cleanup; + } + + if (fcntl(fd, F_SETFL, FASYNC)) { + perror("F_SETFL FASYNC"); + goto cleanup; + } + + if (fcntl(fd, F_SETOWN, getpid())) { + perror("F_SETOWN getpid()"); + goto cleanup; + } + + if (fcntl(fd, F_SETSIG, SIGIO)) { + perror("F_SETSIG SIGIO"); + goto cleanup; + } + + p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == NULL) { + perror("mmap"); + goto cleanup; + } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) { + perror("PERF_EVENT_IOC_ENABLE"); + goto cleanup; + } + + if (kill(child, SIGCONT) < 0) { + perror("SIGCONT"); + goto cleanup; + } + + if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR) + fprintf(stderr, + "expected SIGIO to terminate wait errno=%d status=%x\n%d", + errno, + child_status, + sigio_count); + + EXPECT_GE(sigio_count, 1); + +cleanup: + if (p != NULL) + munmap(p, 2 * page_size); + + if (fd >= 0) + close(fd); + + if (child > 0) { + kill(child, SIGKILL); + waitpid(child, NULL, 0); + } + + sigaction(SIGIO, &previous_sigio, NULL); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index f6f2965e17af..6133524710f7 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -3,5 +3,7 @@ CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y +CONFIG_TIME_NS=y +CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y CONFIG_CHECKPOINT_RESTORE=y diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index 01cc37bf611c..f062a986e382 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -307,5 +307,5 @@ int main(int argc, char **argv) test_pidfd_fdinfo_nspid(); test_pidfd_dead_fdinfo(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index 8a59438ccc78..c62564c264b1 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -159,5 +159,7 @@ on_error: if (pidfd >= 0) close(pidfd); - return !ret ? ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c index 610811275357..55d74a50358f 100644 --- a/tools/testing/selftests/pidfd/pidfd_poll_test.c +++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c @@ -112,5 +112,5 @@ int main(int argc, char **argv) } ksft_test_result_pass("pidfd poll test: pass\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 6e2f2cd400ca..47746b0c6acd 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -158,7 +158,7 @@ FIXTURE_SETUP(current_nsset) /* Create task that exits right away. */ self->child_pid_exited = create_child(&self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET); - EXPECT_GT(self->child_pid_exited, 0); + EXPECT_GE(self->child_pid_exited, 0); if (self->child_pid_exited == 0) _exit(EXIT_SUCCESS); diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index c081ae91313a..9faa686f90e4 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -572,5 +572,5 @@ int main(int argc, char **argv) test_pidfd_send_signal_exited_fail(); test_pidfd_send_signal_recycled_pid_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/power_supply/test_power_supply_properties.sh b/tools/testing/selftests/power_supply/test_power_supply_properties.sh index df272dfe1d2a..a66b1313ed88 100755 --- a/tools/testing/selftests/power_supply/test_power_supply_properties.sh +++ b/tools/testing/selftests/power_supply/test_power_supply_properties.sh @@ -23,7 +23,7 @@ count_tests() { total_tests=0 for i in $SUPPLIES; do - total_tests=$(("$total_tests" + "$NUM_TESTS")) + total_tests=$((total_tests + NUM_TESTS)) done echo "$total_tests" diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index c376151982c4..b175e94e1901 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -7,12 +7,6 @@ ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/) ifeq ($(ARCH),powerpc) -GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown") - -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS) - -export CFLAGS - SUB_DIRS = alignment \ benchmarks \ cache_shape \ @@ -46,6 +40,7 @@ $(SUB_DIRS): BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all include ../lib.mk +include ./flags.mk override define RUN_TESTS +@for TARGET in $(SUB_DIRS); do \ @@ -57,14 +52,14 @@ endef override define INSTALL_RULE +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install;\ done; endef emit_tests: +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET $@;\ done; override define CLEAN diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93e9af37449d..66d5d7aaeb20 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index a32a6ab89914..1321922038d0 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,10 +4,11 @@ TEST_GEN_FILES := exec_target TEST_FILES := settings -CFLAGS += -O2 - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -O2 $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 689f6c8ebcd8..3a3ca956ac66 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 77594e697f2f..42940f92d832 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -1,14 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -m64 -CFLAGS += -I$(CURDIR) -CFLAGS += -D SELFTEST -CFLAGS += -maltivec -CFLAGS += -mcpu=power4 - -# Use our CFLAGS for the implicit .S rule & set the asm machine type -ASFLAGS = $(CFLAGS) -Wa,-mpower4 - TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ @@ -20,6 +10,17 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -m64 +CFLAGS += -I$(CURDIR) +CFLAGS += -D SELFTEST +CFLAGS += -maltivec +CFLAGS += -mcpu=power4 + +# Use our CFLAGS for the implicit .S rule & set the asm machine type +ASFLAGS = $(CFLAGS) -Wa,-mpower4 $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index b82f45dd46b9..11eefb4b9fa4 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,2 +1,4 @@ +dexcr_test hashchk_test +chdexcr lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 76210f2bcec3..58cf9f722905 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,9 +1,12 @@ -TEST_GEN_PROGS := hashchk_test -TEST_GEN_FILES := lsdexcr +TEST_GEN_PROGS := dexcr_test hashchk_test +TEST_GEN_FILES := lsdexcr chdexcr include ../../lib.mk +include ../flags.mk -$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) +CFLAGS += $(KHDR_INCLUDES) + +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c $(TEST_GEN_FILES): ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c new file mode 100644 index 000000000000..c548d7a5bb9b --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <errno.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> + +#include "dexcr.h" +#include "utils.h" + +static void die(const char *msg) +{ + printf("%s\n", msg); + exit(1); +} + +static void help(void) +{ + printf("Invoke a provided program with a custom DEXCR on-exec reset value\n" + "\n" + "usage: chdexcr [CHDEXCR OPTIONS] -- PROGRAM [ARGS...]\n" + "\n" + "Each configurable DEXCR aspect is exposed as an option.\n" + "\n" + "The normal option sets the aspect in the DEXCR. The --no- variant\n" + "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" + "so indirect branch prediction will be disabled in the provided program.\n" + "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" + "prediction may occur.\n" + "\n" + "CHDEXCR OPTIONS:\n"); + + for (int i = 0; i < ARRAY_SIZE(aspects); i++) { + const struct dexcr_aspect *aspect = &aspects[i]; + + if (aspect->prctl == -1) + continue; + + printf(" --%-6s / --no-%-6s : %s\n", aspect->opt, aspect->opt, aspect->desc); + } +} + +static const struct dexcr_aspect *opt_to_aspect(const char *opt) +{ + for (int i = 0; i < ARRAY_SIZE(aspects); i++) + if (aspects[i].prctl != -1 && !strcmp(aspects[i].opt, opt)) + return &aspects[i]; + + return NULL; +} + +static int apply_option(const char *option) +{ + const struct dexcr_aspect *aspect; + const char *opt = NULL; + const char *set_prefix = "--"; + const char *clear_prefix = "--no-"; + unsigned long ctrl = 0; + int err; + + if (!strcmp(option, "-h") || !strcmp(option, "--help")) { + help(); + exit(0); + } + + /* Strip out --(no-) prefix and determine ctrl value */ + if (!strncmp(option, clear_prefix, strlen(clear_prefix))) { + opt = &option[strlen(clear_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + } else if (!strncmp(option, set_prefix, strlen(set_prefix))) { + opt = &option[strlen(set_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + } + + if (!opt || !*opt) + return 1; + + aspect = opt_to_aspect(opt); + if (!aspect) + die("unknown aspect"); + + err = pr_set_dexcr(aspect->prctl, ctrl); + if (err) + die("failed to apply option"); + + return 0; +} + +int main(int argc, char *const argv[]) +{ + int i; + + if (!dexcr_exists()) + die("DEXCR not detected on this hardware"); + + for (i = 1; i < argc; i++) + if (apply_option(argv[i])) + break; + + if (i < argc && !strcmp(argv[i], "--")) + i++; + + if (i >= argc) + die("missing command"); + + execvp(argv[i], &argv[i]); + perror("execve"); + + return errno; +} diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c index 65ec5347de98..468fd0dc9912 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -3,6 +3,7 @@ #include <errno.h> #include <setjmp.h> #include <signal.h> +#include <sys/prctl.h> #include <sys/types.h> #include <sys/wait.h> @@ -43,6 +44,45 @@ out: return exists; } +unsigned int pr_which_to_aspect(unsigned long which) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + return DEXCR_PR_SBHE; + case PR_PPC_DEXCR_IBRTPD: + return DEXCR_PR_IBRTPD; + case PR_PPC_DEXCR_SRAPD: + return DEXCR_PR_SRAPD; + case PR_PPC_DEXCR_NPHIE: + return DEXCR_PR_NPHIE; + default: + FAIL_IF_EXIT_MSG(true, "unknown PR aspect"); + } +} + +int pr_get_dexcr(unsigned long which) +{ + return prctl(PR_PPC_GET_DEXCR, which, 0UL, 0UL, 0UL); +} + +int pr_set_dexcr(unsigned long which, unsigned long ctrl) +{ + return prctl(PR_PPC_SET_DEXCR, which, ctrl, 0UL, 0UL); +} + +bool pr_dexcr_aspect_supported(unsigned long which) +{ + if (pr_get_dexcr(which) == -1) + return errno == ENODEV; + + return true; +} + +bool pr_dexcr_aspect_editable(unsigned long which) +{ + return pr_get_dexcr(which) & PR_PPC_DEXCR_CTRL_EDITABLE; +} + /* * Just test if a bad hashchk triggers a signal, without checking * for support or if the NPHIE aspect is enabled. diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index f55cbbc8643b..51e9ba3b0997 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -9,6 +9,7 @@ #define _SELFTESTS_POWERPC_DEXCR_DEXCR_H #include <stdbool.h> +#include <sys/prctl.h> #include <sys/types.h> #include "reg.h" @@ -26,8 +27,64 @@ #define PPC_RAW_HASHCHK(b, i, a) \ str(.long (0x7C0005E4 | PPC_RAW_HASH_ARGS(b, i, a));) +struct dexcr_aspect { + const char *name; /* Short display name */ + const char *opt; /* Option name for chdexcr */ + const char *desc; /* Expanded aspect meaning */ + unsigned int index; /* Aspect bit index in DEXCR */ + unsigned long prctl; /* 'which' value for get/set prctl */ +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .opt = "sbhe", + .desc = "Speculative branch hint enable", + .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + }, + { + .name = "IBRTPD", + .opt = "ibrtpd", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + }, + { + .name = "SRAPD", + .opt = "srapd", + .desc = "Subroutine return address prediction disable", + .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + }, + { + .name = "NPHIE", + .opt = "nphie", + .desc = "Non-privileged hash instruction enable", + .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + }, + { + .name = "PHIE", + .opt = "phie", + .desc = "Privileged hash instruction enable", + .index = 6, + .prctl = -1, + }, +}; + bool dexcr_exists(void); +bool pr_dexcr_aspect_supported(unsigned long which); + +bool pr_dexcr_aspect_editable(unsigned long which); + +int pr_get_dexcr(unsigned long pr_aspect); + +int pr_set_dexcr(unsigned long pr_aspect, unsigned long ctrl); + +unsigned int pr_which_to_aspect(unsigned long which); + bool hashchk_triggers(void); enum dexcr_source { diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr_test.c b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c new file mode 100644 index 000000000000..7a8657164908 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "dexcr.h" +#include "utils.h" + +/* + * Helper function for testing the behaviour of a newly exec-ed process + */ +static int dexcr_prctl_onexec_test_child(unsigned long which, const char *status) +{ + unsigned long dexcr = mfspr(SPRN_DEXCR_RO); + unsigned long aspect = pr_which_to_aspect(which); + int ctrl = pr_get_dexcr(which); + + if (!strcmp(status, "set")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), + "setting aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(!(aspect & dexcr), "setting aspect across exec did not take effect"); + } else if (!strcmp(status, "clear")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), + "clearing aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(aspect & dexcr, "clearing aspect across exec did not take effect"); + } else { + FAIL_IF_EXIT_MSG(true, "unknown expected status"); + } + + return 0; +} + +/* + * Test that the given prctl value can be manipulated freely + */ +static int dexcr_prctl_aspect_test(unsigned long which) +{ + unsigned long aspect = pr_which_to_aspect(which); + pid_t pid; + int ctrl; + int err; + int errno_save; + + SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_supported(which), "DEXCR aspect not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_editable(which), "DEXCR aspect not editable with prctl"); + + /* We reject invalid combinations of arguments */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear should be rejected with EINVAL"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear on exec should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear on exec should be rejected with EINVAL"); + + /* We set the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR, "config value unexpected clear flag"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "setting aspect did not take effect"); + + /* We clear the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET, "config value unexpected set flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "clearing aspect did not take effect"); + + /* We make it set on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC, "config value unexpected clear on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "scheduling aspect to set on exec should not change it now"); + + /* We make it clear on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect config should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC, "config value unexpected set on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should still be cleared"); + + /* We allow setting the current and on-exec value in a single call */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "process aspect should be set"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should be clear"); + + /* Verify the onexec value is applied across exec */ + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "set", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(mfspr(SPRN_DEXCR_RO) & aspect, + "setting aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "clear", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(!(mfspr(SPRN_DEXCR_RO) & aspect), + "clearing aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + return 0; +} + +static int dexcr_prctl_ibrtpd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_IBRTPD); +} + +static int dexcr_prctl_srapd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_SRAPD); +} + +static int dexcr_prctl_nphie_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_NPHIE); +} + +int main(int argc, char *argv[]) +{ + int err = 0; + + /* + * Some tests require checking what happens across exec, so we may be + * invoked as the child of a particular test + */ + if (argc > 1) { + if (argc == 3 && !strcmp(argv[0], "dexcr_prctl_onexec_test_child")) { + unsigned long which; + + err = parse_ulong(argv[1], strlen(argv[1]), &which, 10); + FAIL_IF_MSG(err, "failed to parse which value for child"); + + return dexcr_prctl_onexec_test_child(which, argv[2]); + } + + FAIL_IF_MSG(true, "unknown test case"); + } + + /* + * Otherwise we are the main test invocation and run the full suite + */ + err |= test_harness(dexcr_prctl_ibrtpd_test, "dexcr_prctl_ibrtpd"); + err |= test_harness(dexcr_prctl_srapd_test, "dexcr_prctl_srapd"); + err |= test_harness(dexcr_prctl_nphie_test, "dexcr_prctl_nphie"); + + return err; +} diff --git a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c index 7d5658c9ebe4..645224bdc142 100644 --- a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c +++ b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c @@ -21,8 +21,14 @@ static int require_nphie(void) { SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + + pr_set_dexcr(PR_PPC_DEXCR_NPHIE, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + + if (get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE) + return 0; + SKIP_IF_MSG(!(get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE), - "DEXCR[NPHIE] not enabled"); + "Failed to enable DEXCR[NPHIE]"); return 0; } diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index 94abbfcc389e..7588929180ab 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -#include <errno.h> #include <stddef.h> #include <stdio.h> #include <string.h> +#include <sys/prctl.h> #include "dexcr.h" #include "utils.h" @@ -12,40 +12,6 @@ static unsigned int dexcr; static unsigned int hdexcr; static unsigned int effective; -struct dexcr_aspect { - const char *name; - const char *desc; - unsigned int index; -}; - -static const struct dexcr_aspect aspects[] = { - { - .name = "SBHE", - .desc = "Speculative branch hint enable", - .index = 0, - }, - { - .name = "IBRTPD", - .desc = "Indirect branch recurrent target prediction disable", - .index = 3, - }, - { - .name = "SRAPD", - .desc = "Subroutine return address prediction disable", - .index = 4, - }, - { - .name = "NPHIE", - .desc = "Non-privileged hash instruction enable", - .index = 5, - }, - { - .name = "PHIE", - .desc = "Privileged hash instruction enable", - .index = 6, - }, -}; - static void print_list(const char *list[], size_t len) { for (size_t i = 0; i < len; i++) { @@ -60,7 +26,7 @@ static void print_dexcr(char *name, unsigned int bits) const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; size_t j = 0; - printf("%s: %08x", name, bits); + printf("%s: 0x%08x", name, bits); if (bits == 0) { printf("\n"); @@ -103,6 +69,63 @@ static void print_aspect(const struct dexcr_aspect *aspect) printf(" \t(%s)\n", aspect->desc); } +static void print_aspect_config(const struct dexcr_aspect *aspect) +{ + const char *reason = NULL; + const char *reason_hyp = NULL; + const char *reason_prctl = "no prctl"; + bool actual = effective & DEXCR_PR_BIT(aspect->index); + bool expected = actual; /* Assume it's fine if we don't expect a specific set/clear value */ + + if (actual) + reason = "set by unknown"; + else + reason = "cleared by unknown"; + + if (aspect->prctl != -1) { + int ctrl = pr_get_dexcr(aspect->prctl); + + if (ctrl < 0) { + reason_prctl = "failed to read prctl"; + } else { + if (ctrl & PR_PPC_DEXCR_CTRL_SET) { + reason_prctl = "set by prctl"; + expected = true; + } else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + reason_prctl = "cleared by prctl"; + expected = false; + } else { + reason_prctl = "unknown prctl"; + } + + reason = reason_prctl; + } + } + + if (hdexcr & DEXCR_PR_BIT(aspect->index)) { + reason_hyp = "set by hypervisor"; + reason = reason_hyp; + expected = true; + } else { + reason_hyp = "not modified by hypervisor"; + } + + printf("%12s (%d): %-28s (%s, %s)\n", + aspect->name, + aspect->index, + reason, + reason_hyp, + reason_prctl); + + /* + * The checks are not atomic, so this can technically trigger if the + * hypervisor makes a change while we are checking each source. It's + * far more likely to be a bug if we see this though. + */ + if (actual != expected) + printf(" : ! actual %s does not match config\n", aspect->name); +} + int main(int argc, char *argv[]) { if (!dexcr_exists()) { @@ -114,6 +137,8 @@ int main(int argc, char *argv[]) hdexcr = get_dexcr(HDEXCR); effective = dexcr | hdexcr; + printf("current status:\n"); + print_dexcr(" DEXCR", dexcr); print_dexcr(" HDEXCR", hdexcr); print_dexcr("Effective", effective); @@ -136,6 +161,12 @@ int main(int argc, char *argv[]) else printf("ignored\n"); } + printf("\n"); + + printf("configuration:\n"); + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) + print_aspect_config(&aspects[i]); + printf("\n"); return 0; } diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 9289d5febe1e..9fa9cb5bd989 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -5,6 +5,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread $(OUTPUT)/dscr_explicit_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile index ae963eb2dc5b..70797716f2b5 100644 --- a/tools/testing/selftests/powerpc/eeh/Makefile +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -7,3 +7,4 @@ TEST_FILES := eeh-functions.sh settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk new file mode 100644 index 000000000000..b909bee3cb2a --- /dev/null +++ b/tools/testing/selftests/powerpc/flags.mk @@ -0,0 +1,12 @@ +#This checks for any ENV variables and add those. + +ifeq ($(GIT_VERSION),) +GIT_VERSION := $(shell git describe --always --long --dirty || echo "unknown") +export GIT_VERSION +endif + +ifeq ($(CFLAGS),) +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(CFLAGS) +export CFLAGS +endif + diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 3948f7c510aa..b14fd2e0c6a8 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c $(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile index 2424513982d9..ce4ed679aaaf 100644 --- a/tools/testing/selftests/powerpc/mce/Makefile +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := inject-ra-err include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 4a6608beef0e..aab058ecb352 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 0785c2e99d40..480d8ba94cf7 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,9 @@ -CFLAGS = -O3 -m64 -I./include -I../include - TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +include ../flags.mk + +CFLAGS = -O3 -m64 -I./include -I../include $(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile b/tools/testing/selftests/powerpc/papr_attributes/Makefile index e899712d49db..406429499022 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/Makefile +++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := attr_test top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -$(TEST_GEN_PROGS): ../harness.c ../utils.c
\ No newline at end of file +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile index 7f79e437634a..fed4f2414dbf 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/Makefile +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_sysparm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile index 06b719703bfd..b09852e40882 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/Makefile +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_vpd top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index a284fa874a9f..7e9dbf3d0d09 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -7,8 +7,11 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests +SUB_DIRS := ebb sampling_tests event_code_tests + +all: $(TEST_GEN_PROGS) $(SUB_DIRS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -22,12 +25,16 @@ $(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) $(OUTPUT)/per_event_excludes: ../utils.c +$(SUB_DIRS): + BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all + DEFAULT_RUN_TESTS := $(RUN_TESTS) override define RUN_TESTS $(DEFAULT_RUN_TESTS) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests; \ + done; endef emit_tests: @@ -35,34 +42,29 @@ emit_tests: BASENAME_TEST=`basename $$TEST`; \ echo "$(COLLECTION):$$BASENAME_TEST"; \ done - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET emit_tests; \ + done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install; \ + done; endef DEFAULT_CLEAN := $(CLEAN) override define CLEAN $(DEFAULT_CLEAN) $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \ + done; endef -ebb: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -sampling_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -event_code_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all .PHONY: all run_tests ebb sampling_tests event_code_tests emit_tests diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 010160690227..1b39af7c10db 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -4,16 +4,6 @@ include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ -# The EBB handler is 64-bit code and everything links against it -CFLAGS += -m64 - -TMPOUT = $(OUTPUT)/TMPDIR/ -# Toolchains may build PIE by default which breaks the assembly -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -LDFLAGS += $(no-pie-option) - TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ @@ -28,6 +18,17 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +# The EBB handler is 64-bit code and everything links against it +CFLAGS += -m64 + +TMPOUT = $(OUTPUT)/TMPDIR/ +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ ebb.c ebb_handler.S trace.c busy_loop.S diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 4e07d7046457..fdb080b3fa65 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \ group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 9e67351fb252..9f79bec5fce7 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 9b9491a63213..23bd9a7590dd 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -I$(CURDIR) - TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 1b39b86849da..59ca01d8567e 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -26,6 +26,7 @@ LOCAL_HDRS += $(patsubst %,$(selfdir)/powerpc/ptrace/%,$(wildcard *.h)) top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk TM_TESTS := $(patsubst %,$(OUTPUT)/%,$(TM_TESTS)) TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64)) diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index e0d979ab0204..33286039724a 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -5,9 +5,10 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. -CFLAGS += $(KHDR_INCLUDES) - include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index f679d260afc8..ece95bd52be9 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -3,7 +3,6 @@ TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel TEST_GEN_PROGS += sigreturn_unaligned -CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 @@ -11,5 +10,8 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -maltivec $(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 9c39f55a58ff..4c9d9a58c9d1 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -I$(CURDIR) - EXTRA_SOURCES := ../harness.c build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi) @@ -27,9 +24,13 @@ $(OUTPUT)/strlen_32: CFLAGS += -m32 TEST_GEN_PROGS += strlen_32 endif -ASFLAGS = $(CFLAGS) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -I$(CURDIR) + +ASFLAGS = $(CFLAGS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afedb0..0da2e0a74264 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := switch_endian_test -ASFLAGS += -O2 -Wall -g -nostdlib -m64 - EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +ASFLAGS += -O2 -Wall -g -nostdlib -m64 $(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index ee1740ddfb0c..3bc07af88f0e 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := ipc_unmuxed rtas_filter -CFLAGS += $(KHDR_INCLUDES) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 3876805c2f31..f13f0ab36007 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -11,6 +11,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index cf65cbf33085..61d519a076c6 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,10 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 -I$(CURDIR) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -m64 -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bbac5f4b03d0..990d24696fd3 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -391,7 +391,7 @@ __EOF__ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`" deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`" echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log - tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 retcode=$? if test "$retcode" -ne 0 then @@ -425,7 +425,7 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. - scfmem=$((2+HALF_ALLOTED_CPUS/16)) + scfmem=$((3+HALF_ALLOTED_CPUS/16)) torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi @@ -559,7 +559,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index fc45645bb5f4..9ecd1b4e653d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=100 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_RCU_EXPERT=y CONFIG_KPROBES=n CONFIG_FTRACE=n diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index 2deac2031de9..021863f86053 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -5,6 +5,8 @@ CFLAGS += $(KHDR_INCLUDES) TEST_GEN_PROGS := resctrl_tests +LOCAL_HDRS += $(wildcard *.h) + include ../lib.mk -$(OUTPUT)/resctrl_tests: $(wildcard *.[ch]) +$(OUTPUT)/resctrl_tests: $(wildcard *.c) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 4cb991be8e31..c7686fb6641a 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -128,7 +128,7 @@ static int check_results(struct resctrl_val_param *param, const char *cache_type return fail; } -void cat_test_cleanup(void) +static void cat_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -284,13 +284,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param ret = cat_test(test, uparams, ¶m, span, start_mask); if (ret) - goto out; + return ret; ret = check_results(¶m, test->resource, cache_total_size, full_cache_mask, start_mask); -out: - cat_test_cleanup(); - return ret; } @@ -373,6 +370,7 @@ struct resctrl_test l3_cat_test = { .resource = "L3", .feature_check = test_resource_feature_check, .run_test = cat_run_test, + .cleanup = cat_test_cleanup, }; struct resctrl_test l3_noncont_cat_test = { diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index a81f91222a89..0105afec6188 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -40,11 +40,11 @@ static int show_results_info(unsigned long sum_llc_val, int no_of_bits, int ret; avg_llc_val = sum_llc_val / num_of_runs; - avg_diff = (long)abs(cache_span - avg_llc_val); + avg_diff = (long)(cache_span - avg_llc_val); diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; ret = platform && abs((int)diff_percent) > max_diff_percent && - abs(avg_diff) > max_diff; + labs(avg_diff) > max_diff; ksft_print_msg("%s Check cache miss rate within %lu%%\n", ret ? "Fail:" : "Pass:", max_diff_percent); @@ -91,7 +91,7 @@ static int check_results(struct resctrl_val_param *param, size_t span, int no_of MAX_DIFF, MAX_DIFF_PERCENT, runs - 1, true); } -void cmt_test_cleanup(void) +static void cmt_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -161,7 +161,6 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); out: - cmt_test_cleanup(); free(span_str); return ret; @@ -178,4 +177,5 @@ struct resctrl_test cmt_test = { .resource = "L3", .feature_check = cmt_feature_check, .run_test = cmt_run_test, + .cleanup = cmt_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7946e32e85c8..a6ad39aae162 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -60,8 +60,8 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { - unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc, avg_bw_resc; int avg_diff_per; float avg_diff; @@ -137,7 +137,7 @@ static int check_results(void) return show_mba_info(bw_imc, bw_resc); } -void mba_test_cleanup(void) +static void mba_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -158,13 +158,10 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, ¶m); if (ret) - goto out; + return ret; ret = check_results(); -out: - mba_test_cleanup(); - return ret; } @@ -180,4 +177,5 @@ struct resctrl_test mba_test = { .vendor_specific = ARCH_INTEL, .feature_check = mba_feature_check, .run_test = mba_run_test, + .cleanup = mba_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index d67ffa3ec63a..6fec51e1ff46 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -17,8 +17,8 @@ static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span) { - unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc = 0, avg_bw_resc = 0; int runs, ret, avg_diff_per; float avg_diff = 0; @@ -105,7 +105,7 @@ static int mbm_setup(const struct resctrl_test *test, return ret; } -void mbm_test_cleanup(void) +static void mbm_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -126,15 +126,12 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, ¶m); if (ret) - goto out; + return ret; ret = check_results(DEFAULT_SPAN); if (ret && (get_vendor() == ARCH_INTEL)) ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); -out: - mbm_test_cleanup(); - return ret; } @@ -150,4 +147,5 @@ struct resctrl_test mbm_test = { .vendor_specific = ARCH_INTEL, .feature_check = mbm_feature_check, .run_test = mbm_run_test, + .cleanup = mbm_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 2051bd135e0d..00d51fa7531c 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -72,6 +72,7 @@ struct user_params { * @disabled: Test is disabled * @feature_check: Callback to check required resctrl features * @run_test: Callback to run the test + * @cleanup: Callback to cleanup after the test */ struct resctrl_test { const char *name; @@ -82,6 +83,7 @@ struct resctrl_test { bool (*feature_check)(const struct resctrl_test *test); int (*run_test)(const struct resctrl_test *test, const struct user_params *uparams); + void (*cleanup)(void); }; /* @@ -156,9 +158,6 @@ int resctrl_val(const struct resctrl_test *test, const struct user_params *uparams, const char * const *benchmark_cmd, struct resctrl_val_param *param); -void tests_cleanup(void); -void mbm_test_cleanup(void); -void mba_test_cleanup(void); unsigned long create_bit_mask(unsigned int start, unsigned int len); unsigned int count_contiguous_bits(unsigned long val, unsigned int *start); int get_full_cbm(const char *cache_type, unsigned long *mask); @@ -166,11 +165,9 @@ int get_mask_no_shareable(const char *cache_type, unsigned long *mask); int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size); int resource_info_unsigned_get(const char *resource, const char *filename, unsigned int *val); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); -int signal_handler_register(void); +int signal_handler_register(const struct resctrl_test *test); void signal_handler_unregister(void); -void cat_test_cleanup(void); unsigned int count_bits(unsigned long n); -void cmt_test_cleanup(void); void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config); void perf_event_initialize_read_format(struct perf_event_read *pe_read); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index f3dc1b9696e7..ecbb7605a981 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -81,19 +81,11 @@ static void cmd_help(void) printf("\t-h: help\n"); } -void tests_cleanup(void) -{ - mbm_test_cleanup(); - mba_test_cleanup(); - cmt_test_cleanup(); - cat_test_cleanup(); -} - -static int test_prepare(void) +static int test_prepare(const struct resctrl_test *test) { int res; - res = signal_handler_register(); + res = signal_handler_register(test); if (res) { ksft_print_msg("Failed to register signal handler\n"); return res; @@ -108,8 +100,10 @@ static int test_prepare(void) return 0; } -static void test_cleanup(void) +static void test_cleanup(const struct resctrl_test *test) { + if (test->cleanup) + test->cleanup(); umount_resctrlfs(); signal_handler_unregister(); } @@ -136,7 +130,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p ksft_print_msg("Starting %s test ...\n", test->name); - if (test_prepare()) { + if (test_prepare(test)) { ksft_exit_fail_msg("Abnormal failure when preparing for the test\n"); return; } @@ -151,7 +145,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p ksft_test_result(!ret, "%s: test\n", test->name); cleanup: - test_cleanup(); + test_cleanup(test); } static void init_user_params(struct user_params *uparams) @@ -253,13 +247,13 @@ last_arg: * 2. We execute perf commands */ if (geteuid() != 0) - return ksft_exit_skip("Not running as root. Skipping...\n"); + ksft_exit_skip("Not running as root. Skipping...\n"); if (!check_resctrlfs_support()) - return ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); + ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); if (umount_resctrlfs()) - return ksft_exit_skip("resctrl FS unmount failed.\n"); + ksft_exit_skip("resctrl FS unmount failed.\n"); filter_dmesg(); diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 5a49f07a6c85..445f306d4c2f 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -62,6 +62,7 @@ struct imc_counter_config { static char mbm_total_path[1024]; static int imcs; static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; +static const struct resctrl_test *current_test; void membw_initialize_perf_event_attr(int i, int j) { @@ -472,7 +473,8 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) if (bm_pid) kill(bm_pid, SIGKILL); umount_resctrlfs(); - tests_cleanup(); + if (current_test && current_test->cleanup) + current_test->cleanup(); ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); @@ -482,13 +484,14 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) * Register CTRL-C handler for parent, as it has to kill * child process before exiting. */ -int signal_handler_register(void) +int signal_handler_register(const struct resctrl_test *test) { struct sigaction sigact = {}; int ret = 0; bm_pid = 0; + current_test = test; sigact.sa_sigaction = ctrlc_handler; sigemptyset(&sigact.sa_mask); sigact.sa_flags = SA_SIGINFO; @@ -510,6 +513,7 @@ void signal_handler_unregister(void) { struct sigaction sigact = {}; + current_test = NULL; sigact.sa_handler = SIG_DFL; sigemptyset(&sigact.sa_mask); if (sigaction(SIGINT, &sigact, NULL) || diff --git a/tools/testing/selftests/ring-buffer/.gitignore b/tools/testing/selftests/ring-buffer/.gitignore new file mode 100644 index 000000000000..3aed1a2a6c67 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/.gitignore @@ -0,0 +1 @@ +map_test diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile new file mode 100644 index 000000000000..627c5fa6d1ab --- /dev/null +++ b/tools/testing/selftests/ring-buffer/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS = map_test + +include ../lib.mk diff --git a/tools/testing/selftests/ring-buffer/config b/tools/testing/selftests/ring-buffer/config new file mode 100644 index 000000000000..d936f8f00e78 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/config @@ -0,0 +1,2 @@ +CONFIG_FTRACE=y +CONFIG_TRACER_SNAPSHOT=y diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c new file mode 100644 index 000000000000..a9006fa7097e --- /dev/null +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ring-buffer memory mapping tests + * + * Copyright (c) 2024 Vincent Donnefort <vdonnefort@google.com> + */ +#include <fcntl.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include <linux/trace_mmap.h> + +#include <sys/mman.h> +#include <sys/ioctl.h> + +#include "../user_events/user_events_selftests.h" /* share tracefs setup */ +#include "../kselftest_harness.h" + +#define TRACEFS_ROOT "/sys/kernel/tracing" + +static int __tracefs_write(const char *path, const char *value) +{ + int fd, ret; + + fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + return fd; + + ret = write(fd, value, strlen(value)); + + close(fd); + + return ret == -1 ? -errno : 0; +} + +static int __tracefs_write_int(const char *path, int value) +{ + char *str; + int ret; + + if (asprintf(&str, "%d", value) < 0) + return -1; + + ret = __tracefs_write(path, str); + + free(str); + + return ret; +} + +#define tracefs_write_int(path, value) \ + ASSERT_EQ(__tracefs_write_int((path), (value)), 0) + +#define tracefs_write(path, value) \ + ASSERT_EQ(__tracefs_write((path), (value)), 0) + +static int tracefs_reset(void) +{ + if (__tracefs_write_int(TRACEFS_ROOT"/tracing_on", 0)) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/trace", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/set_event", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/current_tracer", "nop")) + return -1; + + return 0; +} + +struct tracefs_cpu_map_desc { + struct trace_buffer_meta *meta; + int cpu_fd; +}; + +int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu) +{ + int page_size = getpagesize(); + char *cpu_path; + void *map; + + if (asprintf(&cpu_path, + TRACEFS_ROOT"/per_cpu/cpu%d/trace_pipe_raw", + cpu) < 0) + return -ENOMEM; + + desc->cpu_fd = open(cpu_path, O_RDONLY | O_NONBLOCK); + free(cpu_path); + if (desc->cpu_fd < 0) + return -ENODEV; + + map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + if (map == MAP_FAILED) + return -errno; + + desc->meta = (struct trace_buffer_meta *)map; + + return 0; +} + +void tracefs_cpu_unmap(struct tracefs_cpu_map_desc *desc) +{ + munmap(desc->meta, desc->meta->meta_page_size); + close(desc->cpu_fd); +} + +FIXTURE(map) { + struct tracefs_cpu_map_desc map_desc; + bool umount; +}; + +FIXTURE_VARIANT(map) { + int subbuf_size; +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_4k) { + .subbuf_size = 4, +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_8k) { + .subbuf_size = 8, +}; + +FIXTURE_SETUP(map) +{ + int cpu = sched_getcpu(); + cpu_set_t cpu_mask; + bool fail, umount; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; + + ASSERT_GE(cpu, 0); + + ASSERT_EQ(tracefs_reset(), 0); + + tracefs_write_int(TRACEFS_ROOT"/buffer_subbuf_size_kb", variant->subbuf_size); + + ASSERT_EQ(tracefs_cpu_map(&self->map_desc, cpu), 0); + + /* + * Ensure generated events will be found on this very same ring-buffer. + */ + CPU_ZERO(&cpu_mask); + CPU_SET(cpu, &cpu_mask); + ASSERT_EQ(sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask), 0); +} + +FIXTURE_TEARDOWN(map) +{ + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); + + tracefs_cpu_unmap(&self->map_desc); +} + +TEST_F(map, meta_page_check) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + int cnt = 0; + + ASSERT_EQ(desc->meta->entries, 0); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 0); + + ASSERT_EQ(desc->meta->reader.id, 0); + ASSERT_EQ(desc->meta->reader.read, 0); + + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + ASSERT_EQ(desc->meta->reader.id, 0); + + tracefs_write_int(TRACEFS_ROOT"/tracing_on", 1); + for (int i = 0; i < 16; i++) + tracefs_write_int(TRACEFS_ROOT"/trace_marker", i); +again: + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + + ASSERT_EQ(desc->meta->entries, 16); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 16); + + ASSERT_EQ(desc->meta->reader.id, 1); + + if (!(cnt++)) + goto again; +} + +TEST_F(map, data_mmap) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + unsigned long meta_len, data_len; + void *data; + + meta_len = desc->meta->meta_page_size; + data_len = desc->meta->subbuf_size * desc->meta->nr_subbufs; + + /* Map all the available subbufs */ + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Map all the available subbufs - 1 */ + data_len -= desc->meta->subbuf_size; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Overflow the available subbufs by 1 */ + meta_len += desc->meta->subbuf_size * 2; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_EQ(data, MAP_FAILED); +} + +FIXTURE(snapshot) { + bool umount; +}; + +FIXTURE_SETUP(snapshot) +{ + bool fail, umount; + struct stat sb; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (stat(TRACEFS_ROOT"/snapshot", &sb)) + SKIP(return, "Skipping: %s", "snapshot not available"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; +} + +FIXTURE_TEARDOWN(snapshot) +{ + __tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "!snapshot"); + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); +} + +TEST_F(snapshot, excludes_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_GE(cpu, 0); + tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"); + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), -EBUSY); +} + +TEST_F(snapshot, excluded_by_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), 0); + + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"), -EBUSY); + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/snapshot", + "1"), -EBUSY); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c index 414a617db993..93db5aa246a3 100644 --- a/tools/testing/selftests/sync/sync_test.c +++ b/tools/testing/selftests/sync/sync_test.c @@ -109,6 +109,5 @@ int main(void) ksft_exit_fail_msg("%d out of %d sync tests failed\n", err, ksft_test_num()); - /* need this return to keep gcc happy */ - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 47e05fdc32c5..205b76a4abb4 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -205,7 +205,7 @@ int main(int argc, char **argv) adjtimex(&tx1); if (err) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 4332b494103d..ad52e608b88e 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -173,6 +173,6 @@ int main(void) timer_delete(tm1); } if (final_ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 992a77f2a74c..4421cd562c24 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -89,8 +89,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAIL]"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c index 4b76450d78d1..73b636f89fdc 100644 --- a/tools/testing/selftests/timers/freq-step.c +++ b/tools/testing/selftests/timers/freq-step.c @@ -257,7 +257,7 @@ int main(int argc, char **argv) set_frequency(0.0); if (fails) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 23eb398c8140..986abbdb1521 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -268,7 +268,7 @@ int main(int argc, char **argv) if (ret < 0) { printf("Error: Problem setting STA_INS/STA_DEL!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } /* Validate STA_INS was set */ @@ -277,7 +277,7 @@ int main(int argc, char **argv) if (tx.status != STA_INS && tx.status != STA_DEL) { printf("Error: STA_INS/STA_DEL not set!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } if (tai_time) { @@ -295,7 +295,7 @@ int main(int argc, char **argv) se.sigev_value.sival_int = 0; if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) { printf("Error: timer_create failed\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } its1.it_value.tv_sec = next_leap; its1.it_value.tv_nsec = 0; @@ -366,7 +366,7 @@ int main(int argc, char **argv) if (error_found) { printf("Errors observed\n"); clear_time_state(); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("\n"); if ((iterations != -1) && !(--iterations)) @@ -374,5 +374,5 @@ int main(int argc, char **argv) } clear_time_state(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index f70802c5dd0d..8fd065eec904 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -87,7 +87,7 @@ int main(void) tv.tv_usec = 0; if (settimeofday(&tv, NULL)) { printf("Error: You're likely not running with proper (ie: root) permissions\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } tx.modes = 0; adjtimex(&tx); @@ -104,5 +104,5 @@ int main(void) fflush(stdout); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c index 7916cf5cc6ff..f3179a605bba 100644 --- a/tools/testing/selftests/timers/mqueue-lat.c +++ b/tools/testing/selftests/timers/mqueue-lat.c @@ -107,8 +107,8 @@ int main(int argc, char **argv) ret = mqueue_lat_test(); if (ret < 0) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index c001dd79179d..07c81c0093c0 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -260,16 +260,16 @@ int main(int argc, char **argv) ksft_print_msg("based timers if other threads run on the CPU...\n"); if (check_itimer(ITIMER_VIRTUAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_PROF) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_REAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); /* * It's unfortunately hard to reliably test a timer expiration @@ -281,10 +281,10 @@ int main(int argc, char **argv) * find a better solution. */ if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_distribution() < 0) - return ksft_exit_fail(); + ksft_exit_fail(); ksft_finished(); } diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 6eba203f9da7..030143eb09b4 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -137,11 +137,11 @@ int main(int argc, char **argv) if (tx1.offset || tx2.offset || tx1.freq != tx2.freq || tx1.tick != tx2.tick) { printf(" [SKIP]\n"); - return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); + ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); } printf(" [FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf(" [OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c index 688cfd81b531..f7d978721b9e 100644 --- a/tools/testing/selftests/timers/set-2038.c +++ b/tools/testing/selftests/timers/set-2038.c @@ -128,6 +128,6 @@ out: /* restore clock */ settime(start); if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c index 8c4179ee2ca2..5b67462efcd6 100644 --- a/tools/testing/selftests/timers/set-tai.c +++ b/tools/testing/selftests/timers/set-tai.c @@ -61,9 +61,9 @@ int main(int argc, char **argv) ret = get_tai(); if (ret != i) { printf("[FAILED] expected: %i got %i\n", i, ret); - return ksft_exit_fail(); + ksft_exit_fail(); } } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 50da45437daa..7ce240c89b21 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -278,6 +278,6 @@ int main(void) ret |= do_timer_oneshot(clock_id, 0); } if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c index 62bd33eb16f0..20daaf1782b7 100644 --- a/tools/testing/selftests/timers/set-tz.c +++ b/tools/testing/selftests/timers/set-tz.c @@ -102,9 +102,9 @@ int main(int argc, char **argv) printf("[OK]\n"); set_tz(min, dst); - return ksft_exit_pass(); + ksft_exit_pass(); err: set_tz(min, dst); - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 63913f75b384..c8e6bffe4e0a 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -70,8 +70,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 80aed4bf06fb..76b38e41d9c7 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -189,5 +189,5 @@ out: /* die */ if (ret) ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index d13ebde20322..d500884801d8 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -320,10 +320,10 @@ int validate_set_offset(void) int main(int argc, char **argv) { if (validate_freq()) - return ksft_exit_fail(); + ksft_exit_fail(); if (validate_set_offset()) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/tty/tty_tstamp_update.c b/tools/testing/selftests/tty/tty_tstamp_update.c index 0ee97943dccc..9e1a40f5db17 100644 --- a/tools/testing/selftests/tty/tty_tstamp_update.c +++ b/tools/testing/selftests/tty/tty_tstamp_update.c @@ -47,42 +47,60 @@ int main(int argc, char **argv) int r; char tty[PATH_MAX] = {}; struct stat st1, st2; + int result = KSFT_FAIL; ksft_print_header(); ksft_set_plan(1); r = readlink("/proc/self/fd/0", tty, PATH_MAX); - if (r < 0) - ksft_exit_fail_msg("readlink on /proc/self/fd/0 failed: %m\n"); + if (r < 0) { + ksft_print_msg("readlink on /proc/self/fd/0 failed: %m\n"); + goto out; + } + + if (!tty_valid(tty)) { + ksft_print_msg("invalid tty path '%s'\n", tty); + result = KSFT_SKIP; + goto out; - if (!tty_valid(tty)) - ksft_exit_skip("invalid tty path '%s'\n", tty); + } r = stat(tty, &st1); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We need to wait at least 8 seconds in order to observe timestamp change */ /* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fbf47635315ab308c9b58a1ea0906e711a9228de */ sleep(10); r = write_dev_tty(); - if (r < 0) - ksft_exit_fail_msg("failed to write to /dev/tty: %s\n", - strerror(-r)); + if (r < 0) { + ksft_print_msg("failed to write to /dev/tty: %s\n", + strerror(-r)); + goto out; + } r = stat(tty, &st2); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We wrote to the terminal so timestamps should have been updated */ if (st1.st_atim.tv_sec == st2.st_atim.tv_sec && st1.st_mtim.tv_sec == st2.st_mtim.tv_sec) { - ksft_test_result_fail("tty timestamps not updated\n"); - ksft_exit_fail(); + ksft_print_msg("tty timestamps not updated\n"); + goto out; } - ksft_test_result_pass( + ksft_print_msg( "timestamps of terminal '%s' updated after write to /dev/tty\n", tty); - return EXIT_SUCCESS; + result = KSFT_PASS; + +out: + ksft_test_result_report(result, "tty_tstamp_update\n"); + + ksft_finished(); } diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2e0..0bb46793dcd4 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index cdfed403ba13..7b543e7f04d7 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -53,15 +53,19 @@ #if __riscv_xlen == 32 #define VDSO_32BIT 1 #endif +#elif defined(__loongarch__) +#define VDSO_VERSION 6 +#define VDSO_NAMES 1 #endif -static const char *versions[6] = { +static const char *versions[7] = { "LINUX_2.6", "LINUX_2.6.15", "LINUX_2.6.29", "LINUX_2.6.39", "LINUX_4", "LINUX_4.15", + "LINUX_5.10" }; static const char *names[2][6] = { diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index 1df5d057d79f..b758f68c6c9c 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -13,13 +13,7 @@ #include "../kselftest.h" #include "parse_vdso.h" - -#if defined(__riscv) -const char *version = "LINUX_4.15"; -#else -const char *version = "LINUX_2.6"; -#endif -const char *name = "__vdso_getcpu"; +#include "vdso_config.h" struct getcpu_cache; typedef long (*getcpu_t)(unsigned int *, unsigned int *, @@ -27,6 +21,8 @@ typedef long (*getcpu_t)(unsigned int *, unsigned int *, int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; unsigned long sysinfo_ehdr; unsigned int cpu, node; getcpu_t get_cpu; @@ -40,9 +36,9 @@ int main(int argc, char **argv) vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); - get_cpu = (getcpu_t)vdso_sym(version, name); + get_cpu = (getcpu_t)vdso_sym(version, name[4]); if (!get_cpu) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[4]); return KSFT_SKIP; } @@ -50,7 +46,7 @@ int main(int argc, char **argv) if (ret == 0) { printf("Running on CPU %u node %u\n", cpu, node); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[4]); return KSFT_FAIL; } diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e411f287a426..ee4f1ca56a71 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -18,25 +18,13 @@ #include "../kselftest.h" #include "parse_vdso.h" - -/* - * ARM64's vDSO exports its gettimeofday() implementation with a different - * name and version from other architectures, so we need to handle it as - * a special case. - */ -#if defined(__aarch64__) -const char *version = "LINUX_2.6.39"; -const char *name = "__kernel_gettimeofday"; -#elif defined(__riscv) -const char *version = "LINUX_4.15"; -const char *name = "__vdso_gettimeofday"; -#else -const char *version = "LINUX_2.6"; -const char *name = "__vdso_gettimeofday"; -#endif +#include "vdso_config.h" int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -47,10 +35,10 @@ int main(int argc, char **argv) /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym(version, name); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); if (!gtod) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[0]); return KSFT_SKIP; } @@ -61,7 +49,7 @@ int main(int argc, char **argv) printf("The time is %lld.%06lld\n", (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[0]); return KSFT_FAIL; } diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config index a7f8e8a95625..66290cf289a9 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config @@ -2,7 +2,7 @@ CONFIG_NONPORTABLE=y CONFIG_ARCH_RV32I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config index daeb3e5e0965..db1aa9f388b9 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config @@ -1,7 +1,7 @@ CONFIG_ARCH_RV64I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 507555714b1d..f314d3789f17 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -41,7 +41,6 @@ CONFIG_KALLSYMS=y CONFIG_BUG=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y CONFIG_JUMP_LABEL=y -CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_SHMEM=y CONFIG_SLUB=y diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index d884fd69dd51..95aad6d8849b 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -103,21 +103,6 @@ static void clearhandler(int sig) #define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) #define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) -static inline void check_cpuid_xsave(void) -{ - uint32_t eax, ebx, ecx, edx; - - /* - * CPUID.1:ECX.XSAVE[bit 26] enumerates general - * support for the XSAVE feature set, including - * XGETBV. - */ - __cpuid_count(1, 0, eax, ebx, ecx, edx); - if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK)) - fatal_error("cpuid: no CPU xsave support"); - if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK)) - fatal_error("cpuid: no OS xsave support"); -} static uint32_t xbuf_size; @@ -350,6 +335,7 @@ enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; /* arch_prctl() and sigaltstack() test */ +#define ARCH_GET_XCOMP_SUPP 0x1021 #define ARCH_GET_XCOMP_PERM 0x1022 #define ARCH_REQ_XCOMP_PERM 0x1023 @@ -928,8 +914,15 @@ static void test_ptrace(void) int main(void) { - /* Check hardware availability at first */ - check_cpuid_xsave(); + unsigned long features; + long rc; + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features); + if (rc || (features & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE) { + ksft_print_msg("no AMX support\n"); + return KSFT_SKIP; + } + check_cpuid_xtiledata(); init_stashed_xsave(); diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 215b8150b7cc..0ea4f6813930 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -1183,7 +1183,7 @@ int main(int argc, char **argv) if (!cpu_has_lam()) { ksft_print_msg("Unsupported LAM feature!\n"); - return -1; + return KSFT_SKIP; } while ((c = getopt(argc, argv, "ht:")) != -1) { @@ -1237,5 +1237,5 @@ int main(int argc, char **argv) ksft_set_plan(tests_cnt); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c index f0d876d48277..d53959e03593 100644 --- a/tools/testing/selftests/x86/test_mremap_vdso.c +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -19,6 +19,7 @@ #include <sys/auxv.h> #include <sys/syscall.h> #include <sys/wait.h> +#include "../kselftest.h" #define PAGE_SIZE 4096 @@ -29,13 +30,13 @@ static int try_to_remap(void *vdso_addr, unsigned long size) /* Searching for memory location where to remap */ dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (dest_addr == MAP_FAILED) { - printf("[WARN]\tmmap failed (%d): %m\n", errno); + ksft_print_msg("WARN: mmap failed (%d): %m\n", errno); return 0; } - printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n", - vdso_addr, (unsigned long)vdso_addr + size, - dest_addr, (unsigned long)dest_addr + size); + ksft_print_msg("Moving vDSO: [%p, %#lx] -> [%p, %#lx]\n", + vdso_addr, (unsigned long)vdso_addr + size, + dest_addr, (unsigned long)dest_addr + size); fflush(stdout); new_addr = mremap(vdso_addr, size, size, @@ -43,10 +44,10 @@ static int try_to_remap(void *vdso_addr, unsigned long size) if ((unsigned long)new_addr == (unsigned long)-1) { munmap(dest_addr, size); if (errno == EINVAL) { - printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n"); + ksft_print_msg("vDSO partial move failed, will try with bigger size\n"); return -1; /* Retry with larger */ } - printf("[FAIL]\tmremap failed (%d): %m\n", errno); + ksft_print_msg("[FAIL]\tmremap failed (%d): %m\n", errno); return 1; } @@ -58,11 +59,12 @@ int main(int argc, char **argv, char **envp) { pid_t child; + ksft_print_header(); + ksft_set_plan(1); + child = fork(); - if (child == -1) { - printf("[WARN]\tfailed to fork (%d): %m\n", errno); - return 1; - } + if (child == -1) + ksft_exit_fail_msg("failed to fork (%d): %m\n", errno); if (child == 0) { unsigned long vdso_size = PAGE_SIZE; @@ -70,9 +72,9 @@ int main(int argc, char **argv, char **envp) int ret = -1; auxval = getauxval(AT_SYSINFO_EHDR); - printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval); + ksft_print_msg("AT_SYSINFO_EHDR is %#lx\n", auxval); if (!auxval || auxval == -ENOENT) { - printf("[WARN]\tgetauxval failed\n"); + ksft_print_msg("WARN: getauxval failed\n"); return 0; } @@ -92,16 +94,13 @@ int main(int argc, char **argv, char **envp) int status; if (waitpid(child, &status, 0) != child || - !WIFEXITED(status)) { - printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n"); - return 1; - } else if (WEXITSTATUS(status) != 0) { - printf("[FAIL]\tChild failed with %d\n", - WEXITSTATUS(status)); - return 1; - } - printf("[OK]\n"); + !WIFEXITED(status)) + ksft_test_result_fail("mremap() of the vDSO does not work on this kernel!\n"); + else if (WEXITSTATUS(status) != 0) + ksft_test_result_fail("Child failed with %d\n", WEXITSTATUS(status)); + else + ksft_test_result_pass("%s\n", __func__); } - return 0; + ksft_finished(); } diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c index 757e6527f67e..ee909a7927f9 100644 --- a/tools/testing/selftests/x86/test_shadow_stack.c +++ b/tools/testing/selftests/x86/test_shadow_stack.c @@ -556,7 +556,7 @@ struct node { * looked at the shadow stack gaps. * 5. See if it landed in the gap. */ -int test_guard_gap(void) +int test_guard_gap_other_gaps(void) { void *free_area, *shstk, *test_map = (void *)0xFFFFFFFFFFFFFFFF; struct node *head = NULL, *cur; @@ -593,11 +593,64 @@ int test_guard_gap(void) if (shstk - test_map - PAGE_SIZE != PAGE_SIZE) return 1; - printf("[OK]\tGuard gap test\n"); + printf("[OK]\tGuard gap test, other mapping's gaps\n"); return 0; } +/* Tests respecting the guard gap of the mapping getting placed */ +int test_guard_gap_new_mappings_gaps(void) +{ + void *free_area, *shstk_start, *test_map = (void *)0xFFFFFFFFFFFFFFFF; + struct node *head = NULL, *cur; + int ret = 0; + + free_area = mmap(0, PAGE_SIZE * 4, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(free_area, PAGE_SIZE * 4); + + /* Test letting map_shadow_stack find a free space */ + shstk_start = mmap(free_area, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (shstk_start == MAP_FAILED || shstk_start != free_area) + return 1; + + while (test_map > shstk_start) { + test_map = (void *)syscall(__NR_map_shadow_stack, 0, PAGE_SIZE, 0); + if (test_map == MAP_FAILED) { + printf("[INFO]\tmap_shadow_stack MAP_FAILED\n"); + ret = 1; + break; + } + + cur = malloc(sizeof(*cur)); + cur->mapping = test_map; + + cur->next = head; + head = cur; + + if (test_map == free_area + PAGE_SIZE) { + printf("[INFO]\tNew mapping has other mapping in guard gap!\n"); + ret = 1; + break; + } + } + + while (head) { + cur = head; + head = cur->next; + munmap(cur->mapping, PAGE_SIZE); + free(cur); + } + + munmap(shstk_start, PAGE_SIZE); + + if (!ret) + printf("[OK]\tGuard gap test, placement mapping's gaps\n"); + + return ret; +} + /* * Too complicated to pull it out of the 32 bit header, but also get the * 64 bit one needed above. Just define a copy here. @@ -850,9 +903,15 @@ int main(int argc, char *argv[]) goto out; } - if (test_guard_gap()) { + if (test_guard_gap_other_gaps()) { + ret = 1; + printf("[FAIL]\tGuard gap test, other mappings' gaps\n"); + goto out; + } + + if (test_guard_gap_new_mappings_gaps()) { ret = 1; - printf("[FAIL]\tGuard gap test\n"); + printf("[FAIL]\tGuard gap test, placement mapping's gaps\n"); goto out; } diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 47cab972807c..d4c8e8d79d38 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -21,6 +21,13 @@ #include <sys/uio.h> #include "helpers.h" +#include "../kselftest.h" + +#ifdef __x86_64__ +#define TOTAL_TESTS 13 +#else +#define TOTAL_TESTS 8 +#endif #ifdef __x86_64__ # define VSYS(x) (x) @@ -39,18 +46,6 @@ /* max length of lines in /proc/self/maps - anything longer is skipped here */ #define MAPS_LINE_LEN 128 -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* vsyscalls and vDSO */ bool vsyscall_map_r = false, vsyscall_map_x = false; @@ -75,83 +70,25 @@ static void init_vdso(void) if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { - printf("[WARN]\tfailed to find vDSO\n"); + ksft_print_msg("[WARN] failed to find vDSO\n"); return; } vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); if (!vdso_gtod) - printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); + ksft_print_msg("[WARN] failed to find gettimeofday in vDSO\n"); vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); if (!vdso_gettime) - printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); + ksft_print_msg("[WARN] failed to find clock_gettime in vDSO\n"); vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); if (!vdso_time) - printf("[WARN]\tfailed to find time in vDSO\n"); + ksft_print_msg("[WARN] failed to find time in vDSO\n"); vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); if (!vdso_getcpu) - printf("[WARN]\tfailed to find getcpu in vDSO\n"); -} - -static int init_vsys(void) -{ -#ifdef __x86_64__ - int nerrs = 0; - FILE *maps; - char line[MAPS_LINE_LEN]; - bool found = false; - - maps = fopen("/proc/self/maps", "r"); - if (!maps) { - printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); - vsyscall_map_r = true; - return 0; - } - - while (fgets(line, MAPS_LINE_LEN, maps)) { - char r, x; - void *start, *end; - char name[MAPS_LINE_LEN]; - - /* sscanf() is safe here as strlen(name) >= strlen(line) */ - if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", - &start, &end, &r, &x, name) != 5) - continue; - - if (strcmp(name, "[vsyscall]")) - continue; - - printf("\tvsyscall map: %s", line); - - if (start != (void *)0xffffffffff600000 || - end != (void *)0xffffffffff601000) { - printf("[FAIL]\taddress range is nonsense\n"); - nerrs++; - } - - printf("\tvsyscall permissions are %c-%c\n", r, x); - vsyscall_map_r = (r == 'r'); - vsyscall_map_x = (x == 'x'); - - found = true; - break; - } - - fclose(maps); - - if (!found) { - printf("\tno vsyscall map in /proc/self/maps\n"); - vsyscall_map_r = false; - vsyscall_map_x = false; - } - - return nerrs; -#else - return 0; -#endif + ksft_print_msg("[WARN] failed to find getcpu in vDSO\n"); } /* syscalls */ @@ -176,98 +113,76 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node, return syscall(SYS_getcpu, cpu, node, cache); } -static jmp_buf jmpbuf; -static volatile unsigned long segv_err; - -static void sigsegv(int sig, siginfo_t *info, void *ctx_void) -{ - ucontext_t *ctx = (ucontext_t *)ctx_void; - - segv_err = ctx->uc_mcontext.gregs[REG_ERR]; - siglongjmp(jmpbuf, 1); -} - static double tv_diff(const struct timeval *a, const struct timeval *b) { return (double)(a->tv_sec - b->tv_sec) + (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; } -static int check_gtod(const struct timeval *tv_sys1, - const struct timeval *tv_sys2, - const struct timezone *tz_sys, - const char *which, - const struct timeval *tv_other, - const struct timezone *tz_other) +static void check_gtod(const struct timeval *tv_sys1, + const struct timeval *tv_sys2, + const struct timezone *tz_sys, + const char *which, + const struct timeval *tv_other, + const struct timezone *tz_other) { - int nerrs = 0; double d1, d2; - if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { - printf("[FAIL] %s tz mismatch\n", which); - nerrs++; - } + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || + tz_sys->tz_dsttime != tz_other->tz_dsttime)) + ksft_print_msg("%s tz mismatch\n", which); d1 = tv_diff(tv_other, tv_sys1); d2 = tv_diff(tv_sys2, tv_other); - printf("\t%s time offsets: %lf %lf\n", which, d1, d2); - if (d1 < 0 || d2 < 0) { - printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); - nerrs++; - } else { - printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); - } + ksft_print_msg("%s time offsets: %lf %lf\n", which, d1, d2); - return nerrs; + ksft_test_result(!(d1 < 0 || d2 < 0), "%s gettimeofday()'s timeval\n", which); } -static int test_gtod(void) +static void test_gtod(void) { struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; struct timezone tz_sys, tz_vdso, tz_vsys; long ret_vdso = -1; long ret_vsys = -1; - int nerrs = 0; - printf("[RUN]\ttest gettimeofday()\n"); + ksft_print_msg("test gettimeofday()\n"); if (sys_gtod(&tv_sys1, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); if (vsyscall_map_x) ret_vsys = vgtod(&tv_vsys, &tz_vsys); if (sys_gtod(&tv_sys2, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) { - if (ret_vdso == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); - } else { - printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); - nerrs++; - } + if (ret_vdso == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); + else + ksft_test_result_fail("vDSO gettimeofday() failed: %ld\n", ret_vdso); + } else { + ksft_test_result_skip("vdso_gtod isn't set\n"); } if (vsyscall_map_x) { - if (ret_vsys == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); - } else { - printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); - nerrs++; - } + if (ret_vsys == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); + else + ksft_test_result_fail("vsys gettimeofday() failed: %ld\n", ret_vsys); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_time(void) { - int nerrs = 0; - - printf("[RUN]\ttest time()\n"); +static void test_time(void) +{ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; + + ksft_print_msg("test time()\n"); t_sys1 = sys_time(&t2_sys1); if (vdso_time) t_vdso = vdso_time(&t2_vdso); @@ -275,56 +190,60 @@ static int test_time(void) { t_vsys = vtime(&t2_vsys); t_sys2 = sys_time(&t2_sys2); if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { - printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); - nerrs++; - return nerrs; + ksft_print_msg("syscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", + t_sys1, t2_sys1, t_sys2, t2_sys2); + ksft_test_result_skip("vdso_time\n"); + ksft_test_result_skip("vdso_time\n"); + return; } if (vdso_time) { - if (t_vdso < 0 || t_vdso != t2_vdso) { - printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); - nerrs++; - } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { - printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); - nerrs++; - } else { - printf("[OK]\tvDSO time() is okay\n"); - } + if (t_vdso < 0 || t_vdso != t2_vdso) + ksft_test_result_fail("vDSO failed (ret:%ld output:%ld)\n", + t_vdso, t2_vdso); + else if (t_vdso < t_sys1 || t_vdso > t_sys2) + ksft_test_result_fail("vDSO returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vdso, t_sys2); + else + ksft_test_result_pass("vDSO time() is okay\n"); + } else { + ksft_test_result_skip("vdso_time isn't set\n"); } if (vsyscall_map_x) { - if (t_vsys < 0 || t_vsys != t2_vsys) { - printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); - nerrs++; - } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { - printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); - nerrs++; - } else { - printf("[OK]\tvsyscall time() is okay\n"); - } + if (t_vsys < 0 || t_vsys != t2_vsys) + ksft_test_result_fail("vsyscall failed (ret:%ld output:%ld)\n", + t_vsys, t2_vsys); + else if (t_vsys < t_sys1 || t_vsys > t_sys2) + ksft_test_result_fail("vsyscall returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vsys, t_sys2); + else + ksft_test_result_pass("vsyscall time() is okay\n"); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_getcpu(int cpu) +static void test_getcpu(int cpu) { - int nerrs = 0; + unsigned int cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; long ret_sys, ret_vdso = -1, ret_vsys = -1; + unsigned int node = 0; + bool have_node = false; + cpu_set_t cpuset; - printf("[RUN]\tgetcpu() on CPU %d\n", cpu); + ksft_print_msg("getcpu() on CPU %d\n", cpu); - cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { - printf("[SKIP]\tfailed to force CPU %d\n", cpu); - return nerrs; + ksft_print_msg("failed to force CPU %d\n", cpu); + ksft_test_result_skip("vdso_getcpu\n"); + ksft_test_result_skip("vsyscall_map_x\n"); + + return; } - unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; - unsigned node = 0; - bool have_node = false; ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); if (vdso_getcpu) ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); @@ -332,10 +251,9 @@ static int test_getcpu(int cpu) ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); if (ret_sys == 0) { - if (cpu_sys != cpu) { - printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); - nerrs++; - } + if (cpu_sys != cpu) + ksft_print_msg("syscall reported CPU %hu but should be %d\n", + cpu_sys, cpu); have_node = true; node = node_sys; @@ -343,63 +261,84 @@ static int test_getcpu(int cpu) if (vdso_getcpu) { if (ret_vdso) { - printf("[FAIL]\tvDSO getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vDSO getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vdso; } - if (cpu_vdso != cpu) { - printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); - nerrs++; - } else { - printf("[OK]\tvDSO reported correct CPU\n"); - } - - if (node_vdso != node) { - printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); - nerrs++; + if (cpu_vdso != cpu || node_vdso != node) { + if (cpu_vdso != cpu) + ksft_print_msg("vDSO reported CPU %hu but should be %d\n", + cpu_vdso, cpu); + if (node_vdso != node) + ksft_print_msg("vDSO reported node %hu but should be %hu\n", + node_vdso, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvDSO reported correct node\n"); + ksft_test_result_pass("vDSO reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vdso_getcpu isn't set\n"); } if (vsyscall_map_x) { if (ret_vsys) { - printf("[FAIL]\tvsyscall getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vsyscall getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vsys; } - if (cpu_vsys != cpu) { - printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); - nerrs++; + if (cpu_vsys != cpu || node_vsys != node) { + if (cpu_vsys != cpu) + ksft_print_msg("vsyscall reported CPU %hu but should be %d\n", + cpu_vsys, cpu); + if (node_vsys != node) + ksft_print_msg("vsyscall reported node %hu but should be %hu\n", + node_vsys, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvsyscall reported correct CPU\n"); - } - - if (node_vsys != node) { - printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); - nerrs++; - } else { - printf("[OK]\tvsyscall reported correct node\n"); + ksft_test_result_pass("vsyscall reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } +} + +#ifdef __x86_64__ + +static jmp_buf jmpbuf; +static volatile unsigned long segv_err; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; - return nerrs; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed\n"); } -static int test_vsys_r(void) +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { -#ifdef __x86_64__ - printf("[RUN]\tChecking read access to the vsyscall page\n"); + ucontext_t *ctx = (ucontext_t *)ctx_void; + + segv_err = ctx->uc_mcontext.gregs[REG_ERR]; + siglongjmp(jmpbuf, 1); +} + +static void test_vsys_r(void) +{ + ksft_print_msg("Checking read access to the vsyscall page\n"); bool can_read; if (sigsetjmp(jmpbuf, 1) == 0) { *(volatile int *)0xffffffffff600000; @@ -408,32 +347,25 @@ static int test_vsys_r(void) can_read = false; } - if (can_read && !vsyscall_map_r) { - printf("[FAIL]\tWe have read access, but we shouldn't\n"); - return 1; - } else if (!can_read && vsyscall_map_r) { - printf("[FAIL]\tWe don't have read access, but we should\n"); - return 1; - } else if (can_read) { - printf("[OK]\tWe have read access\n"); - } else { - printf("[OK]\tWe do not have read access: #PF(0x%lx)\n", - segv_err); - } -#endif - - return 0; + if (can_read && !vsyscall_map_r) + ksft_test_result_fail("We have read access, but we shouldn't\n"); + else if (!can_read && vsyscall_map_r) + ksft_test_result_fail("We don't have read access, but we should\n"); + else if (can_read) + ksft_test_result_pass("We have read access\n"); + else + ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); } -static int test_vsys_x(void) +static void test_vsys_x(void) { -#ifdef __x86_64__ if (vsyscall_map_x) { /* We already tested this adequately. */ - return 0; + ksft_test_result_pass("vsyscall_map_x is true\n"); + return; } - printf("[RUN]\tMake sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really page fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -443,20 +375,14 @@ static int test_vsys_x(void) can_exec = false; } - if (can_exec) { - printf("[FAIL]\tExecuting the vsyscall did not page fault\n"); - return 1; - } else if (segv_err & (1 << 4)) { /* INSTR */ - printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n", - segv_err); - } else { - printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n", - segv_err); - return 1; - } -#endif - - return 0; + if (can_exec) + ksft_test_result_fail("Executing the vsyscall did not page fault\n"); + else if (segv_err & (1 << 4)) /* INSTR */ + ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", + segv_err); + else + ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", + segv_err); } /* @@ -470,14 +396,13 @@ static int test_vsys_x(void) * fact that ptrace() ever worked was a nice courtesy of old kernels, * but the code to support it is fairly gross. */ -static int test_process_vm_readv(void) +static void test_process_vm_readv(void) { -#ifdef __x86_64__ char buf[4096]; struct iovec local, remote; int ret; - printf("[RUN]\tprocess_vm_readv() from vsyscall page\n"); + ksft_print_msg("process_vm_readv() from vsyscall page\n"); local.iov_base = buf; local.iov_len = 4096; @@ -489,27 +414,71 @@ static int test_process_vm_readv(void) * We expect process_vm_readv() to work if and only if the * vsyscall page is readable. */ - printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", vsyscall_map_r ? "FAIL" : "OK", ret, errno); - return vsyscall_map_r ? 1 : 0; + ksft_test_result(!vsyscall_map_r, + "process_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno); + return; } - if (vsyscall_map_r) { - if (!memcmp(buf, remote.iov_base, sizeof(buf))) { - printf("[OK]\tIt worked and read correct data\n"); - } else { - printf("[FAIL]\tIt worked but returned incorrect data\n"); - return 1; + if (vsyscall_map_r) + ksft_test_result(!memcmp(buf, remote.iov_base, sizeof(buf)), "Read data\n"); + else + ksft_test_result_fail("process_rm_readv() succeeded, but it should have failed in this configuration\n"); +} + +static void init_vsys(void) +{ + int nerrs = 0; + FILE *maps; + char line[MAPS_LINE_LEN]; + bool found = false; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + ksft_test_result_skip("Could not open /proc/self/maps -- assuming vsyscall is r-x\n"); + vsyscall_map_r = true; + return; + } + + while (fgets(line, MAPS_LINE_LEN, maps)) { + char r, x; + void *start, *end; + char name[MAPS_LINE_LEN]; + + /* sscanf() is safe here as strlen(name) >= strlen(line) */ + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", + &start, &end, &r, &x, name) != 5) + continue; + + if (strcmp(name, "[vsyscall]")) + continue; + + ksft_print_msg("vsyscall map: %s", line); + + if (start != (void *)0xffffffffff600000 || + end != (void *)0xffffffffff601000) { + ksft_print_msg("address range is nonsense\n"); + nerrs++; } - } else { - printf("[FAIL]\tprocess_rm_readv() succeeded, but it should have failed in this configuration\n"); - return 1; + + ksft_print_msg("vsyscall permissions are %c-%c\n", r, x); + vsyscall_map_r = (r == 'r'); + vsyscall_map_x = (x == 'x'); + + found = true; + break; } -#endif - return 0; + fclose(maps); + + if (!found) { + ksft_print_msg("no vsyscall map in /proc/self/maps\n"); + vsyscall_map_r = false; + vsyscall_map_x = false; + } + + ksft_test_result(!nerrs, "vsyscall map\n"); } -#ifdef __x86_64__ static volatile sig_atomic_t num_vsyscall_traps; static void sigtrap(int sig, siginfo_t *info, void *ctx_void) @@ -521,15 +490,17 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) num_vsyscall_traps++; } -static int test_emulation(void) +static void test_emulation(void) { time_t tmp; bool is_native; - if (!vsyscall_map_x) - return 0; + if (!vsyscall_map_x) { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); + return; + } - printf("[RUN]\tchecking that vsyscalls are emulated\n"); + ksft_print_msg("checking that vsyscalls are emulated\n"); sethandler(SIGTRAP, sigtrap, 0); set_eflags(get_eflags() | X86_EFLAGS_TF); vtime(&tmp); @@ -545,36 +516,35 @@ static int test_emulation(void) */ is_native = (num_vsyscall_traps > 1); - printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n", - (is_native ? "FAIL" : "OK"), - (is_native ? "native" : "emulated"), - (int)num_vsyscall_traps); - - return is_native; + ksft_test_result(!is_native, "vsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "native" : "emulated"), (int)num_vsyscall_traps); } #endif int main(int argc, char **argv) { - int nerrs = 0; + int total_tests = TOTAL_TESTS; - init_vdso(); - nerrs += init_vsys(); + ksft_print_header(); + ksft_set_plan(total_tests); - nerrs += test_gtod(); - nerrs += test_time(); - nerrs += test_getcpu(0); - nerrs += test_getcpu(1); - - sethandler(SIGSEGV, sigsegv, 0); - nerrs += test_vsys_r(); - nerrs += test_vsys_x(); + init_vdso(); +#ifdef __x86_64__ + init_vsys(); +#endif - nerrs += test_process_vm_readv(); + test_gtod(); + test_time(); + test_getcpu(0); + test_getcpu(1); #ifdef __x86_64__ - nerrs += test_emulation(); + sethandler(SIGSEGV, sigsegv, 0); + test_vsys_r(); + test_vsys_x(); + test_process_vm_readv(); + test_emulation(); #endif - return nerrs ? 1 : 0; + ksft_finished(); } diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 6d4ba77847b6..0b7ecfb30d19 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -3,7 +3,7 @@ STOP_ERROR := LIBTRACEEVENT_MIN_VERSION = 1.5 -LIBTRACEFS_MIN_VERSION = 1.3 +LIBTRACEFS_MIN_VERSION = 1.6 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 01870d50942a..7be17d09f7e8 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -36,13 +36,14 @@ struct osnoise_hist_params { cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; - char no_header; char no_summary; char no_index; char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct osnoise_hist_cpu { @@ -436,9 +437,9 @@ static void osnoise_hist_usage(char *usage) static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [-C[=cgroup_name]]", + " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -452,7 +453,7 @@ static void osnoise_hist_usage(char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -468,6 +469,8 @@ static void osnoise_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -531,13 +534,15 @@ static struct osnoise_hist_params {"with-zeros", no_argument, 0, '3'}, {"trigger", required_argument, 0, '4'}, {"filter", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -640,9 +645,13 @@ static struct osnoise_hist_params params->threshold = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '0') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -680,6 +689,12 @@ static struct osnoise_hist_params osnoise_hist_usage("--filter requires a previous -e\n"); } break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_hist_usage("Invalid option"); } @@ -886,6 +901,11 @@ int osnoise_hist_main(int argc, char *argv[]) goto out_hist; } + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } /* @@ -899,6 +919,25 @@ int osnoise_hist_main(int argc, char *argv[]) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_hist; + } + + } + tool->start_time = time(NULL); osnoise_hist_set_signals(params); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 457360db0767..07ba55d4ec06 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -40,6 +40,8 @@ struct osnoise_top_params { int set_sched; int cgroup; int hk_cpus; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -281,8 +283,8 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", - " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]]", + " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -296,7 +298,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -307,6 +309,8 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -381,13 +385,15 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) {"trace", optional_argument, 0, 't'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, + {"warm-up", required_argument, 0, '2'}, + {"trace-buffer-size", required_argument, 0, '3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, &option_index); /* Detect the end of the options. */ @@ -480,9 +486,13 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -511,6 +521,12 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) osnoise_top_usage(params, "--filter requires a previous -e\n"); } break; + case '2': + params->warmup = get_llong_from_str(optarg); + break; + case '3': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_top_usage(params, "Invalid option"); } @@ -719,6 +735,12 @@ int osnoise_top_main(int argc, char **argv) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } /* @@ -732,6 +754,25 @@ int osnoise_top_main(int argc, char **argv) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_top; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_top; + } + + } + tool->start_time = time(NULL); osnoise_top_set_signals(params); diff --git a/tools/tracing/rtla/src/timerlat_aa.c b/tools/tracing/rtla/src/timerlat_aa.c index 7093fd5333be..7bd80ee2a5b4 100644 --- a/tools/tracing/rtla/src/timerlat_aa.c +++ b/tools/tracing/rtla/src/timerlat_aa.c @@ -16,6 +16,9 @@ enum timelat_state { TIMERLAT_WAITING_THREAD, }; +/* Used to fill spaces in the output */ +static const char *spaces = " "; + #define MAX_COMM 24 /* @@ -274,14 +277,17 @@ static int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, + ns_to_usf(duration)); return 0; } taa_data->thread_nmi_sum += duration; - trace_seq_printf(taa_data->nmi_seq, " %24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->nmi_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, ns_to_usf(duration)); return 0; } @@ -323,8 +329,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 15, spaces, + ns_to_usf(duration)); return 0; } @@ -372,8 +380,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor * IRQ interference. */ taa_data->thread_irq_sum += duration; - trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu \t %9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -408,8 +418,10 @@ static int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *r taa_data->thread_softirq_sum += duration; - trace_seq_printf(taa_data->softirqs_seq, "\t%24s:%-3llu \t %9.2f us\n", - softirq_name[vector], vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->softirqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + softirq_name[vector], vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -452,8 +464,10 @@ static int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *re } else { taa_data->thread_thread_sum += duration; - trace_seq_printf(taa_data->threads_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - comm, pid, ns_to_usf(duration)); + trace_seq_printf(taa_data->threads_seq, " %24s:%-12llu %.*s %9.2f us\n", + comm, pid, + 15, spaces, + ns_to_usf(duration)); } return 0; @@ -482,7 +496,8 @@ static int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *rec function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]); if (!function) break; - trace_seq_printf(taa_data->stack_seq, "\t\t-> %s\n", function); + trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n", + 14, spaces, function); } } return 0; @@ -568,23 +583,24 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay; if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) { if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time) - printf(" Previous IRQ interference: \t\t up to %9.2f us\n", - ns_to_usf(taa_data->prev_irq_duration)); + printf(" Previous IRQ interference: %.*s up to %9.2f us\n", + 16, spaces, + ns_to_usf(taa_data->prev_irq_duration)); } /* * The delay that the IRQ suffered before starting. */ - printf(" IRQ handler delay: %16s %9.2f us (%.2f %%)\n", - (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", - ns_to_usf(taa_data->timer_irq_start_delay), - ns_to_per(total, taa_data->timer_irq_start_delay)); + printf(" IRQ handler delay: %.*s %16s %9.2f us (%.2f %%)\n", 16, spaces, + (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", + ns_to_usf(taa_data->timer_irq_start_delay), + ns_to_per(total, taa_data->timer_irq_start_delay)); /* * Timerlat IRQ. */ - printf(" IRQ latency: \t\t\t\t %9.2f us\n", - ns_to_usf(taa_data->tlat_irq_latency)); + printf(" IRQ latency: %.*s %9.2f us\n", 40, spaces, + ns_to_usf(taa_data->tlat_irq_latency)); if (irq) { /* @@ -595,15 +611,16 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * so it will be displayed, it is the key. */ printf(" Blocking thread:\n"); - printf(" %24s:%-9llu\n", - taa_data->run_thread_comm, taa_data->run_thread_pid); + printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm, + taa_data->run_thread_pid); } else { /* * The duration of the IRQ handler that handled the timerlat IRQ. */ - printf(" Timerlat IRQ duration: \t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->timer_irq_duration), - ns_to_per(total, taa_data->timer_irq_duration)); + printf(" Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n", + 30, spaces, + ns_to_usf(taa_data->timer_irq_duration), + ns_to_per(total, taa_data->timer_irq_duration)); /* * The amount of time that the current thread postponed the scheduler. @@ -611,13 +628,13 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Recalling that it is net from NMI/IRQ/Softirq interference, so there * is no need to compute values here. */ - printf(" Blocking thread: \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_blocking_duration), - ns_to_per(total, taa_data->thread_blocking_duration)); + printf(" Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_blocking_duration), + ns_to_per(total, taa_data->thread_blocking_duration)); - printf(" %24s:%-9llu %9.2f us\n", - taa_data->run_thread_comm, taa_data->run_thread_pid, - ns_to_usf(taa_data->thread_blocking_duration)); + printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces, + taa_data->run_thread_comm, taa_data->run_thread_pid, + 12, spaces, ns_to_usf(taa_data->thread_blocking_duration)); } /* @@ -629,9 +646,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * NMIs can happen during the IRQ, so they are always possible. */ if (taa_data->thread_nmi_sum) - printf(" NMI interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_nmi_sum), - ns_to_per(total, taa_data->thread_nmi_sum)); + printf(" NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_nmi_sum), + ns_to_per(total, taa_data->thread_nmi_sum)); /* * If it is an IRQ latency, the other factors can be skipped. @@ -643,9 +660,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by IRQs to the thread latency. */ if (taa_data->thread_irq_sum) { - printf(" IRQ interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_irq_sum), - ns_to_per(total, taa_data->thread_irq_sum)); + printf(" IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_irq_sum), + ns_to_per(total, taa_data->thread_irq_sum)); trace_seq_do_printf(taa_data->irqs_seq); } @@ -654,9 +671,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by Softirqs to the thread latency. */ if (taa_data->thread_softirq_sum) { - printf(" Softirq interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_softirq_sum), - ns_to_per(total, taa_data->thread_softirq_sum)); + printf(" Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces, + ns_to_usf(taa_data->thread_softirq_sum), + ns_to_per(total, taa_data->thread_softirq_sum)); trace_seq_do_printf(taa_data->softirqs_seq); } @@ -670,9 +687,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * timer handling latency. */ if (taa_data->thread_thread_sum) { - printf(" Thread interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_thread_sum), - ns_to_per(total, taa_data->thread_thread_sum)); + printf(" Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces, + ns_to_usf(taa_data->thread_thread_sum), + ns_to_per(total, taa_data->thread_thread_sum)); trace_seq_do_printf(taa_data->threads_seq); } @@ -682,8 +699,8 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, */ print_total: printf("------------------------------------------------------------------------\n"); - printf(" %s latency: \t\t\t %9.2f us (100%%)\n", irq ? "IRQ" : "Thread", - ns_to_usf(total)); + printf(" %s latency: %.*s %9.2f us (100%%)\n", irq ? " IRQ" : "Thread", + 37, spaces, ns_to_usf(total)); } static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8bd51aab6513..a3907c390d67 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -40,6 +40,7 @@ struct timerlat_hist_params { int no_aa; int dump_tasks; int user_workload; + int kernel_workload; int user_hist; cpu_set_t hk_cpu_set; struct sched_attr sched_param; @@ -52,6 +53,8 @@ struct timerlat_hist_params { char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct timerlat_hist_cpu { @@ -324,17 +327,29 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); @@ -384,25 +399,164 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } +static void +timerlat_print_stats_all(struct timerlat_hist_params *params, + struct trace_instance *trace, + struct timerlat_hist_data *data) +{ + struct timerlat_hist_cpu *cpu_data; + struct timerlat_hist_cpu sum; + int cpu; + + if (params->no_summary) + return; + + memset(&sum, 0, sizeof(sum)); + sum.min_irq = ~0; + sum.min_thread = ~0; + sum.min_user = ~0; + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !CPU_ISSET(cpu, ¶ms->monitored_cpus)) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + cpu_data = &data->hist[cpu]; + + sum.irq_count += cpu_data->irq_count; + update_min(&sum.min_irq, &cpu_data->min_irq); + update_sum(&sum.sum_irq, &cpu_data->sum_irq); + update_max(&sum.max_irq, &cpu_data->max_irq); + + sum.thread_count += cpu_data->thread_count; + update_min(&sum.min_thread, &cpu_data->min_thread); + update_sum(&sum.sum_thread, &cpu_data->sum_thread); + update_max(&sum.max_thread, &cpu_data->max_thread); + + sum.user_count += cpu_data->user_count; + update_min(&sum.min_user, &cpu_data->min_user); + update_sum(&sum.sum_user, &cpu_data->sum_user); + update_max(&sum.max_user, &cpu_data->max_user); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "ALL: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, " IRQ"); + + if (!params->no_thread) + trace_seq_printf(trace->seq, " Thr"); + + if (params->user_hist) + trace_seq_printf(trace->seq, " Usr"); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9d ", + sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_user); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_irq / sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_thread / sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_user / sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_user); + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + /* - * timerlat_print_stats - print data for all CPUs + * timerlat_print_stats - print data for each CPUs */ static void timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool) @@ -485,6 +639,7 @@ timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *t trace_seq_reset(trace->seq); timerlat_print_summary(params, trace, data); + timerlat_print_stats_all(params, trace, data); } /* @@ -497,9 +652,10 @@ static void timerlat_hist_usage(char *usage) char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", - " [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u]", + " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", + " [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -513,7 +669,7 @@ static void timerlat_hist_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -534,8 +690,11 @@ static void timerlat_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -597,6 +756,7 @@ static struct timerlat_hist_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, @@ -610,13 +770,15 @@ static struct timerlat_hist_params {"dma-latency", required_argument, 0, '8'}, {"no-aa", no_argument, 0, '9'}, {"dump-task", no_argument, 0, '\1'}, + {"warm-up", required_argument, 0, '\2'}, + {"trace-buffer-size", required_argument, 0, '\3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3", long_options, &option_index); /* detect the end of the options. */ @@ -699,6 +861,9 @@ static struct timerlat_hist_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = 1; + break; case 'n': params->output_divisor = 1; break; @@ -720,9 +885,13 @@ static struct timerlat_hist_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; break; @@ -785,6 +954,12 @@ static struct timerlat_hist_params case '\1': params->dump_tasks = 1; break; + case '\2': + params->warmup = get_llong_from_str(optarg); + break; + case '\3': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_hist_usage("Invalid option"); } @@ -807,6 +982,9 @@ static struct timerlat_hist_params if (!params->stop_us && !params->stop_total_us) params->no_aa = 1; + if (params->kernel_workload && params->user_workload) + timerlat_hist_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -882,6 +1060,22 @@ timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_param auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_hist = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_hist) { retval = osnoise_set_workload(tool->context, 0); if (retval) { @@ -1019,6 +1213,12 @@ int timerlat_hist_main(int argc, char *argv[]) if (retval) goto out_hist; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } if (!params->no_aa) { @@ -1039,22 +1239,6 @@ int timerlat_hist_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - tool->start_time = time(NULL); - timerlat_hist_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -1074,6 +1258,29 @@ int timerlat_hist_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + tool->start_time = time(NULL); + timerlat_hist_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); @@ -1099,6 +1306,7 @@ int timerlat_hist_main(int argc, char *argv[]) } } } + if (params->user_workload && !params_u.stopped_running) { params_u.should_run = 0; sleep(1); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 8a3fa64319c6..8c16419fe22a 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -44,6 +44,10 @@ struct timerlat_top_params { int hk_cpus; int user_top; int user_workload; + int kernel_workload; + int pretty_output; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -118,6 +122,37 @@ cleanup: return NULL; } +static void +timerlat_top_reset_sum(struct timerlat_top_cpu *summary) +{ + memset(summary, 0, sizeof(*summary)); + summary->min_irq = ~0; + summary->min_thread = ~0; + summary->min_user = ~0; +} + +static void +timerlat_top_update_sum(struct osnoise_tool *tool, int cpu, struct timerlat_top_cpu *sum) +{ + struct timerlat_top_data *data = tool->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + + sum->irq_count += cpu_data->irq_count; + update_min(&sum->min_irq, &cpu_data->min_irq); + update_sum(&sum->sum_irq, &cpu_data->sum_irq); + update_max(&sum->max_irq, &cpu_data->max_irq); + + sum->thread_count += cpu_data->thread_count; + update_min(&sum->min_thread, &cpu_data->min_thread); + update_sum(&sum->sum_thread, &cpu_data->sum_thread); + update_max(&sum->max_thread, &cpu_data->max_thread); + + sum->user_count += cpu_data->user_count; + update_min(&sum->min_user, &cpu_data->min_user); + update_sum(&sum->sum_user, &cpu_data->sum_user); + update_max(&sum->max_user, &cpu_data->max_user); +} + /* * timerlat_hist_update - record a new timerlat occurent on cpu, updating data */ @@ -179,19 +214,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record, /* * timerlat_top_header - print the header of the tool output */ -static void timerlat_top_header(struct osnoise_tool *top) +static void timerlat_top_header(struct timerlat_top_params *params, struct osnoise_tool *top) { - struct timerlat_top_params *params = top->params; struct trace_seq *s = top->trace.seq; char duration[26]; get_duration(top->start_time, duration, sizeof(duration)); - trace_seq_printf(s, "\033[2;37;40m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Timer Latency "); if (params->user_top) trace_seq_printf(s, " "); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); trace_seq_printf(s, "%-6s | IRQ Timer Latency (%s) | Thread Timer Latency (%s)", duration, @@ -204,14 +242,20 @@ static void timerlat_top_header(struct osnoise_tool *top) } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "\033[2;30;47m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU COUNT | cur min avg max | cur min avg max"); if (params->user_top) trace_seq_printf(s, " | cur min avg max"); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); } +static const char *no_value = " -"; + /* * timerlat_top_print - prints the output of a given CPU */ @@ -239,10 +283,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, "%3d #%-9d |", cpu, cpu_data->irq_count); if (!cpu_data->irq_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - |"); + trace_seq_printf(s, "%s %s %s %s |", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_irq / params->output_divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_irq / params->output_divisor); @@ -251,10 +292,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } if (!cpu_data->thread_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_thread / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_thread / divisor); @@ -271,10 +309,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, " |"); if (!cpu_data->user_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s\n", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_user / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_user / divisor); @@ -285,6 +320,77 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } /* + * timerlat_top_print_sum - prints the summary output + */ +static void +timerlat_top_print_sum(struct osnoise_tool *top, struct timerlat_top_cpu *summary) +{ + const char *split = "----------------------------------------"; + struct timerlat_top_params *params = top->params; + unsigned long long count = summary->irq_count; + int divisor = params->output_divisor; + struct trace_seq *s = top->trace.seq; + int e = 0; + + if (divisor == 0) + return; + + /* + * Skip if no data is available: is this cpu offline? + */ + if (!summary->irq_count && !summary->thread_count) + return; + + while (count > 999999) { + e++; + count /= 10; + } + + trace_seq_printf(s, "%.*s|%.*s|%.*s", 15, split, 40, split, 39, split); + if (params->user_top) + trace_seq_printf(s, "-|%.*s", 39, split); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "ALL #%-6llu e%d |", count, e); + + if (!summary->irq_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", (summary->sum_irq / summary->irq_count) / divisor); + trace_seq_printf(s, "%9llu |", summary->max_irq / divisor); + } + + if (!summary->thread_count) { + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_thread / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_thread / summary->thread_count) / divisor); + trace_seq_printf(s, "%9llu", summary->max_thread / divisor); + } + + if (!params->user_top) { + trace_seq_printf(s, "\n"); + return; + } + + trace_seq_printf(s, " |"); + + if (!summary->user_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_user / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_user / summary->user_count) / divisor); + trace_seq_printf(s, "%9llu\n", summary->max_user / divisor); + } +} + +/* * clear_terminal - clears the output terminal */ static void clear_terminal(struct trace_seq *seq) @@ -300,6 +406,7 @@ static void timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top) { struct trace_instance *trace = &top->trace; + struct timerlat_top_cpu summary; static int nr_cpus = -1; int i; @@ -312,14 +419,19 @@ timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *to if (!params->quiet) clear_terminal(trace->seq); - timerlat_top_header(top); + timerlat_top_reset_sum(&summary); + + timerlat_top_header(params, top); for (i = 0; i < nr_cpus; i++) { if (params->cpus && !CPU_ISSET(i, ¶ms->monitored_cpus)) continue; timerlat_top_print(top, i); + timerlat_top_update_sum(top, i, &summary); } + timerlat_top_print_sum(top, &summary); + trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } @@ -334,8 +446,8 @@ static void timerlat_top_usage(char *usage) static const char *const msg[] = { "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", - " [[-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u]", + " [[-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -350,7 +462,7 @@ static void timerlat_top_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " -D/--debug: print debug info", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <command>: enable a trace event filter to the previous -e event", " --trigger <command>: enable a trace event trigger to the previous -e event", @@ -364,8 +476,11 @@ static void timerlat_top_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -425,6 +540,7 @@ static struct timerlat_top_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, @@ -432,13 +548,15 @@ static struct timerlat_top_params {"no-aa", no_argument, 0, '3'}, {"dump-tasks", no_argument, 0, '4'}, {"aa-only", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -523,6 +641,9 @@ static struct timerlat_top_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = true; + break; case 'n': params->output_divisor = 1; break; @@ -547,9 +668,13 @@ static struct timerlat_top_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; @@ -595,6 +720,12 @@ static struct timerlat_top_params case '4': params->dump_tasks = 1; break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_top_usage("Invalid option"); } @@ -614,6 +745,9 @@ static struct timerlat_top_params if (params->no_aa && params->aa_only) timerlat_top_usage("--no-aa and --aa-only are mutually exclusive!"); + if (params->kernel_workload && params->user_workload) + timerlat_top_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -692,6 +826,22 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_top = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_top) { retval = osnoise_set_workload(top->context, 0); if (retval) { @@ -700,6 +850,9 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * } } + if (isatty(1) && !params->quiet) + params->pretty_output = 1; + return 0; out_err: @@ -830,6 +983,12 @@ int timerlat_top_main(int argc, char *argv[]) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } if (!params->no_aa) { @@ -859,22 +1018,6 @@ int timerlat_top_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa && aa != top) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - top->start_time = time(NULL); - timerlat_top_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -894,6 +1037,27 @@ int timerlat_top_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa && aa != top) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + top->start_time = time(NULL); + timerlat_top_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index e1ba6d9f4265..170a706248ab 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -540,3 +540,18 @@ int trace_is_off(struct trace_instance *tool, struct trace_instance *trace) return 0; } + +/* + * trace_set_buffer_size - set the per-cpu tracing buffer size. + */ +int trace_set_buffer_size(struct trace_instance *trace, int size) +{ + int retval; + + debug_msg("Setting trace buffer size to %d Kb\n", size); + retval = tracefs_instance_set_buffer_size(trace->inst, size, -1); + if (retval) + err_msg("Error setting trace buffer size\n"); + + return retval; +} diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h index 2e9a89a25615..c7c92dc9a18a 100644 --- a/tools/tracing/rtla/src/trace.h +++ b/tools/tracing/rtla/src/trace.h @@ -48,3 +48,4 @@ int trace_events_enable(struct trace_instance *instance, int trace_event_add_filter(struct trace_events *event, char *filter); int trace_event_add_trigger(struct trace_events *event, char *trigger); int trace_is_off(struct trace_instance *tool, struct trace_instance *trace); +int trace_set_buffer_size(struct trace_instance *trace, int size); diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index a8856a9c45dc..9e964c5be40c 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -32,16 +32,13 @@ https://github.com/osandov/drgn. rescued The number of work items executed by the rescuer. """ -import sys import signal -import os import re import time import json import drgn -from drgn.helpers.linux.list import list_for_each_entry,list_empty -from drgn.helpers.linux.cpumask import for_each_possible_cpu +from drgn.helpers.linux.list import list_for_each_entry import argparse parser = argparse.ArgumentParser(description=desc, @@ -54,10 +51,6 @@ parser.add_argument('-j', '--json', action='store_true', help='Output in json') args = parser.parse_args() -def err(s): - print(s, file=sys.stderr, flush=True) - sys.exit(1) - workqueues = prog['workqueues'] WQ_UNBOUND = prog['WQ_UNBOUND'] diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py new file mode 100644 index 000000000000..5e3591f1f9a9 --- /dev/null +++ b/tools/writeback/wb_monitor.py @@ -0,0 +1,172 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Kemeng Shi <shikemeng@huaweicloud.com> +# Copyright (C) 2024 Huawei Inc + +desc = """ +This is a drgn script based on wq_monitor.py to monitor writeback info on +backing dev. For more info on drgn, visit https://github.com/osandov/drgn. + + writeback(kB) Amount of dirty pages are currently being written back to + disk. + + reclaimable(kB) Amount of pages are currently reclaimable. + + dirtied(kB) Amount of pages have been dirtied. + + wrttien(kB) Amount of dirty pages have been written back to disk. + + avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages + back to disk. +""" + +import signal +import re +import time +import json + +import drgn +from drgn.helpers.linux.list import list_for_each_entry + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('bdi', metavar='REGEX', nargs='*', + help='Target backing device name patterns (all if empty)') +parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1, + help='Monitoring interval (0 to print once and exit)') +parser.add_argument('-j', '--json', action='store_true', + help='Output in json') +parser.add_argument('-c', '--cgroup', action='store_true', + help='show writeback of bdi in cgroup') +args = parser.parse_args() + +bdi_list = prog['bdi_list'] + +WB_RECLAIMABLE = prog['WB_RECLAIMABLE'] +WB_WRITEBACK = prog['WB_WRITEBACK'] +WB_DIRTIED = prog['WB_DIRTIED'] +WB_WRITTEN = prog['WB_WRITTEN'] +NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS'] + +PAGE_SHIFT = prog['PAGE_SHIFT'] + +def K(x): + return x << (PAGE_SHIFT - 10) + +class Stats: + def dict(self, now): + return { 'timestamp' : now, + 'name' : self.name, + 'writeback' : self.stats[WB_WRITEBACK], + 'reclaimable' : self.stats[WB_RECLAIMABLE], + 'dirtied' : self.stats[WB_DIRTIED], + 'written' : self.stats[WB_WRITTEN], + 'avg_wb' : self.avg_bw, } + + def table_header_str(): + return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \ + f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}' + + def table_row_str(self): + out = f'{self.name[-16:]:16} ' \ + f'{self.stats[WB_WRITEBACK]:10} ' \ + f'{self.stats[WB_RECLAIMABLE]:12} ' \ + f'{self.stats[WB_DIRTIED]:9} ' \ + f'{self.stats[WB_WRITTEN]:9} ' \ + f'{self.avg_bw:9} ' + return out + + def show_header(): + if Stats.table_fmt: + print() + print(Stats.table_header_str()) + + def show_stats(self): + if Stats.table_fmt: + print(self.table_row_str()) + else: + print(self.dict(Stats.now)) + +class WbStats(Stats): + def __init__(self, wb): + bdi_name = wb.bdi.dev_name.string_().decode() + # avoid to use bdi.wb.memcg_css which is only defined when + # CONFIG_CGROUP_WRITEBACK is enabled + if wb == wb.bdi.wb.address_of_(): + ino = "1" + else: + ino = str(wb.memcg_css.cgroup.kn.id.value_()) + self.name = bdi_name + '_' + ino + + self.stats = [0] * NR_WB_STAT_ITEMS + for i in range(NR_WB_STAT_ITEMS): + if wb.stat[i].count >= 0: + self.stats[i] = int(K(wb.stat[i].count)) + else: + self.stats[i] = 0 + + self.avg_bw = int(K(wb.avg_write_bandwidth)) + +class BdiStats(Stats): + def __init__(self, bdi): + self.name = bdi.dev_name.string_().decode() + self.stats = [0] * NR_WB_STAT_ITEMS + self.avg_bw = 0 + + def collectStats(self, wb_stats): + for i in range(NR_WB_STAT_ITEMS): + self.stats[i] += wb_stats.stats[i] + + self.avg_bw += wb_stats.avg_bw + +exit_req = False + +def sigint_handler(signr, frame): + global exit_req + exit_req = True + +def main(): + # handle args + Stats.table_fmt = not args.json + interval = args.interval + cgroup = args.cgroup + + re_str = None + if args.bdi: + for r in args.bdi: + if re_str is None: + re_str = r + else: + re_str += '|' + r + + filter_re = re.compile(re_str) if re_str else None + + # monitoring loop + signal.signal(signal.SIGINT, sigint_handler) + + while not exit_req: + Stats.now = time.time() + + Stats.show_header() + for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'): + bdi_stats = BdiStats(bdi) + if filter_re and not filter_re.search(bdi_stats.name): + continue + + for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'): + wb_stats = WbStats(wb) + bdi_stats.collectStats(wb_stats) + if cgroup: + wb_stats.show_stats() + + bdi_stats.show_stats() + if cgroup and Stats.table_fmt: + print() + + if interval == 0: + break + time.sleep(interval) + +if __name__ == "__main__": + main() |