summaryrefslogtreecommitdiff
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to 'samples')
-rw-r--r--samples/bpf/.gitignore1
-rw-r--r--samples/bpf/Makefile36
-rw-r--r--samples/bpf/cpustat_kern.c36
-rw-r--r--samples/bpf/cpustat_user.c47
-rw-r--r--samples/bpf/hbm.c3
-rw-r--r--samples/bpf/lathist_kern.c24
-rw-r--r--samples/bpf/lathist_user.c42
-rw-r--r--samples/bpf/offwaketime_kern.c52
-rw-r--r--samples/bpf/offwaketime_user.c66
-rw-r--r--samples/bpf/sockex3_kern.c12
-rw-r--r--samples/bpf/sockex3_user.c6
-rw-r--r--samples/bpf/spintest_kern.c36
-rw-r--r--samples/bpf/spintest_user.c68
-rw-r--r--samples/bpf/syscall_tp_kern.c24
-rw-r--r--samples/bpf/syscall_tp_user.c54
-rw-r--r--samples/bpf/task_fd_query_kern.c2
-rw-r--r--samples/bpf/task_fd_query_user.c2
-rw-r--r--samples/bpf/test_current_task_under_cgroup_kern.c27
-rw-r--r--samples/bpf/test_current_task_under_cgroup_user.c52
-rw-r--r--samples/bpf/test_map_in_map_kern.c7
-rw-r--r--samples/bpf/test_probe_write_user_kern.c12
-rw-r--r--samples/bpf/test_probe_write_user_user.c49
-rw-r--r--samples/bpf/trace_output_kern.c15
-rw-r--r--samples/bpf/trace_output_user.c55
-rw-r--r--samples/bpf/tracex3_kern.c2
-rw-r--r--samples/bpf/tracex5_user.c6
-rw-r--r--samples/bpf/xdp_monitor_kern.c60
-rw-r--r--samples/bpf/xdp_monitor_user.c159
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c155
-rw-r--r--samples/bpf/xdp_sample_pkts_kern.c14
-rw-r--r--samples/bpf/xdp_sample_pkts_user.c1
-rw-r--r--samples/bpf/xdpsock_user.c406
-rw-r--r--samples/bpf/xsk_fwd.c1085
-rw-r--r--samples/configfs/configfs_sample.c59
-rw-r--r--samples/kprobes/kprobe_example.c6
-rw-r--r--samples/kprobes/kretprobe_example.c4
-rw-r--r--samples/mic/mpssd/mpssd.c24
-rw-r--r--samples/nitro_enclaves/.gitignore2
-rw-r--r--samples/nitro_enclaves/Makefile16
-rw-r--r--samples/nitro_enclaves/ne_ioctl_sample.c883
-rw-r--r--samples/vfio-mdev/mbochs.c3
41 files changed, 3083 insertions, 530 deletions
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
index 034800c4d1e6..b2f29bc8dc43 100644
--- a/samples/bpf/.gitignore
+++ b/samples/bpf/.gitignore
@@ -50,4 +50,5 @@ xdp_rxq_info
xdp_sample_pkts
xdp_tx_iptunnel
xdpsock
+xsk_fwd
testfile.img
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index f87ee02073ba..aeebf5d12f32 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -48,6 +48,7 @@ tprogs-y += syscall_tp
tprogs-y += cpustat
tprogs-y += xdp_adjust_tail
tprogs-y += xdpsock
+tprogs-y += xsk_fwd
tprogs-y += xdp_fwd
tprogs-y += task_fd_query
tprogs-y += xdp_sample_pkts
@@ -71,12 +72,12 @@ tracex4-objs := tracex4_user.o
tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
tracex6-objs := tracex6_user.o
tracex7-objs := tracex7_user.o
-test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
-trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
-lathist-objs := bpf_load.o lathist_user.o
-offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS)
-spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS)
-map_perf_test-objs := bpf_load.o map_perf_test_user.o
+test_probe_write_user-objs := test_probe_write_user_user.o
+trace_output-objs := trace_output_user.o $(TRACE_HELPERS)
+lathist-objs := lathist_user.o
+offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := spintest_user.o $(TRACE_HELPERS)
+map_perf_test-objs := map_perf_test_user.o
test_overhead-objs := bpf_load.o test_overhead_user.o
test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
test_cgrp2_attach-objs := test_cgrp2_attach.o
@@ -86,7 +87,7 @@ xdp1-objs := xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := xdp1_user.o
xdp_router_ipv4-objs := xdp_router_ipv4_user.o
-test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \
+test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
test_current_task_under_cgroup_user.o
trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
sampleip-objs := sampleip_user.o $(TRACE_HELPERS)
@@ -97,13 +98,14 @@ test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
xdp_redirect-objs := xdp_redirect_user.o
xdp_redirect_map-objs := xdp_redirect_map_user.o
-xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
-xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
+xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
+xdp_monitor-objs := xdp_monitor_user.o
xdp_rxq_info-objs := xdp_rxq_info_user.o
-syscall_tp-objs := bpf_load.o syscall_tp_user.o
-cpustat-objs := bpf_load.o cpustat_user.o
+syscall_tp-objs := syscall_tp_user.o
+cpustat-objs := cpustat_user.o
xdp_adjust_tail-objs := xdp_adjust_tail_user.o
xdpsock-objs := xdpsock_user.o
+xsk_fwd-objs := xsk_fwd.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
@@ -203,11 +205,14 @@ TPROGLDLIBS_trace_output += -lrt
TPROGLDLIBS_map_perf_test += -lrt
TPROGLDLIBS_test_overhead += -lrt
TPROGLDLIBS_xdpsock += -pthread
+TPROGLDLIBS_xsk_fwd += -pthread
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
LLC ?= llc
CLANG ?= clang
+OPT ?= opt
+LLVM_DIS ?= llvm-dis
LLVM_OBJCOPY ?= llvm-objcopy
BTF_PAHOLE ?= pahole
@@ -300,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
# useless for BPF samples.
+# below we use long chain of commands, clang | opt | llvm-dis | llc,
+# to generate final object file. 'clang' compiles the source into IR
+# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin
+# processing (llvm12) and IR optimizations. 'llvm-dis' converts
+# 'opt' output to IR, and finally 'llc' generates bpf byte code.
$(obj)/%.o: $(src)/%.c
@echo " CLANG-bpf " $@
$(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
@@ -311,7 +321,9 @@ $(obj)/%.o: $(src)/%.c
-Wno-address-of-packed-member -Wno-tautological-compare \
-Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
-I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
- -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
+ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
+ $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
ifeq ($(DWARF2BTF),y)
$(BTF_PAHOLE) -J $@
endif
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
index a86a19d5f033..5aefd19cdfa1 100644
--- a/samples/bpf/cpustat_kern.c
+++ b/samples/bpf/cpustat_kern.c
@@ -51,28 +51,28 @@ static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
#define MAP_OFF_PSTATE_IDX 3
#define MAP_OFF_NUM 4
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAP_OFF_NUM,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAP_OFF_NUM);
+} my_map SEC(".maps");
/* cstate_duration records duration time for every idle state per CPU */
-struct bpf_map_def SEC("maps") cstate_duration = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
+} cstate_duration SEC(".maps");
/* pstate_duration records duration time for every operating point per CPU */
-struct bpf_map_def SEC("maps") pstate_duration = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
+} pstate_duration SEC(".maps");
/*
* The trace events for cpu_idle and cpu_frequency are taken from:
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
index 869a99406dbf..96675985e9e0 100644
--- a/samples/bpf/cpustat_user.c
+++ b/samples/bpf/cpustat_user.c
@@ -9,7 +9,6 @@
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
-#include <linux/bpf.h>
#include <locale.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -18,7 +17,9 @@
#include <sys/wait.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
+
+static int cstate_map_fd, pstate_map_fd;
#define MAX_CPU 8
#define MAX_PSTATE_ENTRIES 5
@@ -181,21 +182,50 @@ static void int_exit(int sig)
{
cpu_stat_inject_cpu_idle_event();
cpu_stat_inject_cpu_frequency_event();
- cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_update(cstate_map_fd, pstate_map_fd);
cpu_stat_print();
exit(0);
}
int main(int argc, char **argv)
{
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
int ret;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ printf("finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration");
+ pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration");
+ if (cstate_map_fd < 0 || pstate_map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
ret = cpu_stat_inject_cpu_idle_event();
@@ -210,10 +240,13 @@ int main(int argc, char **argv)
signal(SIGTERM, int_exit);
while (1) {
- cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_update(cstate_map_fd, pstate_map_fd);
cpu_stat_print();
sleep(5);
}
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index 4b22ace52f80..ff4c533dfac2 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -40,6 +40,7 @@
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
+#include <linux/compiler.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
@@ -483,7 +484,7 @@ int main(int argc, char **argv)
"Option -%c requires an argument.\n\n",
optopt);
case 'h':
- fallthrough;
+ __fallthrough;
default:
Usage();
return 0;
diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c
index ca9c2e4e69aa..4adfcbbe6ef4 100644
--- a/samples/bpf/lathist_kern.c
+++ b/samples/bpf/lathist_kern.c
@@ -18,12 +18,12 @@
* trace_preempt_[on|off] tracepoints hooks is not supported.
*/
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU);
+} my_map SEC(".maps");
SEC("kprobe/trace_preempt_off")
int bpf_prog1(struct pt_regs *ctx)
@@ -61,12 +61,12 @@ static unsigned int log2l(unsigned long v)
return log2(v);
}
-struct bpf_map_def SEC("maps") my_lat = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(long),
- .max_entries = MAX_CPU * MAX_ENTRIES,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, long);
+ __uint(max_entries, MAX_CPU * MAX_ENTRIES);
+} my_lat SEC(".maps");
SEC("kprobe/trace_preempt_on")
int bpf_prog2(struct pt_regs *ctx)
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
index 2ff2839a52d5..7d8ff2418303 100644
--- a/samples/bpf/lathist_user.c
+++ b/samples/bpf/lathist_user.c
@@ -6,9 +6,8 @@
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
-#include <linux/bpf.h>
+#include <bpf/libbpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
#define MAX_ENTRIES 20
#define MAX_CPU 4
@@ -81,20 +80,51 @@ static void get_data(int fd)
int main(int argc, char **argv)
{
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ int map_fd, i = 0;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
}
while (1) {
- get_data(map_fd[1]);
+ get_data(map_fd);
print_hist();
sleep(5);
}
+cleanup:
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
index e74ee1cd4b9c..14b792915a9c 100644
--- a/samples/bpf/offwaketime_kern.c
+++ b/samples/bpf/offwaketime_kern.c
@@ -28,38 +28,38 @@ struct key_t {
u32 tret;
};
-struct bpf_map_def SEC("maps") counts = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct key_t),
- .value_size = sizeof(u64),
- .max_entries = 10000,
-};
-
-struct bpf_map_def SEC("maps") start = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct key_t);
+ __type(value, u64);
+ __uint(max_entries, 10000);
+} counts SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, 10000);
+} start SEC(".maps");
struct wokeby_t {
char name[TASK_COMM_LEN];
u32 ret;
};
-struct bpf_map_def SEC("maps") wokeby = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(u32),
- .value_size = sizeof(struct wokeby_t),
- .max_entries = 10000,
-};
-
-struct bpf_map_def SEC("maps") stackmap = {
- .type = BPF_MAP_TYPE_STACK_TRACE,
- .key_size = sizeof(u32),
- .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, struct wokeby_t);
+ __uint(max_entries, 10000);
+} wokeby SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 51c7da5341cc..5734cfdaaacb 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -5,19 +5,19 @@
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
-#include <linux/bpf.h>
-#include <string.h>
#include <linux/perf_event.h>
#include <errno.h>
-#include <assert.h>
#include <stdbool.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
-#include "bpf_load.h"
+#include <bpf/bpf.h>
#include "trace_helpers.h"
#define PRINT_RAW_ADDR 0
+/* counts, stackmap */
+static int map_fd[2];
+
static void print_ksym(__u64 addr)
{
struct ksym *sym;
@@ -52,14 +52,14 @@ static void print_stack(struct key_t *key, __u64 count)
int i;
printf("%s;", key->target);
- if (bpf_map_lookup_elem(map_fd[3], &key->tret, ip) != 0) {
+ if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) {
printf("---;");
} else {
for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
print_ksym(ip[i]);
}
printf("-;");
- if (bpf_map_lookup_elem(map_fd[3], &key->wret, ip) != 0) {
+ if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) {
printf("---;");
} else {
for (i = 0; i < PERF_MAX_STACK_DEPTH; i++)
@@ -96,23 +96,54 @@ static void int_exit(int sig)
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_object *obj = NULL;
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ int delay = 1, i = 0;
char filename[256];
- int delay = 1;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
- setrlimit(RLIMIT_MEMLOCK, &r);
- signal(SIGINT, int_exit);
- signal(SIGTERM, int_exit);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
if (load_kallsyms()) {
printf("failed to process /proc/kallsyms\n");
return 2;
}
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ obj = NULL;
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap");
+ if (map_fd[0] < 0 || map_fd[1] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
}
if (argc > 1)
@@ -120,5 +151,10 @@ int main(int argc, char **argv)
sleep(delay);
print_stacks(map_fd[0]);
+cleanup:
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
index cab9cca0b8eb..b363503357e5 100644
--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
@@ -31,11 +31,13 @@ struct {
#define PARSE_IP 3
#define PARSE_IPV6 4
-/* protocol dispatch routine.
- * It tail-calls next BPF program depending on eth proto
- * Note, we could have used:
- * bpf_tail_call(skb, &jmp_table, proto);
- * but it would need large prog_array
+/* Protocol dispatch routine. It tail-calls next BPF program depending
+ * on eth proto. Note, we could have used ...
+ *
+ * bpf_tail_call(skb, &jmp_table, proto);
+ *
+ * ... but it would need large prog_array and cannot be optimised given
+ * the map key is not static.
*/
static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
{
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 4dbee7427d47..7793f6a6ae7e 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -29,8 +29,8 @@ int main(int argc, char **argv)
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_program *prog;
struct bpf_object *obj;
+ const char *section;
char filename[256];
- const char *title;
FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
@@ -58,8 +58,8 @@ int main(int argc, char **argv)
bpf_object__for_each_program(prog, obj) {
fd = bpf_program__fd(prog);
- title = bpf_program__title(prog, false);
- if (sscanf(title, "socket/%d", &key) != 1) {
+ section = bpf_program__section_name(prog);
+ if (sscanf(section, "socket/%d", &key) != 1) {
fprintf(stderr, "ERROR: finding prog failed\n");
goto cleanup;
}
diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c
index f508af357251..455da77319d9 100644
--- a/samples/bpf/spintest_kern.c
+++ b/samples/bpf/spintest_kern.c
@@ -12,25 +12,25 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
-struct bpf_map_def SEC("maps") my_map2 = {
- .type = BPF_MAP_TYPE_PERCPU_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, long);
+ __uint(max_entries, 1024);
+} my_map SEC(".maps");
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(long));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, 1024);
+} my_map2 SEC(".maps");
-struct bpf_map_def SEC("maps") stackmap = {
- .type = BPF_MAP_TYPE_STACK_TRACE,
- .key_size = sizeof(u32),
- .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
#define PROG(foo) \
int foo(struct pt_regs *ctx) \
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index fb430ea2ef51..f090d0dc60d6 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -1,40 +1,77 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <unistd.h>
-#include <linux/bpf.h>
#include <string.h>
#include <assert.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
-#include "bpf_load.h"
+#include <bpf/bpf.h>
#include "trace_helpers.h"
int main(int ac, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ char filename[256], symbol[256];
+ struct bpf_object *obj = NULL;
+ struct bpf_link *links[20];
long key, next_key, value;
- char filename[256];
+ struct bpf_program *prog;
+ int map_fd, i, j = 0;
+ const char *section;
struct ksym *sym;
- int i;
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
- setrlimit(RLIMIT_MEMLOCK, &r);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
if (load_kallsyms()) {
printf("failed to process /proc/kallsyms\n");
return 2;
}
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ obj = NULL;
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ section = bpf_program__section_name(prog);
+ if (sscanf(section, "kprobe/%s", symbol) != 1)
+ continue;
+
+ /* Attach prog only when symbol exists */
+ if (ksym_get_addr(symbol)) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
+ }
}
for (i = 0; i < 5; i++) {
key = 0;
printf("kprobing funcs:");
- while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[0], &next_key, &value);
+ while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd, &next_key, &value);
assert(next_key == value);
sym = ksym_search(value);
key = next_key;
@@ -48,10 +85,15 @@ int main(int ac, char **argv)
if (key)
printf("\n");
key = 0;
- while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0)
- bpf_map_delete_elem(map_fd[0], &next_key);
+ while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0)
+ bpf_map_delete_elem(map_fd, &next_key);
sleep(1);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c
index 5a62b03b1f88..50231c2eff9c 100644
--- a/samples/bpf/syscall_tp_kern.c
+++ b/samples/bpf/syscall_tp_kern.c
@@ -18,19 +18,19 @@ struct syscalls_exit_open_args {
long ret;
};
-struct bpf_map_def SEC("maps") enter_open_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} enter_open_map SEC(".maps");
-struct bpf_map_def SEC("maps") exit_open_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} exit_open_map SEC(".maps");
static __always_inline void count(void *map)
{
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
index 57014bab7cbe..76a1d00128fb 100644
--- a/samples/bpf/syscall_tp_user.c
+++ b/samples/bpf/syscall_tp_user.c
@@ -5,16 +5,12 @@
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
-#include <signal.h>
-#include <linux/bpf.h>
#include <string.h>
#include <linux/perf_event.h>
#include <errno.h>
-#include <assert.h>
-#include <stdbool.h>
#include <sys/resource.h>
+#include <bpf/libbpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
* This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
@@ -49,16 +45,44 @@ static void verify_map(int map_id)
static int test(char *filename, int num_progs)
{
- int i, fd, map0_fds[num_progs], map1_fds[num_progs];
+ int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0;
+ struct bpf_link *links[num_progs * 4];
+ struct bpf_object *objs[num_progs];
+ struct bpf_program *prog;
for (i = 0; i < num_progs; i++) {
- if (load_bpf_file(filename)) {
- fprintf(stderr, "%s", bpf_log_buf);
- return 1;
+ objs[i] = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(objs[i])) {
+ fprintf(stderr, "opening BPF object file failed\n");
+ objs[i] = NULL;
+ goto cleanup;
}
- printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]);
- map0_fds[i] = map_fd[0];
- map1_fds[i] = map_fd[1];
+
+ /* load BPF program */
+ if (bpf_object__load(objs[i])) {
+ fprintf(stderr, "loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
+ "enter_open_map");
+ map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
+ "exit_open_map");
+ if (map0_fds[i] < 0 || map1_fds[i] < 0) {
+ fprintf(stderr, "finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, objs[i]) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
+ }
+ printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]);
}
/* current load_bpf_file has perf_event_open default pid = -1
@@ -80,6 +104,12 @@ static int test(char *filename, int num_progs)
verify_map(map1_fds[i]);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ for (i--; i >= 0; i--)
+ bpf_object__close(objs[i]);
return 0;
}
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
index 278ade5427c8..c821294e1774 100644
--- a/samples/bpf/task_fd_query_kern.c
+++ b/samples/bpf/task_fd_query_kern.c
@@ -10,7 +10,7 @@ int bpf_prog1(struct pt_regs *ctx)
return 0;
}
-SEC("kretprobe/blk_account_io_completion")
+SEC("kretprobe/blk_account_io_done")
int bpf_prog2(struct pt_regs *ctx)
{
return 0;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index ff2e9c1c7266..4a74531dc403 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -314,7 +314,7 @@ int main(int argc, char **argv)
/* test two functions in the corresponding *_kern.c file */
CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
BPF_FD_TYPE_KPROBE));
- CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+ CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done",
BPF_FD_TYPE_KRETPROBE));
/* test nondebug fs kprobe */
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
index 6dc4f41bb6cb..fbd43e2bb4d3 100644
--- a/samples/bpf/test_current_task_under_cgroup_kern.c
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -10,23 +10,24 @@
#include <linux/version.h>
#include <bpf/bpf_helpers.h>
#include <uapi/linux/utsname.h>
+#include "trace_common.h"
-struct bpf_map_def SEC("maps") cgroup_map = {
- .type = BPF_MAP_TYPE_CGROUP_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 1);
+} cgroup_map SEC(".maps");
-struct bpf_map_def SEC("maps") perf_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, 1);
+} perf_map SEC(".maps");
/* Writes the last PID that called sync to a map at index 0 */
-SEC("kprobe/sys_sync")
+SEC("kprobe/" SYSCALL(sys_sync))
int bpf_prog1(struct pt_regs *ctx)
{
u64 pid = bpf_get_current_pid_tgid();
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 06e9f8ce42e2..ac251a417f45 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -4,10 +4,9 @@
#define _GNU_SOURCE
#include <stdio.h>
-#include <linux/bpf.h>
#include <unistd.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "cgroup_helpers.h"
#define CGROUP_PATH "/my-cgroup"
@@ -15,13 +14,44 @@
int main(int argc, char **argv)
{
pid_t remote_pid, local_pid = getpid();
- int cg2, idx = 0, rc = 0;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ int cg2, idx = 0, rc = 1;
+ struct bpf_object *obj;
char filename[256];
+ int map_fd[2];
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ printf("finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map");
+ if (map_fd[0] < 0 || map_fd[1] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
if (setup_cgroup_environment())
@@ -70,12 +100,14 @@ int main(int argc, char **argv)
goto err;
}
- goto out;
-err:
- rc = 1;
+ rc = 0;
-out:
+err:
close(cg2);
cleanup_cgroup_environment();
+
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return rc;
}
diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c
index 8def45c5b697..b0200c8eac09 100644
--- a/samples/bpf/test_map_in_map_kern.c
+++ b/samples/bpf/test_map_in_map_kern.c
@@ -103,10 +103,9 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
return result ? *result : -ENOENT;
}
-SEC("kprobe/" SYSCALL(sys_connect))
+SEC("kprobe/__sys_connect")
int trace_sys_connect(struct pt_regs *ctx)
{
- struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
struct sockaddr_in6 *in6;
u16 test_case, port, dst6[8];
int addrlen, ret, inline_ret, ret_key = 0;
@@ -114,8 +113,8 @@ int trace_sys_connect(struct pt_regs *ctx)
void *outer_map, *inner_map;
bool inline_hash = false;
- in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs);
- addrlen = (int)PT_REGS_PARM3_CORE(real_regs);
+ in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx);
+ addrlen = (int)PT_REGS_PARM3_CORE(ctx);
if (addrlen != sizeof(*in6))
return 0;
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
index fd651a65281e..220a96438d75 100644
--- a/samples/bpf/test_probe_write_user_kern.c
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -13,12 +13,12 @@
#include <bpf/bpf_core_read.h>
#include "trace_common.h"
-struct bpf_map_def SEC("maps") dnat_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct sockaddr_in),
- .value_size = sizeof(struct sockaddr_in),
- .max_entries = 256,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct sockaddr_in);
+ __type(value, struct sockaddr_in);
+ __uint(max_entries, 256);
+} dnat_map SEC(".maps");
/* kprobe is NOT a stable ABI
* kernel functions can be removed, renamed or completely change semantics.
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
index 045eb5e30f54..00ccfb834e45 100644
--- a/samples/bpf/test_probe_write_user_user.c
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -1,21 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
-#include <linux/bpf.h>
#include <unistd.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include <sys/socket.h>
-#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>
int main(int ac, char **argv)
{
- int serverfd, serverconnfd, clientfd;
- socklen_t sockaddr_len;
- struct sockaddr serv_addr, mapped_addr, tmp_addr;
struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in;
+ struct sockaddr serv_addr, mapped_addr, tmp_addr;
+ int serverfd, serverconnfd, clientfd, map_fd;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ socklen_t sockaddr_len;
char filename[256];
char *ip;
@@ -24,10 +25,35 @@ int main(int ac, char **argv)
tmp_addr_in = (struct sockaddr_in *)&tmp_addr;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (libbpf_get_error(prog)) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0);
@@ -51,7 +77,7 @@ int main(int ac, char **argv)
mapped_addr_in->sin_port = htons(5555);
mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255");
- assert(!bpf_map_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY));
+ assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY));
assert(listen(serverfd, 5) == 0);
@@ -75,5 +101,8 @@ int main(int ac, char **argv)
/* Is the server's getsockname = the socket getpeername */
assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0);
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
index 1d7d422cae6f..b64815af0943 100644
--- a/samples/bpf/trace_output_kern.c
+++ b/samples/bpf/trace_output_kern.c
@@ -2,15 +2,16 @@
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
+#include "trace_common.h"
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(u32),
- .max_entries = 2,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 2);
+} my_map SEC(".maps");
-SEC("kprobe/sys_write")
+SEC("kprobe/" SYSCALL(sys_write))
int bpf_prog1(struct pt_regs *ctx)
{
struct S {
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index 60a17dd05345..364b98764d54 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
#include <fcntl.h>
#include <poll.h>
-#include <linux/perf_event.h>
-#include <linux/bpf.h>
-#include <errno.h>
-#include <assert.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
#include <time.h>
#include <signal.h>
#include <bpf/libbpf.h>
-#include "bpf_load.h"
-#include "perf-sys.h"
static __u64 time_get_ns(void)
{
@@ -57,20 +44,48 @@ static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size)
int main(int argc, char **argv)
{
struct perf_buffer_opts pb_opts = {};
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
struct perf_buffer *pb;
+ struct bpf_object *obj;
+ int map_fd, ret = 0;
char filename[256];
FILE *f;
- int ret;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (libbpf_get_error(prog)) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
pb_opts.sample_cb = print_bpf_output;
- pb = perf_buffer__new(map_fd[0], 8, &pb_opts);
+ pb = perf_buffer__new(map_fd, 8, &pb_opts);
ret = libbpf_get_error(pb);
if (ret) {
printf("failed to setup perf_buffer: %d\n", ret);
@@ -84,5 +99,9 @@ int main(int argc, char **argv)
while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) {
}
kill(0, SIGINT);
+
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return ret;
}
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index 659613c19a82..710a4410b2fb 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -49,7 +49,7 @@ struct {
__uint(max_entries, SLOTS);
} lat_map SEC(".maps");
-SEC("kprobe/blk_account_io_completion")
+SEC("kprobe/blk_account_io_done")
int bpf_prog2(struct pt_regs *ctx)
{
long rq = PT_REGS_PARM1(ctx);
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
index 98dad57a96c4..c17d3fb5fd64 100644
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
@@ -39,8 +39,8 @@ int main(int ac, char **argv)
struct bpf_program *prog;
struct bpf_object *obj;
int key, fd, progs_fd;
+ const char *section;
char filename[256];
- const char *title;
FILE *f;
setrlimit(RLIMIT_MEMLOCK, &r);
@@ -78,9 +78,9 @@ int main(int ac, char **argv)
}
bpf_object__for_each_program(prog, obj) {
- title = bpf_program__title(prog, false);
+ section = bpf_program__section_name(prog);
/* register only syscalls to PROG_ARRAY */
- if (sscanf(title, "kprobe/%d", &key) != 1)
+ if (sscanf(section, "kprobe/%d", &key) != 1)
continue;
fd = bpf_program__fd(prog);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 3d33cca2d48a..5c955b812c47 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -6,21 +6,21 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
-struct bpf_map_def SEC("maps") redirect_err_cnt = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = 2,
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, 2);
/* TODO: have entries for all possible errno's */
-};
+} redirect_err_cnt SEC(".maps");
#define XDP_UNKNOWN XDP_REDIRECT + 1
-struct bpf_map_def SEC("maps") exception_cnt = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = XDP_UNKNOWN + 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, XDP_UNKNOWN + 1);
+} exception_cnt SEC(".maps");
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
* Code in: kernel/include/trace/events/xdp.h
@@ -129,19 +129,19 @@ struct datarec {
};
#define MAX_CPUS 64
-struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(struct datarec),
- .max_entries = MAX_CPUS,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, struct datarec);
+ __uint(max_entries, MAX_CPUS);
+} cpumap_enqueue_cnt SEC(".maps");
-struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(struct datarec),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, struct datarec);
+ __uint(max_entries, 1);
+} cpumap_kthread_cnt SEC(".maps");
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
* Code in: kernel/include/trace/events/xdp.h
@@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
return 0;
}
-struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(struct datarec),
- .max_entries = 1,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, struct datarec);
+ __uint(max_entries, 1);
+} devmap_xmit_cnt SEC(".maps");
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
* Code in: kernel/include/trace/events/xdp.h
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index ef53b93db573..03d0a182913f 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -26,12 +26,37 @@ static const char *__doc_err_only__=
#include <net/if.h>
#include <time.h>
+#include <signal.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "bpf_util.h"
+enum map_type {
+ REDIRECT_ERR_CNT,
+ EXCEPTION_CNT,
+ CPUMAP_ENQUEUE_CNT,
+ CPUMAP_KTHREAD_CNT,
+ DEVMAP_XMIT_CNT,
+};
+
+static const char *const map_type_strings[] = {
+ [REDIRECT_ERR_CNT] = "redirect_err_cnt",
+ [EXCEPTION_CNT] = "exception_cnt",
+ [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
+ [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
+ [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt",
+};
+
+#define NUM_MAP 5
+#define NUM_TP 8
+
+static int tp_cnt;
+static int map_cnt;
static int verbose = 1;
static bool debug = false;
+struct bpf_map *map_data[NUM_MAP] = {};
+struct bpf_link *tp_links[NUM_TP] = {};
+struct bpf_object *obj;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -41,6 +66,16 @@ static const struct option long_options[] = {
{0, 0, NULL, 0 }
};
+static void int_exit(int sig)
+{
+ /* Detach tracepoints */
+ while (tp_cnt)
+ bpf_link__destroy(tp_links[--tp_cnt]);
+
+ bpf_object__close(obj);
+ exit(0);
+}
+
/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
#define EXIT_FAIL_MEM 5
@@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec)
* this can happen by someone running perf-record -e
*/
- fd = map_data[0].fd; /* map0: redirect_err_cnt */
+ fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]);
for (i = 0; i < REDIR_RES_MAX; i++)
map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
- fd = map_data[1].fd; /* map1: exception_cnt */
+ fd = bpf_map__fd(map_data[EXCEPTION_CNT]);
for (i = 0; i < XDP_ACTION_MAX; i++) {
map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
}
- fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+ fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]);
for (i = 0; i < MAX_CPUS; i++)
map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
- fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+ fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]);
map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
- fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+ fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]);
map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
return true;
@@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only)
/* TODO Need more advanced stats on error types */
if (verbose) {
- printf(" - Stats map0: %s\n", map_data[0].name);
- printf(" - Stats map1: %s\n", map_data[1].name);
+ printf(" - Stats map0: %s\n", bpf_map__name(map_data[0]));
+ printf(" - Stats map1: %s\n", bpf_map__name(map_data[1]));
printf("\n");
}
fflush(stdout);
@@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only)
static void print_bpf_prog_info(void)
{
- int i;
+ struct bpf_program *prog;
+ struct bpf_map *map;
+ int i = 0;
/* Prog info */
- printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt);
- for (i = 0; i < prog_cnt; i++) {
- printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]);
+ printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt);
+ bpf_object__for_each_program(prog, obj) {
+ printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog));
+ i++;
}
+ i = 0;
/* Maps info */
- printf("Loaded BPF prog have %d map(s)\n", map_data_count);
- for (i = 0; i < map_data_count; i++) {
- char *name = map_data[i].name;
- int fd = map_data[i].fd;
+ printf("Loaded BPF prog have %d map(s)\n", map_cnt);
+ bpf_object__for_each_map(map, obj) {
+ const char *name = bpf_map__name(map);
+ int fd = bpf_map__fd(map);
printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
+ i++;
}
/* Event info */
- printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt);
- for (i = 0; i < prog_cnt; i++) {
- if (event_fd[i] != -1)
- printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]);
+ printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt);
+ for (i = 0; i < tp_cnt; i++) {
+ int fd = bpf_link__fd(tp_links[i]);
+
+ if (fd != -1)
+ printf(" - event_fd[%d] = fd(%d)\n", i, fd);
}
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_program *prog;
int longindex = 0, opt;
- int ret = EXIT_SUCCESS;
- char bpf_obj_file[256];
+ int ret = EXIT_FAILURE;
+ enum map_type type;
+ char filename[256];
/* Default settings: */
bool errors_only = true;
int interval = 2;
- snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
-
/* Parse commands line args */
while ((opt = getopt_long(argc, argv, "hDSs:",
long_options, &longindex)) != -1) {
@@ -672,40 +714,79 @@ int main(int argc, char **argv)
case 'h':
default:
usage(argv);
- return EXIT_FAILURE;
+ return ret;
}
}
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
- return EXIT_FAILURE;
+ return ret;
}
- if (load_bpf_file(bpf_obj_file)) {
- printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
- return EXIT_FAILURE;
+ /* Remove tracepoint program when program is interrupted or killed */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ printf("ERROR: opening BPF object file failed\n");
+ obj = NULL;
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ printf("ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ for (type = 0; type < NUM_MAP; type++) {
+ map_data[type] =
+ bpf_object__find_map_by_name(obj, map_type_strings[type]);
+
+ if (libbpf_get_error(map_data[type])) {
+ printf("ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+ map_cnt++;
}
- if (!prog_fd[0]) {
- printf("ERROR - load_bpf_file: %s\n", strerror(errno));
- return EXIT_FAILURE;
+
+ bpf_object__for_each_program(prog, obj) {
+ tp_links[tp_cnt] = bpf_program__attach(prog);
+ if (libbpf_get_error(tp_links[tp_cnt])) {
+ printf("ERROR: bpf_program__attach failed\n");
+ tp_links[tp_cnt] = NULL;
+ goto cleanup;
+ }
+ tp_cnt++;
}
if (debug) {
print_bpf_prog_info();
}
- /* Unload/stop tracepoint event by closing fd's */
+ /* Unload/stop tracepoint event by closing bpf_link's */
if (errors_only) {
- /* The prog_fd[i] and event_fd[i] depend on the
- * order the functions was defined in _kern.c
+ /* The bpf_link[i] depend on the order of
+ * the functions was defined in _kern.c
*/
- close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */
- close(prog_fd[2]); /* func: trace_xdp_redirect */
- close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */
- close(prog_fd[3]); /* func: trace_xdp_redirect_map */
+ bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */
+ tp_links[2] = NULL;
+
+ bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */
+ tp_links[3] = NULL;
}
stats_poll(interval, errors_only);
+ ret = EXIT_SUCCESS;
+
+cleanup:
+ /* Detach tracepoints */
+ while (tp_cnt)
+ bpf_link__destroy(tp_links[--tp_cnt]);
+
+ bpf_object__close(obj);
return ret;
}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index 004c0622c913..6fb8dbde62c5 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -37,18 +37,35 @@ static __u32 prog_id;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int n_cpus;
-static int cpu_map_fd;
-static int rx_cnt_map_fd;
-static int redirect_err_cnt_map_fd;
-static int cpumap_enqueue_cnt_map_fd;
-static int cpumap_kthread_cnt_map_fd;
-static int cpus_available_map_fd;
-static int cpus_count_map_fd;
-static int cpus_iterator_map_fd;
-static int exception_cnt_map_fd;
+
+enum map_type {
+ CPU_MAP,
+ RX_CNT,
+ REDIRECT_ERR_CNT,
+ CPUMAP_ENQUEUE_CNT,
+ CPUMAP_KTHREAD_CNT,
+ CPUS_AVAILABLE,
+ CPUS_COUNT,
+ CPUS_ITERATOR,
+ EXCEPTION_CNT,
+};
+
+static const char *const map_type_strings[] = {
+ [CPU_MAP] = "cpu_map",
+ [RX_CNT] = "rx_cnt",
+ [REDIRECT_ERR_CNT] = "redirect_err_cnt",
+ [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
+ [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
+ [CPUS_AVAILABLE] = "cpus_available",
+ [CPUS_COUNT] = "cpus_count",
+ [CPUS_ITERATOR] = "cpus_iterator",
+ [EXCEPTION_CNT] = "exception_cnt",
+};
#define NUM_TP 5
-struct bpf_link *tp_links[NUM_TP] = { 0 };
+#define NUM_MAP 9
+struct bpf_link *tp_links[NUM_TP] = {};
+static int map_fds[NUM_MAP];
static int tp_cnt = 0;
/* Exit return codes */
@@ -111,7 +128,7 @@ static void print_avail_progs(struct bpf_object *obj)
bpf_object__for_each_program(pos, obj) {
if (bpf_program__is_xdp(pos))
- printf(" %s\n", bpf_program__title(pos, false));
+ printf(" %s\n", bpf_program__section_name(pos));
}
}
@@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec)
{
int fd, i;
- fd = rx_cnt_map_fd;
+ fd = map_fds[RX_CNT];
map_collect_percpu(fd, 0, &rec->rx_cnt);
- fd = redirect_err_cnt_map_fd;
+ fd = map_fds[REDIRECT_ERR_CNT];
map_collect_percpu(fd, 1, &rec->redir_err);
- fd = cpumap_enqueue_cnt_map_fd;
+ fd = map_fds[CPUMAP_ENQUEUE_CNT];
for (i = 0; i < n_cpus; i++)
map_collect_percpu(fd, i, &rec->enq[i]);
- fd = cpumap_kthread_cnt_map_fd;
+ fd = map_fds[CPUMAP_KTHREAD_CNT];
map_collect_percpu(fd, 0, &rec->kthread);
- fd = exception_cnt_map_fd;
+ fd = map_fds[EXCEPTION_CNT];
map_collect_percpu(fd, 0, &rec->exception);
}
@@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
/* Add a CPU entry to cpumap, as this allocate a cpu entry in
* the kernel for the cpu.
*/
- ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0);
+ ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
if (ret) {
fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
exit(EXIT_FAIL_BPF);
@@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
/* Inform bpf_prog's that a new CPU is available to select
* from via some control maps.
*/
- ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0);
+ ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
if (ret) {
fprintf(stderr, "Add to avail CPUs failed\n");
exit(EXIT_FAIL_BPF);
}
/* When not replacing/updating existing entry, bump the count */
- ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count);
+ ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
if (ret) {
fprintf(stderr, "Failed reading curr cpus_count\n");
exit(EXIT_FAIL_BPF);
}
if (new) {
curr_cpus_count++;
- ret = bpf_map_update_elem(cpus_count_map_fd, &key,
+ ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
&curr_cpus_count, 0);
if (ret) {
fprintf(stderr, "Failed write curr cpus_count\n");
@@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void)
int ret, i;
for (i = 0; i < n_cpus; i++) {
- ret = bpf_map_update_elem(cpus_available_map_fd, &i,
+ ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
&invalid_cpu, 0);
if (ret) {
fprintf(stderr, "Failed marking CPU unavailable\n");
@@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name,
free_stats_record(prev);
}
-static struct bpf_link * attach_tp(struct bpf_object *obj,
- const char *tp_category,
- const char* tp_name)
+static int init_tracepoints(struct bpf_object *obj)
{
struct bpf_program *prog;
- struct bpf_link *link;
- char sec_name[PATH_MAX];
- int len;
- len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s",
- tp_category, tp_name);
- if (len < 0)
- exit(EXIT_FAIL);
+ bpf_object__for_each_program(prog, obj) {
+ if (bpf_program__is_tracepoint(prog) != true)
+ continue;
- prog = bpf_object__find_program_by_title(obj, sec_name);
- if (!prog) {
- fprintf(stderr, "ERR: finding progsec: %s\n", sec_name);
- exit(EXIT_FAIL_BPF);
+ tp_links[tp_cnt] = bpf_program__attach(prog);
+ if (libbpf_get_error(tp_links[tp_cnt])) {
+ tp_links[tp_cnt] = NULL;
+ return -EINVAL;
+ }
+ tp_cnt++;
}
- link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
- if (libbpf_get_error(link))
- exit(EXIT_FAIL_BPF);
-
- return link;
-}
-
-static void init_tracepoints(struct bpf_object *obj) {
- tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err");
- tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err");
- tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception");
- tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue");
- tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread");
+ return 0;
}
static int init_map_fds(struct bpf_object *obj)
{
- /* Maps updated by tracepoints */
- redirect_err_cnt_map_fd =
- bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt");
- exception_cnt_map_fd =
- bpf_object__find_map_fd_by_name(obj, "exception_cnt");
- cpumap_enqueue_cnt_map_fd =
- bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt");
- cpumap_kthread_cnt_map_fd =
- bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt");
-
- /* Maps used by XDP */
- rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt");
- cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
- cpus_available_map_fd =
- bpf_object__find_map_fd_by_name(obj, "cpus_available");
- cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count");
- cpus_iterator_map_fd =
- bpf_object__find_map_fd_by_name(obj, "cpus_iterator");
-
- if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 ||
- redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 ||
- cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 ||
- cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 ||
- exception_cnt_map_fd < 0)
- return -ENOENT;
+ enum map_type type;
+
+ for (type = 0; type < NUM_MAP; type++) {
+ map_fds[type] =
+ bpf_object__find_map_fd_by_name(obj,
+ map_type_strings[type]);
+
+ if (map_fds[type] < 0)
+ return -ENOENT;
+ }
return 0;
}
@@ -795,13 +781,13 @@ int main(int argc, char **argv)
bool stress_mode = false;
struct bpf_program *prog;
struct bpf_object *obj;
+ int err = EXIT_FAIL;
char filename[256];
int added_cpus = 0;
int longindex = 0;
int interval = 2;
int add_cpu = -1;
- int opt, err;
- int prog_fd;
+ int opt, prog_fd;
int *cpu, i;
__u32 qsize;
@@ -824,24 +810,29 @@ int main(int argc, char **argv)
}
if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
- return EXIT_FAIL;
+ return err;
if (prog_fd < 0) {
fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
strerror(errno));
- return EXIT_FAIL;
+ return err;
}
- init_tracepoints(obj);
+
+ if (init_tracepoints(obj) < 0) {
+ fprintf(stderr, "ERR: bpf_program__attach failed\n");
+ return err;
+ }
+
if (init_map_fds(obj) < 0) {
fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
- return EXIT_FAIL;
+ return err;
}
mark_cpus_unavailable();
cpu = malloc(n_cpus * sizeof(int));
if (!cpu) {
fprintf(stderr, "failed to allocate cpu array\n");
- return EXIT_FAIL;
+ return err;
}
memset(cpu, 0, n_cpus * sizeof(int));
@@ -960,14 +951,12 @@ int main(int argc, char **argv)
prog = bpf_object__find_program_by_title(obj, prog_name);
if (!prog) {
fprintf(stderr, "bpf_object__find_program_by_title failed\n");
- err = EXIT_FAIL;
goto out;
}
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
fprintf(stderr, "bpf_program__fd failed\n");
- err = EXIT_FAIL;
goto out;
}
@@ -986,6 +975,8 @@ int main(int argc, char **argv)
stats_poll(interval, use_separators, prog_name, mprog_name,
&value, stress_mode);
+
+ err = EXIT_OK;
out:
free(cpu);
return err;
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
index 33377289e2a8..9cf76b340dd7 100644
--- a/samples/bpf/xdp_sample_pkts_kern.c
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -5,14 +5,12 @@
#include <bpf/bpf_helpers.h>
#define SAMPLE_SIZE 64ul
-#define MAX_CPUS 128
-
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(u32),
- .max_entries = MAX_CPUS,
-};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+} my_map SEC(".maps");
SEC("xdp_sample")
int xdp_sample_prog(struct xdp_md *ctx)
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
index 991ef6f0880b..4b2a300c750c 100644
--- a/samples/bpf/xdp_sample_pkts_user.c
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -18,7 +18,6 @@
#include "perf-sys.h"
-#define MAX_CPUS 128
static int if_idx;
static char *if_name;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 19c679456a0e..1149e94ca32f 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -11,6 +11,7 @@
#include <linux/if_xdp.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
+#include <linux/limits.h>
#include <linux/udp.h>
#include <arpa/inet.h>
#include <locale.h>
@@ -78,6 +79,11 @@ static int opt_pkt_count;
static u16 opt_pkt_size = MIN_PKT_SIZE;
static u32 opt_pkt_fill_pattern = 0x12345678;
static bool opt_extra_stats;
+static bool opt_quiet;
+static bool opt_app_stats;
+static const char *opt_irq_str = "";
+static u32 irq_no;
+static int irqs_at_init = -1;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
@@ -90,18 +96,7 @@ static bool opt_need_wakeup = true;
static u32 opt_num_xsks = 1;
static u32 prog_id;
-struct xsk_umem_info {
- struct xsk_ring_prod fq;
- struct xsk_ring_cons cq;
- struct xsk_umem *umem;
- void *buffer;
-};
-
-struct xsk_socket_info {
- struct xsk_ring_cons rx;
- struct xsk_ring_prod tx;
- struct xsk_umem_info *umem;
- struct xsk_socket *xsk;
+struct xsk_ring_stats {
unsigned long rx_npkts;
unsigned long tx_npkts;
unsigned long rx_dropped_npkts;
@@ -118,6 +113,41 @@ struct xsk_socket_info {
unsigned long prev_rx_full_npkts;
unsigned long prev_rx_fill_empty_npkts;
unsigned long prev_tx_empty_npkts;
+};
+
+struct xsk_driver_stats {
+ unsigned long intrs;
+ unsigned long prev_intrs;
+};
+
+struct xsk_app_stats {
+ unsigned long rx_empty_polls;
+ unsigned long fill_fail_polls;
+ unsigned long copy_tx_sendtos;
+ unsigned long tx_wakeup_sendtos;
+ unsigned long opt_polls;
+ unsigned long prev_rx_empty_polls;
+ unsigned long prev_fill_fail_polls;
+ unsigned long prev_copy_tx_sendtos;
+ unsigned long prev_tx_wakeup_sendtos;
+ unsigned long prev_opt_polls;
+};
+
+struct xsk_umem_info {
+ struct xsk_ring_prod fq;
+ struct xsk_ring_cons cq;
+ struct xsk_umem *umem;
+ void *buffer;
+};
+
+struct xsk_socket_info {
+ struct xsk_ring_cons rx;
+ struct xsk_ring_prod tx;
+ struct xsk_umem_info *umem;
+ struct xsk_socket *xsk;
+ struct xsk_ring_stats ring_stats;
+ struct xsk_app_stats app_stats;
+ struct xsk_driver_stats drv_stats;
u32 outstanding_tx;
};
@@ -172,18 +202,151 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk)
return err;
if (optlen == sizeof(struct xdp_statistics)) {
- xsk->rx_dropped_npkts = stats.rx_dropped;
- xsk->rx_invalid_npkts = stats.rx_invalid_descs;
- xsk->tx_invalid_npkts = stats.tx_invalid_descs;
- xsk->rx_full_npkts = stats.rx_ring_full;
- xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
- xsk->tx_empty_npkts = stats.tx_ring_empty_descs;
+ xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped;
+ xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs;
+ xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs;
+ xsk->ring_stats.rx_full_npkts = stats.rx_ring_full;
+ xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
+ xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs;
return 0;
}
return -EINVAL;
}
+static void dump_app_stats(long dt)
+{
+ int i;
+
+ for (i = 0; i < num_socks && xsks[i]; i++) {
+ char *fmt = "%-18s %'-14.0f %'-14lu\n";
+ double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps,
+ tx_wakeup_sendtos_ps, opt_polls_ps;
+
+ rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls -
+ xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt;
+ fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls -
+ xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt;
+ copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos -
+ xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt;
+ tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos -
+ xsks[i]->app_stats.prev_tx_wakeup_sendtos)
+ * 1000000000. / dt;
+ opt_polls_ps = (xsks[i]->app_stats.opt_polls -
+ xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt;
+
+ printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count");
+ printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls);
+ printf(fmt, "fill fail polls", fill_fail_polls_ps,
+ xsks[i]->app_stats.fill_fail_polls);
+ printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps,
+ xsks[i]->app_stats.copy_tx_sendtos);
+ printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps,
+ xsks[i]->app_stats.tx_wakeup_sendtos);
+ printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls);
+
+ xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls;
+ xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls;
+ xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos;
+ xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos;
+ xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls;
+ }
+}
+
+static bool get_interrupt_number(void)
+{
+ FILE *f_int_proc;
+ char line[4096];
+ bool found = false;
+
+ f_int_proc = fopen("/proc/interrupts", "r");
+ if (f_int_proc == NULL) {
+ printf("Failed to open /proc/interrupts.\n");
+ return found;
+ }
+
+ while (!feof(f_int_proc) && !found) {
+ /* Make sure to read a full line at a time */
+ if (fgets(line, sizeof(line), f_int_proc) == NULL ||
+ line[strlen(line) - 1] != '\n') {
+ printf("Error reading from interrupts file\n");
+ break;
+ }
+
+ /* Extract interrupt number from line */
+ if (strstr(line, opt_irq_str) != NULL) {
+ irq_no = atoi(line);
+ found = true;
+ break;
+ }
+ }
+
+ fclose(f_int_proc);
+
+ return found;
+}
+
+static int get_irqs(void)
+{
+ char count_path[PATH_MAX];
+ int total_intrs = -1;
+ FILE *f_count_proc;
+ char line[4096];
+
+ snprintf(count_path, sizeof(count_path),
+ "/sys/kernel/irq/%i/per_cpu_count", irq_no);
+ f_count_proc = fopen(count_path, "r");
+ if (f_count_proc == NULL) {
+ printf("Failed to open %s\n", count_path);
+ return total_intrs;
+ }
+
+ if (fgets(line, sizeof(line), f_count_proc) == NULL ||
+ line[strlen(line) - 1] != '\n') {
+ printf("Error reading from %s\n", count_path);
+ } else {
+ static const char com[2] = ",";
+ char *token;
+
+ total_intrs = 0;
+ token = strtok(line, com);
+ while (token != NULL) {
+ /* sum up interrupts across all cores */
+ total_intrs += atoi(token);
+ token = strtok(NULL, com);
+ }
+ }
+
+ fclose(f_count_proc);
+
+ return total_intrs;
+}
+
+static void dump_driver_stats(long dt)
+{
+ int i;
+
+ for (i = 0; i < num_socks && xsks[i]; i++) {
+ char *fmt = "%-18s %'-14.0f %'-14lu\n";
+ double intrs_ps;
+ int n_ints = get_irqs();
+
+ if (n_ints < 0) {
+ printf("error getting intr info for intr %i\n", irq_no);
+ return;
+ }
+ xsks[i]->drv_stats.intrs = n_ints - irqs_at_init;
+
+ intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) *
+ 1000000000. / dt;
+
+ printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count");
+ printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs);
+
+ xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs;
+ }
+}
+
static void dump_stats(void)
{
unsigned long now = get_nsecs();
@@ -193,67 +356,83 @@ static void dump_stats(void)
prev_time = now;
for (i = 0; i < num_socks && xsks[i]; i++) {
- char *fmt = "%-15s %'-11.0f %'-11lu\n";
+ char *fmt = "%-18s %'-14.0f %'-14lu\n";
double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps,
tx_invalid_pps, tx_empty_pps;
- rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+ rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) *
1000000000. / dt;
- tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+ tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) *
1000000000. / dt;
printf("\n sock%d@", i);
print_benchmark(false);
printf("\n");
- printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+ printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts",
dt / 1000000000.);
- printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
- printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
+ printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts);
+ printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts);
- xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
- xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
+ xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts;
+ xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts;
if (opt_extra_stats) {
if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) {
- dropped_pps = (xsks[i]->rx_dropped_npkts -
- xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt;
- rx_invalid_pps = (xsks[i]->rx_invalid_npkts -
- xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt;
- tx_invalid_pps = (xsks[i]->tx_invalid_npkts -
- xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt;
- full_pps = (xsks[i]->rx_full_npkts -
- xsks[i]->prev_rx_full_npkts) * 1000000000. / dt;
- fill_empty_pps = (xsks[i]->rx_fill_empty_npkts -
- xsks[i]->prev_rx_fill_empty_npkts)
- * 1000000000. / dt;
- tx_empty_pps = (xsks[i]->tx_empty_npkts -
- xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt;
+ dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts -
+ xsks[i]->ring_stats.prev_rx_dropped_npkts) *
+ 1000000000. / dt;
+ rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts -
+ xsks[i]->ring_stats.prev_rx_invalid_npkts) *
+ 1000000000. / dt;
+ tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts -
+ xsks[i]->ring_stats.prev_tx_invalid_npkts) *
+ 1000000000. / dt;
+ full_pps = (xsks[i]->ring_stats.rx_full_npkts -
+ xsks[i]->ring_stats.prev_rx_full_npkts) *
+ 1000000000. / dt;
+ fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts -
+ xsks[i]->ring_stats.prev_rx_fill_empty_npkts) *
+ 1000000000. / dt;
+ tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts -
+ xsks[i]->ring_stats.prev_tx_empty_npkts) *
+ 1000000000. / dt;
printf(fmt, "rx dropped", dropped_pps,
- xsks[i]->rx_dropped_npkts);
+ xsks[i]->ring_stats.rx_dropped_npkts);
printf(fmt, "rx invalid", rx_invalid_pps,
- xsks[i]->rx_invalid_npkts);
+ xsks[i]->ring_stats.rx_invalid_npkts);
printf(fmt, "tx invalid", tx_invalid_pps,
- xsks[i]->tx_invalid_npkts);
+ xsks[i]->ring_stats.tx_invalid_npkts);
printf(fmt, "rx queue full", full_pps,
- xsks[i]->rx_full_npkts);
+ xsks[i]->ring_stats.rx_full_npkts);
printf(fmt, "fill ring empty", fill_empty_pps,
- xsks[i]->rx_fill_empty_npkts);
+ xsks[i]->ring_stats.rx_fill_empty_npkts);
printf(fmt, "tx ring empty", tx_empty_pps,
- xsks[i]->tx_empty_npkts);
-
- xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts;
- xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts;
- xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts;
- xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts;
- xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts;
- xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts;
+ xsks[i]->ring_stats.tx_empty_npkts);
+
+ xsks[i]->ring_stats.prev_rx_dropped_npkts =
+ xsks[i]->ring_stats.rx_dropped_npkts;
+ xsks[i]->ring_stats.prev_rx_invalid_npkts =
+ xsks[i]->ring_stats.rx_invalid_npkts;
+ xsks[i]->ring_stats.prev_tx_invalid_npkts =
+ xsks[i]->ring_stats.tx_invalid_npkts;
+ xsks[i]->ring_stats.prev_rx_full_npkts =
+ xsks[i]->ring_stats.rx_full_npkts;
+ xsks[i]->ring_stats.prev_rx_fill_empty_npkts =
+ xsks[i]->ring_stats.rx_fill_empty_npkts;
+ xsks[i]->ring_stats.prev_tx_empty_npkts =
+ xsks[i]->ring_stats.tx_empty_npkts;
} else {
printf("%-15s\n", "Error retrieving extra stats");
}
}
}
+
+ if (opt_app_stats)
+ dump_app_stats(dt);
+ if (irq_no)
+ dump_driver_stats(dt);
}
static bool is_benchmark_done(void)
@@ -613,7 +792,16 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
{
struct xsk_umem_info *umem;
struct xsk_umem_config cfg = {
- .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ /* We recommend that you set the fill ring size >= HW RX ring size +
+ * AF_XDP RX ring size. Make sure you fill up the fill ring
+ * with buffers at regular intervals, and you will with this setting
+ * avoid allocation failures in the driver. These are usually quite
+ * expensive since drivers have not been written to assume that
+ * allocation failures are common. For regular sockets, kernel
+ * allocated memory is used that only runs out in OOM situations
+ * that should be rare.
+ */
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = opt_xsk_frame_size,
.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
@@ -640,13 +828,13 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
u32 idx;
ret = xsk_ring_prod__reserve(&umem->fq,
- XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx);
- if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
+ XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx);
+ if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2)
exit_with_error(-ret);
- for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
+ for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++)
*xsk_ring_prod__fill_addr(&umem->fq, idx++) =
i * opt_xsk_frame_size;
- xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+ xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2);
}
static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
@@ -683,6 +871,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
if (ret)
exit_with_error(-ret);
+ xsk->app_stats.rx_empty_polls = 0;
+ xsk->app_stats.fill_fail_polls = 0;
+ xsk->app_stats.copy_tx_sendtos = 0;
+ xsk->app_stats.tx_wakeup_sendtos = 0;
+ xsk->app_stats.opt_polls = 0;
+ xsk->app_stats.prev_rx_empty_polls = 0;
+ xsk->app_stats.prev_fill_fail_polls = 0;
+ xsk->app_stats.prev_copy_tx_sendtos = 0;
+ xsk->app_stats.prev_tx_wakeup_sendtos = 0;
+ xsk->app_stats.prev_opt_polls = 0;
+
return xsk;
}
@@ -709,6 +908,9 @@ static struct option long_options[] = {
{"tx-pkt-size", required_argument, 0, 's'},
{"tx-pkt-pattern", required_argument, 0, 'P'},
{"extra-stats", no_argument, 0, 'x'},
+ {"quiet", no_argument, 0, 'Q'},
+ {"app-stats", no_argument, 0, 'a'},
+ {"irq-string", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
@@ -744,6 +946,9 @@ static void usage(const char *prog)
" Min size: %d, Max size %d.\n"
" -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n"
" -x, --extra-stats Display extra statistics.\n"
+ " -Q, --quiet Do not display any stats.\n"
+ " -a, --app-stats Display application (syscall) statistics.\n"
+ " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n"
"\n";
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE,
@@ -759,7 +964,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:x",
+ c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:",
long_options, &option_index);
if (c == -1)
break;
@@ -843,6 +1048,22 @@ static void parse_command_line(int argc, char **argv)
case 'x':
opt_extra_stats = 1;
break;
+ case 'Q':
+ opt_quiet = 1;
+ break;
+ case 'a':
+ opt_app_stats = 1;
+ break;
+ case 'I':
+ opt_irq_str = optarg;
+ if (get_interrupt_number())
+ irqs_at_init = get_irqs();
+ if (irqs_at_init < 0) {
+ fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str);
+ usage(basename(argv[0]));
+ }
+
+ break;
default:
usage(basename(argv[0]));
}
@@ -888,8 +1109,15 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
if (!xsk->outstanding_tx)
return;
- if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+ /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to
+ * really send the packets. In zero-copy mode we do not have to do this, since Tx
+ * is driven by the NAPI loop. So as an optimization, we do not have to call
+ * sendto() all the time in zero-copy mode for l2fwd.
+ */
+ if (opt_xdp_bind_flags & XDP_COPY) {
+ xsk->app_stats.copy_tx_sendtos++;
kick_tx(xsk);
+ }
ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
xsk->outstanding_tx;
@@ -904,8 +1132,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
- if (xsk_ring_prod__needs_wakeup(&umem->fq))
+ if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
+ xsk->app_stats.fill_fail_polls++;
ret = poll(fds, num_socks, opt_timeout);
+ }
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
}
@@ -916,7 +1146,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
- xsk->tx_npkts += rcvd;
+ xsk->ring_stats.tx_npkts += rcvd;
}
}
@@ -929,14 +1159,16 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk,
if (!xsk->outstanding_tx)
return;
- if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+ if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+ xsk->app_stats.tx_wakeup_sendtos++;
kick_tx(xsk);
+ }
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
if (rcvd > 0) {
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
- xsk->tx_npkts += rcvd;
+ xsk->ring_stats.tx_npkts += rcvd;
}
}
@@ -948,8 +1180,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
if (!rcvd) {
- if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+ xsk->app_stats.rx_empty_polls++;
ret = poll(fds, num_socks, opt_timeout);
+ }
return;
}
@@ -957,8 +1191,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
- if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+ xsk->app_stats.fill_fail_polls++;
ret = poll(fds, num_socks, opt_timeout);
+ }
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
}
@@ -976,7 +1212,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
- xsk->rx_npkts += rcvd;
+ xsk->ring_stats.rx_npkts += rcvd;
}
static void rx_drop_all(void)
@@ -991,6 +1227,8 @@ static void rx_drop_all(void)
for (;;) {
if (opt_poll) {
+ for (i = 0; i < num_socks; i++)
+ xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@@ -1004,7 +1242,7 @@ static void rx_drop_all(void)
}
}
-static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size)
+static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
{
u32 idx;
unsigned int i;
@@ -1017,14 +1255,14 @@ static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size)
for (i = 0; i < batch_size; i++) {
struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
idx + i);
- tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
+ tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
tx_desc->len = PKT_SIZE;
}
xsk_ring_prod__submit(&xsk->tx, batch_size);
xsk->outstanding_tx += batch_size;
- frame_nb += batch_size;
- frame_nb %= NUM_FRAMES;
+ *frame_nb += batch_size;
+ *frame_nb %= NUM_FRAMES;
complete_tx_only(xsk, batch_size);
}
@@ -1071,6 +1309,8 @@ static void tx_only_all(void)
int batch_size = get_batch_size(pkt_cnt);
if (opt_poll) {
+ for (i = 0; i < num_socks; i++)
+ xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@@ -1080,7 +1320,7 @@ static void tx_only_all(void)
}
for (i = 0; i < num_socks; i++)
- tx_only(xsks[i], frame_nb[i], batch_size);
+ tx_only(xsks[i], &frame_nb[i], batch_size);
pkt_cnt += batch_size;
@@ -1102,8 +1342,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
if (!rcvd) {
- if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+ xsk->app_stats.rx_empty_polls++;
ret = poll(fds, num_socks, opt_timeout);
+ }
return;
}
@@ -1111,8 +1353,11 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
- if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+ complete_tx_l2fwd(xsk, fds);
+ if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+ xsk->app_stats.tx_wakeup_sendtos++;
kick_tx(xsk);
+ }
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
}
@@ -1134,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
- xsk->rx_npkts += rcvd;
+ xsk->ring_stats.rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
}
@@ -1150,6 +1395,8 @@ static void l2fwd_all(void)
for (;;) {
if (opt_poll) {
+ for (i = 0; i < num_socks; i++)
+ xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@@ -1271,9 +1518,11 @@ int main(int argc, char **argv)
setlocale(LC_ALL, "");
- ret = pthread_create(&pt, NULL, poller, NULL);
- if (ret)
- exit_with_error(ret);
+ if (!opt_quiet) {
+ ret = pthread_create(&pt, NULL, poller, NULL);
+ if (ret)
+ exit_with_error(ret);
+ }
prev_time = get_nsecs();
start_time = prev_time;
@@ -1287,7 +1536,8 @@ int main(int argc, char **argv)
benchmark_done = true;
- pthread_join(pt, NULL);
+ if (!opt_quiet)
+ pthread_join(pt, NULL);
xdpsock_cleanup();
diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c
new file mode 100644
index 000000000000..1cd97c84c337
--- /dev/null
+++ b/samples/bpf/xsk_fwd.c
@@ -0,0 +1,1085 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2020 Intel Corporation. */
+
+#define _GNU_SOURCE
+#include <poll.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <netinet/ether.h>
+#include <net/if.h>
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/xsk.h>
+#include <bpf/bpf.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+typedef __u64 u64;
+typedef __u32 u32;
+typedef __u16 u16;
+typedef __u8 u8;
+
+/* This program illustrates the packet forwarding between multiple AF_XDP
+ * sockets in multi-threaded environment. All threads are sharing a common
+ * buffer pool, with each socket having its own private buffer cache.
+ *
+ * Example 1: Single thread handling two sockets. The packets received by socket
+ * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue
+ * QB), while the packets received by socket B are forwarded to socket A. The
+ * thread is running on CPU core X:
+ *
+ * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X
+ *
+ * Example 2: Two threads, each handling two sockets. The thread running on CPU
+ * core X forwards all the packets received by socket A to socket B, and all the
+ * packets received by socket B to socket A. The thread running on CPU core Y is
+ * performing the same packet forwarding between sockets C and D:
+ *
+ * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD
+ * -c CX -c CY
+ */
+
+/*
+ * Buffer pool and buffer cache
+ *
+ * For packet forwarding, the packet buffers are typically allocated from the
+ * pool for packet reception and freed back to the pool for further reuse once
+ * the packet transmission is completed.
+ *
+ * The buffer pool is shared between multiple threads. In order to minimize the
+ * access latency to the shared buffer pool, each thread creates one (or
+ * several) buffer caches, which, unlike the buffer pool, are private to the
+ * thread that creates them and therefore cannot be shared with other threads.
+ * The access to the shared pool is only needed either (A) when the cache gets
+ * empty due to repeated buffer allocations and it needs to be replenished from
+ * the pool, or (B) when the cache gets full due to repeated buffer free and it
+ * needs to be flushed back to the pull.
+ *
+ * In a packet forwarding system, a packet received on any input port can
+ * potentially be transmitted on any output port, depending on the forwarding
+ * configuration. For AF_XDP sockets, for this to work with zero-copy of the
+ * packet buffers when, it is required that the buffer pool memory fits into the
+ * UMEM area shared by all the sockets.
+ */
+
+struct bpool_params {
+ u32 n_buffers;
+ u32 buffer_size;
+ int mmap_flags;
+
+ u32 n_users_max;
+ u32 n_buffers_per_slab;
+};
+
+/* This buffer pool implementation organizes the buffers into equally sized
+ * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the
+ * pool that are completely filled with buffer pointers (full slabs).
+ *
+ * Each buffer cache has a slab for buffer allocation and a slab for buffer
+ * free, with both of these slabs initially empty. When the cache's allocation
+ * slab goes empty, it is swapped with one of the available full slabs from the
+ * pool, if any is available. When the cache's free slab goes full, it is
+ * swapped for one of the empty slabs from the pool, which is guaranteed to
+ * succeed.
+ *
+ * Partially filled slabs never get traded between the cache and the pool
+ * (except when the cache itself is destroyed), which enables fast operation
+ * through pointer swapping.
+ */
+struct bpool {
+ struct bpool_params params;
+ pthread_mutex_t lock;
+ void *addr;
+
+ u64 **slabs;
+ u64 **slabs_reserved;
+ u64 *buffers;
+ u64 *buffers_reserved;
+
+ u64 n_slabs;
+ u64 n_slabs_reserved;
+ u64 n_buffers;
+
+ u64 n_slabs_available;
+ u64 n_slabs_reserved_available;
+
+ struct xsk_umem_config umem_cfg;
+ struct xsk_ring_prod umem_fq;
+ struct xsk_ring_cons umem_cq;
+ struct xsk_umem *umem;
+};
+
+static struct bpool *
+bpool_init(struct bpool_params *params,
+ struct xsk_umem_config *umem_cfg)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved;
+ u64 slabs_size, slabs_reserved_size;
+ u64 buffers_size, buffers_reserved_size;
+ u64 total_size, i;
+ struct bpool *bp;
+ u8 *p;
+ int status;
+
+ /* mmap prep. */
+ if (setrlimit(RLIMIT_MEMLOCK, &r))
+ return NULL;
+
+ /* bpool internals dimensioning. */
+ n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) /
+ params->n_buffers_per_slab;
+ n_slabs_reserved = params->n_users_max * 2;
+ n_buffers = n_slabs * params->n_buffers_per_slab;
+ n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab;
+
+ slabs_size = n_slabs * sizeof(u64 *);
+ slabs_reserved_size = n_slabs_reserved * sizeof(u64 *);
+ buffers_size = n_buffers * sizeof(u64);
+ buffers_reserved_size = n_buffers_reserved * sizeof(u64);
+
+ total_size = sizeof(struct bpool) +
+ slabs_size + slabs_reserved_size +
+ buffers_size + buffers_reserved_size;
+
+ /* bpool memory allocation. */
+ p = calloc(total_size, sizeof(u8));
+ if (!p)
+ return NULL;
+
+ /* bpool memory initialization. */
+ bp = (struct bpool *)p;
+ memcpy(&bp->params, params, sizeof(*params));
+ bp->params.n_buffers = n_buffers;
+
+ bp->slabs = (u64 **)&p[sizeof(struct bpool)];
+ bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) +
+ slabs_size];
+ bp->buffers = (u64 *)&p[sizeof(struct bpool) +
+ slabs_size + slabs_reserved_size];
+ bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) +
+ slabs_size + slabs_reserved_size + buffers_size];
+
+ bp->n_slabs = n_slabs;
+ bp->n_slabs_reserved = n_slabs_reserved;
+ bp->n_buffers = n_buffers;
+
+ for (i = 0; i < n_slabs; i++)
+ bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab];
+ bp->n_slabs_available = n_slabs;
+
+ for (i = 0; i < n_slabs_reserved; i++)
+ bp->slabs_reserved[i] = &bp->buffers_reserved[i *
+ params->n_buffers_per_slab];
+ bp->n_slabs_reserved_available = n_slabs_reserved;
+
+ for (i = 0; i < n_buffers; i++)
+ bp->buffers[i] = i * params->buffer_size;
+
+ /* lock. */
+ status = pthread_mutex_init(&bp->lock, NULL);
+ if (status) {
+ free(p);
+ return NULL;
+ }
+
+ /* mmap. */
+ bp->addr = mmap(NULL,
+ n_buffers * params->buffer_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags,
+ -1,
+ 0);
+ if (bp->addr == MAP_FAILED) {
+ pthread_mutex_destroy(&bp->lock);
+ free(p);
+ return NULL;
+ }
+
+ /* umem. */
+ status = xsk_umem__create(&bp->umem,
+ bp->addr,
+ bp->params.n_buffers * bp->params.buffer_size,
+ &bp->umem_fq,
+ &bp->umem_cq,
+ umem_cfg);
+ if (status) {
+ munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size);
+ pthread_mutex_destroy(&bp->lock);
+ free(p);
+ return NULL;
+ }
+ memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg));
+
+ return bp;
+}
+
+static void
+bpool_free(struct bpool *bp)
+{
+ if (!bp)
+ return;
+
+ xsk_umem__delete(bp->umem);
+ munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size);
+ pthread_mutex_destroy(&bp->lock);
+ free(bp);
+}
+
+struct bcache {
+ struct bpool *bp;
+
+ u64 *slab_cons;
+ u64 *slab_prod;
+
+ u64 n_buffers_cons;
+ u64 n_buffers_prod;
+};
+
+static u32
+bcache_slab_size(struct bcache *bc)
+{
+ struct bpool *bp = bc->bp;
+
+ return bp->params.n_buffers_per_slab;
+}
+
+static struct bcache *
+bcache_init(struct bpool *bp)
+{
+ struct bcache *bc;
+
+ bc = calloc(1, sizeof(struct bcache));
+ if (!bc)
+ return NULL;
+
+ bc->bp = bp;
+ bc->n_buffers_cons = 0;
+ bc->n_buffers_prod = 0;
+
+ pthread_mutex_lock(&bp->lock);
+ if (bp->n_slabs_reserved_available == 0) {
+ pthread_mutex_unlock(&bp->lock);
+ free(bc);
+ return NULL;
+ }
+
+ bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1];
+ bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2];
+ bp->n_slabs_reserved_available -= 2;
+ pthread_mutex_unlock(&bp->lock);
+
+ return bc;
+}
+
+static void
+bcache_free(struct bcache *bc)
+{
+ struct bpool *bp;
+
+ if (!bc)
+ return;
+
+ /* In order to keep this example simple, the case of freeing any
+ * existing buffers from the cache back to the pool is ignored.
+ */
+
+ bp = bc->bp;
+ pthread_mutex_lock(&bp->lock);
+ bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod;
+ bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons;
+ bp->n_slabs_reserved_available += 2;
+ pthread_mutex_unlock(&bp->lock);
+
+ free(bc);
+}
+
+/* To work correctly, the implementation requires that the *n_buffers* input
+ * argument is never greater than the buffer pool's *n_buffers_per_slab*. This
+ * is typically the case, with one exception taking place when large number of
+ * buffers are allocated at init time (e.g. for the UMEM fill queue setup).
+ */
+static inline u32
+bcache_cons_check(struct bcache *bc, u32 n_buffers)
+{
+ struct bpool *bp = bc->bp;
+ u64 n_buffers_per_slab = bp->params.n_buffers_per_slab;
+ u64 n_buffers_cons = bc->n_buffers_cons;
+ u64 n_slabs_available;
+ u64 *slab_full;
+
+ /*
+ * Consumer slab is not empty: Use what's available locally. Do not
+ * look for more buffers from the pool when the ask can only be
+ * partially satisfied.
+ */
+ if (n_buffers_cons)
+ return (n_buffers_cons < n_buffers) ?
+ n_buffers_cons :
+ n_buffers;
+
+ /*
+ * Consumer slab is empty: look to trade the current consumer slab
+ * (full) for a full slab from the pool, if any is available.
+ */
+ pthread_mutex_lock(&bp->lock);
+ n_slabs_available = bp->n_slabs_available;
+ if (!n_slabs_available) {
+ pthread_mutex_unlock(&bp->lock);
+ return 0;
+ }
+
+ n_slabs_available--;
+ slab_full = bp->slabs[n_slabs_available];
+ bp->slabs[n_slabs_available] = bc->slab_cons;
+ bp->n_slabs_available = n_slabs_available;
+ pthread_mutex_unlock(&bp->lock);
+
+ bc->slab_cons = slab_full;
+ bc->n_buffers_cons = n_buffers_per_slab;
+ return n_buffers;
+}
+
+static inline u64
+bcache_cons(struct bcache *bc)
+{
+ u64 n_buffers_cons = bc->n_buffers_cons - 1;
+ u64 buffer;
+
+ buffer = bc->slab_cons[n_buffers_cons];
+ bc->n_buffers_cons = n_buffers_cons;
+ return buffer;
+}
+
+static inline void
+bcache_prod(struct bcache *bc, u64 buffer)
+{
+ struct bpool *bp = bc->bp;
+ u64 n_buffers_per_slab = bp->params.n_buffers_per_slab;
+ u64 n_buffers_prod = bc->n_buffers_prod;
+ u64 n_slabs_available;
+ u64 *slab_empty;
+
+ /*
+ * Producer slab is not yet full: store the current buffer to it.
+ */
+ if (n_buffers_prod < n_buffers_per_slab) {
+ bc->slab_prod[n_buffers_prod] = buffer;
+ bc->n_buffers_prod = n_buffers_prod + 1;
+ return;
+ }
+
+ /*
+ * Producer slab is full: trade the cache's current producer slab
+ * (full) for an empty slab from the pool, then store the current
+ * buffer to the new producer slab. As one full slab exists in the
+ * cache, it is guaranteed that there is at least one empty slab
+ * available in the pool.
+ */
+ pthread_mutex_lock(&bp->lock);
+ n_slabs_available = bp->n_slabs_available;
+ slab_empty = bp->slabs[n_slabs_available];
+ bp->slabs[n_slabs_available] = bc->slab_prod;
+ bp->n_slabs_available = n_slabs_available + 1;
+ pthread_mutex_unlock(&bp->lock);
+
+ slab_empty[0] = buffer;
+ bc->slab_prod = slab_empty;
+ bc->n_buffers_prod = 1;
+}
+
+/*
+ * Port
+ *
+ * Each of the forwarding ports sits on top of an AF_XDP socket. In order for
+ * packet forwarding to happen with no packet buffer copy, all the sockets need
+ * to share the same UMEM area, which is used as the buffer pool memory.
+ */
+#ifndef MAX_BURST_RX
+#define MAX_BURST_RX 64
+#endif
+
+#ifndef MAX_BURST_TX
+#define MAX_BURST_TX 64
+#endif
+
+struct burst_rx {
+ u64 addr[MAX_BURST_RX];
+ u32 len[MAX_BURST_RX];
+};
+
+struct burst_tx {
+ u64 addr[MAX_BURST_TX];
+ u32 len[MAX_BURST_TX];
+ u32 n_pkts;
+};
+
+struct port_params {
+ struct xsk_socket_config xsk_cfg;
+ struct bpool *bp;
+ const char *iface;
+ u32 iface_queue;
+};
+
+struct port {
+ struct port_params params;
+
+ struct bcache *bc;
+
+ struct xsk_ring_cons rxq;
+ struct xsk_ring_prod txq;
+ struct xsk_ring_prod umem_fq;
+ struct xsk_ring_cons umem_cq;
+ struct xsk_socket *xsk;
+ int umem_fq_initialized;
+
+ u64 n_pkts_rx;
+ u64 n_pkts_tx;
+};
+
+static void
+port_free(struct port *p)
+{
+ if (!p)
+ return;
+
+ /* To keep this example simple, the code to free the buffers from the
+ * socket's receive and transmit queues, as well as from the UMEM fill
+ * and completion queues, is not included.
+ */
+
+ if (p->xsk)
+ xsk_socket__delete(p->xsk);
+
+ bcache_free(p->bc);
+
+ free(p);
+}
+
+static struct port *
+port_init(struct port_params *params)
+{
+ struct port *p;
+ u32 umem_fq_size, pos = 0;
+ int status, i;
+
+ /* Memory allocation and initialization. */
+ p = calloc(sizeof(struct port), 1);
+ if (!p)
+ return NULL;
+
+ memcpy(&p->params, params, sizeof(p->params));
+ umem_fq_size = params->bp->umem_cfg.fill_size;
+
+ /* bcache. */
+ p->bc = bcache_init(params->bp);
+ if (!p->bc ||
+ (bcache_slab_size(p->bc) < umem_fq_size) ||
+ (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) {
+ port_free(p);
+ return NULL;
+ }
+
+ /* xsk socket. */
+ status = xsk_socket__create_shared(&p->xsk,
+ params->iface,
+ params->iface_queue,
+ params->bp->umem,
+ &p->rxq,
+ &p->txq,
+ &p->umem_fq,
+ &p->umem_cq,
+ &params->xsk_cfg);
+ if (status) {
+ port_free(p);
+ return NULL;
+ }
+
+ /* umem fq. */
+ xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos);
+
+ for (i = 0; i < umem_fq_size; i++)
+ *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) =
+ bcache_cons(p->bc);
+
+ xsk_ring_prod__submit(&p->umem_fq, umem_fq_size);
+ p->umem_fq_initialized = 1;
+
+ return p;
+}
+
+static inline u32
+port_rx_burst(struct port *p, struct burst_rx *b)
+{
+ u32 n_pkts, pos, i;
+
+ /* Free buffers for FQ replenish. */
+ n_pkts = ARRAY_SIZE(b->addr);
+
+ n_pkts = bcache_cons_check(p->bc, n_pkts);
+ if (!n_pkts)
+ return 0;
+
+ /* RXQ. */
+ n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos);
+ if (!n_pkts) {
+ if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) {
+ struct pollfd pollfd = {
+ .fd = xsk_socket__fd(p->xsk),
+ .events = POLLIN,
+ };
+
+ poll(&pollfd, 1, 0);
+ }
+ return 0;
+ }
+
+ for (i = 0; i < n_pkts; i++) {
+ b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr;
+ b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len;
+ }
+
+ xsk_ring_cons__release(&p->rxq, n_pkts);
+ p->n_pkts_rx += n_pkts;
+
+ /* UMEM FQ. */
+ for ( ; ; ) {
+ int status;
+
+ status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos);
+ if (status == n_pkts)
+ break;
+
+ if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) {
+ struct pollfd pollfd = {
+ .fd = xsk_socket__fd(p->xsk),
+ .events = POLLIN,
+ };
+
+ poll(&pollfd, 1, 0);
+ }
+ }
+
+ for (i = 0; i < n_pkts; i++)
+ *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) =
+ bcache_cons(p->bc);
+
+ xsk_ring_prod__submit(&p->umem_fq, n_pkts);
+
+ return n_pkts;
+}
+
+static inline void
+port_tx_burst(struct port *p, struct burst_tx *b)
+{
+ u32 n_pkts, pos, i;
+ int status;
+
+ /* UMEM CQ. */
+ n_pkts = p->params.bp->umem_cfg.comp_size;
+
+ n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos);
+
+ for (i = 0; i < n_pkts; i++) {
+ u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i);
+
+ bcache_prod(p->bc, addr);
+ }
+
+ xsk_ring_cons__release(&p->umem_cq, n_pkts);
+
+ /* TXQ. */
+ n_pkts = b->n_pkts;
+
+ for ( ; ; ) {
+ status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos);
+ if (status == n_pkts)
+ break;
+
+ if (xsk_ring_prod__needs_wakeup(&p->txq))
+ sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT,
+ NULL, 0);
+ }
+
+ for (i = 0; i < n_pkts; i++) {
+ xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i];
+ xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i];
+ }
+
+ xsk_ring_prod__submit(&p->txq, n_pkts);
+ if (xsk_ring_prod__needs_wakeup(&p->txq))
+ sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
+ p->n_pkts_tx += n_pkts;
+}
+
+/*
+ * Thread
+ *
+ * Packet forwarding threads.
+ */
+#ifndef MAX_PORTS_PER_THREAD
+#define MAX_PORTS_PER_THREAD 16
+#endif
+
+struct thread_data {
+ struct port *ports_rx[MAX_PORTS_PER_THREAD];
+ struct port *ports_tx[MAX_PORTS_PER_THREAD];
+ u32 n_ports_rx;
+ struct burst_rx burst_rx;
+ struct burst_tx burst_tx[MAX_PORTS_PER_THREAD];
+ u32 cpu_core_id;
+ int quit;
+};
+
+static void swap_mac_addresses(void *data)
+{
+ struct ether_header *eth = (struct ether_header *)data;
+ struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
+ struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
+ struct ether_addr tmp;
+
+ tmp = *src_addr;
+ *src_addr = *dst_addr;
+ *dst_addr = tmp;
+}
+
+static void *
+thread_func(void *arg)
+{
+ struct thread_data *t = arg;
+ cpu_set_t cpu_cores;
+ u32 i;
+
+ CPU_ZERO(&cpu_cores);
+ CPU_SET(t->cpu_core_id, &cpu_cores);
+ pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores);
+
+ for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) {
+ struct port *port_rx = t->ports_rx[i];
+ struct port *port_tx = t->ports_tx[i];
+ struct burst_rx *brx = &t->burst_rx;
+ struct burst_tx *btx = &t->burst_tx[i];
+ u32 n_pkts, j;
+
+ /* RX. */
+ n_pkts = port_rx_burst(port_rx, brx);
+ if (!n_pkts)
+ continue;
+
+ /* Process & TX. */
+ for (j = 0; j < n_pkts; j++) {
+ u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]);
+ u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr,
+ addr);
+
+ swap_mac_addresses(pkt);
+
+ btx->addr[btx->n_pkts] = brx->addr[j];
+ btx->len[btx->n_pkts] = brx->len[j];
+ btx->n_pkts++;
+
+ if (btx->n_pkts == MAX_BURST_TX) {
+ port_tx_burst(port_tx, btx);
+ btx->n_pkts = 0;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Process
+ */
+static const struct bpool_params bpool_params_default = {
+ .n_buffers = 64 * 1024,
+ .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+ .mmap_flags = 0,
+
+ .n_users_max = 16,
+ .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
+};
+
+static const struct xsk_umem_config umem_cfg_default = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+ .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
+ .flags = 0,
+};
+
+static const struct port_params port_params_default = {
+ .xsk_cfg = {
+ .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .libbpf_flags = 0,
+ .xdp_flags = XDP_FLAGS_DRV_MODE,
+ .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY,
+ },
+
+ .bp = NULL,
+ .iface = NULL,
+ .iface_queue = 0,
+};
+
+#ifndef MAX_PORTS
+#define MAX_PORTS 64
+#endif
+
+#ifndef MAX_THREADS
+#define MAX_THREADS 64
+#endif
+
+static struct bpool_params bpool_params;
+static struct xsk_umem_config umem_cfg;
+static struct bpool *bp;
+
+static struct port_params port_params[MAX_PORTS];
+static struct port *ports[MAX_PORTS];
+static u64 n_pkts_rx[MAX_PORTS];
+static u64 n_pkts_tx[MAX_PORTS];
+static int n_ports;
+
+static pthread_t threads[MAX_THREADS];
+static struct thread_data thread_data[MAX_THREADS];
+static int n_threads;
+
+static void
+print_usage(char *prog_name)
+{
+ const char *usage =
+ "Usage:\n"
+ "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n"
+ "\n"
+ "-c CORE CPU core to run a packet forwarding thread\n"
+ " on. May be invoked multiple times.\n"
+ "\n"
+ "-b SIZE Number of buffers in the buffer pool shared\n"
+ " by all the forwarding threads. Default: %u.\n"
+ "\n"
+ "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n"
+ " pair specifies one forwarding port. May be\n"
+ " invoked multiple times.\n"
+ "\n"
+ "-q QUEUE Network interface queue for RX and TX. Each\n"
+ " (INTERFACE, QUEUE) pair specified one\n"
+ " forwarding port. Default: %u. May be invoked\n"
+ " multiple times.\n"
+ "\n";
+ printf(usage,
+ prog_name,
+ bpool_params_default.n_buffers,
+ port_params_default.iface_queue);
+}
+
+static int
+parse_args(int argc, char **argv)
+{
+ struct option lgopts[] = {
+ { NULL, 0, 0, 0 }
+ };
+ int opt, option_index;
+
+ /* Parse the input arguments. */
+ for ( ; ;) {
+ opt = getopt_long(argc, argv, "c:i:q:", lgopts, &option_index);
+ if (opt == EOF)
+ break;
+
+ switch (opt) {
+ case 'b':
+ bpool_params.n_buffers = atoi(optarg);
+ break;
+
+ case 'c':
+ if (n_threads == MAX_THREADS) {
+ printf("Max number of threads (%d) reached.\n",
+ MAX_THREADS);
+ return -1;
+ }
+
+ thread_data[n_threads].cpu_core_id = atoi(optarg);
+ n_threads++;
+ break;
+
+ case 'i':
+ if (n_ports == MAX_PORTS) {
+ printf("Max number of ports (%d) reached.\n",
+ MAX_PORTS);
+ return -1;
+ }
+
+ port_params[n_ports].iface = optarg;
+ port_params[n_ports].iface_queue = 0;
+ n_ports++;
+ break;
+
+ case 'q':
+ if (n_ports == 0) {
+ printf("No port specified for queue.\n");
+ return -1;
+ }
+ port_params[n_ports - 1].iface_queue = atoi(optarg);
+ break;
+
+ default:
+ printf("Illegal argument.\n");
+ return -1;
+ }
+ }
+
+ optind = 1; /* reset getopt lib */
+
+ /* Check the input arguments. */
+ if (!n_ports) {
+ printf("No ports specified.\n");
+ return -1;
+ }
+
+ if (!n_threads) {
+ printf("No threads specified.\n");
+ return -1;
+ }
+
+ if (n_ports % n_threads) {
+ printf("Ports cannot be evenly distributed to threads.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+print_port(u32 port_id)
+{
+ struct port *port = ports[port_id];
+
+ printf("Port %u: interface = %s, queue = %u\n",
+ port_id, port->params.iface, port->params.iface_queue);
+}
+
+static void
+print_thread(u32 thread_id)
+{
+ struct thread_data *t = &thread_data[thread_id];
+ u32 i;
+
+ printf("Thread %u (CPU core %u): ",
+ thread_id, t->cpu_core_id);
+
+ for (i = 0; i < t->n_ports_rx; i++) {
+ struct port *port_rx = t->ports_rx[i];
+ struct port *port_tx = t->ports_tx[i];
+
+ printf("(%s, %u) -> (%s, %u), ",
+ port_rx->params.iface,
+ port_rx->params.iface_queue,
+ port_tx->params.iface,
+ port_tx->params.iface_queue);
+ }
+
+ printf("\n");
+}
+
+static void
+print_port_stats_separator(void)
+{
+ printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n",
+ "----",
+ "------------",
+ "-------------",
+ "------------",
+ "-------------");
+}
+
+static void
+print_port_stats_header(void)
+{
+ print_port_stats_separator();
+ printf("| %4s | %12s | %13s | %12s | %13s |\n",
+ "Port",
+ "RX packets",
+ "RX rate (pps)",
+ "TX packets",
+ "TX_rate (pps)");
+ print_port_stats_separator();
+}
+
+static void
+print_port_stats_trailer(void)
+{
+ print_port_stats_separator();
+ printf("\n");
+}
+
+static void
+print_port_stats(int port_id, u64 ns_diff)
+{
+ struct port *p = ports[port_id];
+ double rx_pps, tx_pps;
+
+ rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff;
+ tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. / ns_diff;
+
+ printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n",
+ port_id,
+ p->n_pkts_rx,
+ rx_pps,
+ p->n_pkts_tx,
+ tx_pps);
+
+ n_pkts_rx[port_id] = p->n_pkts_rx;
+ n_pkts_tx[port_id] = p->n_pkts_tx;
+}
+
+static void
+print_port_stats_all(u64 ns_diff)
+{
+ int i;
+
+ print_port_stats_header();
+ for (i = 0; i < n_ports; i++)
+ print_port_stats(i, ns_diff);
+ print_port_stats_trailer();
+}
+
+static int quit;
+
+static void
+signal_handler(int sig)
+{
+ quit = 1;
+}
+
+static void remove_xdp_program(void)
+{
+ int i;
+
+ for (i = 0 ; i < n_ports; i++)
+ bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1,
+ port_params[i].xsk_cfg.xdp_flags);
+}
+
+int main(int argc, char **argv)
+{
+ struct timespec time;
+ u64 ns0;
+ int i;
+
+ /* Parse args. */
+ memcpy(&bpool_params, &bpool_params_default,
+ sizeof(struct bpool_params));
+ memcpy(&umem_cfg, &umem_cfg_default,
+ sizeof(struct xsk_umem_config));
+ for (i = 0; i < MAX_PORTS; i++)
+ memcpy(&port_params[i], &port_params_default,
+ sizeof(struct port_params));
+
+ if (parse_args(argc, argv)) {
+ print_usage(argv[0]);
+ return -1;
+ }
+
+ /* Buffer pool initialization. */
+ bp = bpool_init(&bpool_params, &umem_cfg);
+ if (!bp) {
+ printf("Buffer pool initialization failed.\n");
+ return -1;
+ }
+ printf("Buffer pool created successfully.\n");
+
+ /* Ports initialization. */
+ for (i = 0; i < MAX_PORTS; i++)
+ port_params[i].bp = bp;
+
+ for (i = 0; i < n_ports; i++) {
+ ports[i] = port_init(&port_params[i]);
+ if (!ports[i]) {
+ printf("Port %d initialization failed.\n", i);
+ return -1;
+ }
+ print_port(i);
+ }
+ printf("All ports created successfully.\n");
+
+ /* Threads. */
+ for (i = 0; i < n_threads; i++) {
+ struct thread_data *t = &thread_data[i];
+ u32 n_ports_per_thread = n_ports / n_threads, j;
+
+ for (j = 0; j < n_ports_per_thread; j++) {
+ t->ports_rx[j] = ports[i * n_ports_per_thread + j];
+ t->ports_tx[j] = ports[i * n_ports_per_thread +
+ (j + 1) % n_ports_per_thread];
+ }
+
+ t->n_ports_rx = n_ports_per_thread;
+
+ print_thread(i);
+ }
+
+ for (i = 0; i < n_threads; i++) {
+ int status;
+
+ status = pthread_create(&threads[i],
+ NULL,
+ thread_func,
+ &thread_data[i]);
+ if (status) {
+ printf("Thread %d creation failed.\n", i);
+ return -1;
+ }
+ }
+ printf("All threads created successfully.\n");
+
+ /* Print statistics. */
+ signal(SIGINT, signal_handler);
+ signal(SIGTERM, signal_handler);
+ signal(SIGABRT, signal_handler);
+
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ ns0 = time.tv_sec * 1000000000UL + time.tv_nsec;
+ for ( ; !quit; ) {
+ u64 ns1, ns_diff;
+
+ sleep(1);
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ ns1 = time.tv_sec * 1000000000UL + time.tv_nsec;
+ ns_diff = ns1 - ns0;
+ ns0 = ns1;
+
+ print_port_stats_all(ns_diff);
+ }
+
+ /* Threads completion. */
+ printf("Quit.\n");
+ for (i = 0; i < n_threads; i++)
+ thread_data[i].quit = 1;
+
+ for (i = 0; i < n_threads; i++)
+ pthread_join(threads[i], NULL);
+
+ for (i = 0; i < n_ports; i++)
+ port_free(ports[i]);
+
+ bpool_free(bp);
+
+ remove_xdp_program();
+
+ return 0;
+}
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
index e2398d94e8da..f9008be7a8a1 100644
--- a/samples/configfs/configfs_sample.c
+++ b/samples/configfs/configfs_sample.c
@@ -7,19 +7,17 @@
* macros defined by configfs.h
*
* Based on sysfs:
- * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*/
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
-
#include <linux/configfs.h>
-
-
/*
* 01-childless
*
@@ -40,8 +38,8 @@ struct childless {
static inline struct childless *to_childless(struct config_item *item)
{
- return item ? container_of(to_configfs_subsystem(to_config_group(item)),
- struct childless, subsys) : NULL;
+ return container_of(to_configfs_subsystem(to_config_group(item)),
+ struct childless, subsys);
}
static ssize_t childless_showme_show(struct config_item *item, char *page)
@@ -64,17 +62,11 @@ static ssize_t childless_storeme_store(struct config_item *item,
const char *page, size_t count)
{
struct childless *childless = to_childless(item);
- unsigned long tmp;
- char *p = (char *) page;
-
- tmp = simple_strtoul(p, &p, 10);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
- if (tmp > INT_MAX)
- return -ERANGE;
+ int ret;
- childless->storeme = tmp;
+ ret = kstrtoint(page, 10, &childless->storeme);
+ if (ret)
+ return ret;
return count;
}
@@ -117,7 +109,6 @@ static struct childless childless_subsys = {
},
};
-
/* ----------------------------------------------------------------- */
/*
@@ -136,7 +127,7 @@ struct simple_child {
static inline struct simple_child *to_simple_child(struct config_item *item)
{
- return item ? container_of(item, struct simple_child, item) : NULL;
+ return container_of(item, struct simple_child, item);
}
static ssize_t simple_child_storeme_show(struct config_item *item, char *page)
@@ -148,17 +139,11 @@ static ssize_t simple_child_storeme_store(struct config_item *item,
const char *page, size_t count)
{
struct simple_child *simple_child = to_simple_child(item);
- unsigned long tmp;
- char *p = (char *) page;
-
- tmp = simple_strtoul(p, &p, 10);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
- if (tmp > INT_MAX)
- return -ERANGE;
+ int ret;
- simple_child->storeme = tmp;
+ ret = kstrtoint(page, 10, &simple_child->storeme);
+ if (ret)
+ return ret;
return count;
}
@@ -176,7 +161,7 @@ static void simple_child_release(struct config_item *item)
}
static struct configfs_item_operations simple_child_item_ops = {
- .release = simple_child_release,
+ .release = simple_child_release,
};
static const struct config_item_type simple_child_type = {
@@ -185,15 +170,14 @@ static const struct config_item_type simple_child_type = {
.ct_owner = THIS_MODULE,
};
-
struct simple_children {
struct config_group group;
};
static inline struct simple_children *to_simple_children(struct config_item *item)
{
- return item ? container_of(to_config_group(item),
- struct simple_children, group) : NULL;
+ return container_of(to_config_group(item),
+ struct simple_children, group);
}
static struct config_item *simple_children_make_item(struct config_group *group,
@@ -208,8 +192,6 @@ static struct config_item *simple_children_make_item(struct config_group *group,
config_item_init_type_name(&simple_child->item, name,
&simple_child_type);
- simple_child->storeme = 0;
-
return &simple_child->item;
}
@@ -263,7 +245,6 @@ static struct configfs_subsystem simple_children_subsys = {
},
};
-
/* ----------------------------------------------------------------- */
/*
@@ -350,9 +331,8 @@ static struct configfs_subsystem *example_subsys[] = {
static int __init configfs_example_init(void)
{
- int ret;
- int i;
struct configfs_subsystem *subsys;
+ int ret, i;
for (i = 0; example_subsys[i]; i++) {
subsys = example_subsys[i];
@@ -361,9 +341,8 @@ static int __init configfs_example_init(void)
mutex_init(&subsys->su_mutex);
ret = configfs_register_subsystem(subsys);
if (ret) {
- printk(KERN_ERR "Error %d while registering subsystem %s\n",
- ret,
- subsys->su_group.cg_item.ci_namebuf);
+ pr_err("Error %d while registering subsystem %s\n",
+ ret, subsys->su_group.cg_item.ci_namebuf);
goto out_unregister;
}
}
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index 8b718943d603..365905cb24b1 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -2,13 +2,13 @@
/*
* NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
- * stack trace and selected registers when _do_fork() is called.
+ * stack trace and selected registers when kernel_clone() is called.
*
* For more information on theory of operation of kprobes, see
* Documentation/trace/kprobes.rst
*
* You will see the trace data in /var/log/messages and on the console
- * whenever _do_fork() is invoked to create a new process.
+ * whenever kernel_clone() is invoked to create a new process.
*/
#include <linux/kernel.h>
@@ -16,7 +16,7 @@
#include <linux/kprobes.h>
#define MAX_SYMBOL_LEN 64
-static char symbol[MAX_SYMBOL_LEN] = "_do_fork";
+static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
module_param_string(symbol, symbol, sizeof(symbol), 0644);
/* For each probe you need to allocate a kprobe structure */
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c
index 69fd1235108a..5dc1bf3baa98 100644
--- a/samples/kprobes/kretprobe_example.c
+++ b/samples/kprobes/kretprobe_example.c
@@ -8,7 +8,7 @@
*
* usage: insmod kretprobe_example.ko func=<func_name>
*
- * If no func_name is specified, _do_fork is instrumented
+ * If no func_name is specified, kernel_clone is instrumented
*
* For more information on theory of operation of kretprobes, see
* Documentation/trace/kprobes.rst
@@ -26,7 +26,7 @@
#include <linux/limits.h>
#include <linux/sched.h>
-static char func_name[NAME_MAX] = "_do_fork";
+static char func_name[NAME_MAX] = "kernel_clone";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
" function's execution time");
diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c
index a11bf6c5b53b..c03a05d498f0 100644
--- a/samples/mic/mpssd/mpssd.c
+++ b/samples/mic/mpssd/mpssd.c
@@ -403,9 +403,9 @@ mic_virtio_copy(struct mic_info *mic, int fd,
static inline unsigned _vring_size(unsigned int num, unsigned long align)
{
- return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
+ return _ALIGN_UP(((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
+ align - 1) & ~(align - 1))
- + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
+ + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num, 4);
}
/*
@@ -612,7 +612,7 @@ virtio_net(void *arg)
copy.out_len, hdr->gso_type);
#endif
#ifdef DEBUG
- disp_iovec(mic, copy, __func__, __LINE__);
+ disp_iovec(mic, &copy, __func__, __LINE__);
mpsslog("%s %s %d read from tap 0x%lx\n",
mic->name, __func__, __LINE__,
len);
@@ -632,7 +632,7 @@ virtio_net(void *arg)
if (!err)
verify_out_len(mic, &copy);
#ifdef DEBUG
- disp_iovec(mic, copy, __func__, __LINE__);
+ disp_iovec(mic, &copy, __func__, __LINE__);
mpsslog("%s %s %d wrote to net 0x%lx\n",
mic->name, __func__, __LINE__,
sum_iovec_len(&copy));
@@ -681,12 +681,12 @@ virtio_net(void *arg)
sizeof(struct virtio_net_hdr);
verify_out_len(mic, &copy);
#ifdef DEBUG
- disp_iovec(mic, copy, __func__,
+ disp_iovec(mic, &copy, __func__,
__LINE__);
mpsslog("%s %s %d ",
mic->name, __func__, __LINE__);
mpsslog("read from net 0x%lx\n",
- sum_iovec_len(copy));
+ sum_iovec_len(&copy));
#endif
len = writev(net_poll[NET_FD_TUN].fd,
copy.iov, copy.iovcnt);
@@ -814,7 +814,7 @@ virtio_console(void *arg)
len = readv(pty_fd, copy.iov, copy.iovcnt);
if (len > 0) {
#ifdef DEBUG
- disp_iovec(mic, copy, __func__, __LINE__);
+ disp_iovec(mic, &copy, __func__, __LINE__);
mpsslog("%s %s %d read from tap 0x%lx\n",
mic->name, __func__, __LINE__,
len);
@@ -834,10 +834,10 @@ virtio_console(void *arg)
if (!err)
verify_out_len(mic, &copy);
#ifdef DEBUG
- disp_iovec(mic, copy, __func__, __LINE__);
+ disp_iovec(mic, &copy, __func__, __LINE__);
mpsslog("%s %s %d wrote to net 0x%lx\n",
mic->name, __func__, __LINE__,
- sum_iovec_len(copy));
+ sum_iovec_len(&copy));
#endif
/* Reinitialize IOV for next run */
iov0->iov_len = PAGE_SIZE;
@@ -866,12 +866,12 @@ virtio_console(void *arg)
iov1->iov_len = copy.out_len;
verify_out_len(mic, &copy);
#ifdef DEBUG
- disp_iovec(mic, copy, __func__,
+ disp_iovec(mic, &copy, __func__,
__LINE__);
mpsslog("%s %s %d ",
mic->name, __func__, __LINE__);
mpsslog("read from net 0x%lx\n",
- sum_iovec_len(copy));
+ sum_iovec_len(&copy));
#endif
len = writev(pty_fd,
copy.iov, copy.iovcnt);
@@ -883,7 +883,7 @@ virtio_console(void *arg)
sum_iovec_len(&copy));
} else {
#ifdef DEBUG
- disp_iovec(mic, copy, __func__,
+ disp_iovec(mic, &copy, __func__,
__LINE__);
mpsslog("%s %s %d ",
mic->name, __func__,
diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore
new file mode 100644
index 000000000000..827934129c90
--- /dev/null
+++ b/samples/nitro_enclaves/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+ne_ioctl_sample
diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile
new file mode 100644
index 000000000000..a3ec78fefb52
--- /dev/null
+++ b/samples/nitro_enclaves/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample
+# usage.
+
+.PHONY: all clean
+
+CFLAGS += -Wall
+
+all:
+ $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread
+
+clean:
+ rm -f ne_ioctl_sample
diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c
new file mode 100644
index 000000000000..480b763142b3
--- /dev/null
+++ b/samples/nitro_enclaves/ne_ioctl_sample.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+
+/**
+ * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE)
+ * kernel driver.
+ *
+ * Usage
+ * -----
+ *
+ * Load the nitro_enclaves module, setting also the enclave CPU pool. The
+ * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its
+ * siblings have to remain available for the primary / parent VM, so they
+ * cannot be included in the enclave CPU pool.
+ *
+ * See the cpu list section from the kernel documentation.
+ * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists
+ *
+ * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko
+ * lsmod
+ *
+ * The CPU pool can be set at runtime, after the kernel module is loaded.
+ *
+ * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus
+ *
+ * NUMA and CPU siblings information can be found using:
+ *
+ * lscpu
+ * /proc/cpuinfo
+ *
+ * Check the online / offline CPU list. The CPUs from the pool should be
+ * offlined.
+ *
+ * lscpu
+ *
+ * Check dmesg for any warnings / errors through the NE driver lifetime / usage.
+ * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern.
+ *
+ * dmesg
+ *
+ * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as
+ * the enclave CPUs.
+ *
+ * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html
+ *
+ * By default, the allocation of hugetlb pages are distributed on all possible
+ * NUMA nodes. Use the following configuration files to set the number of huge
+ * pages from a NUMA node:
+ *
+ * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages
+ * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages
+ *
+ * or, if not on a system with multiple NUMA nodes, can also set the number
+ * of 2 MiB / 1 GiB huge pages using
+ *
+ * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
+ *
+ * In this example 256 hugepages of 2 MiB are used.
+ *
+ * Build and run the NE sample.
+ *
+ * make -C samples/nitro_enclaves clean
+ * make -C samples/nitro_enclaves
+ * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image>
+ *
+ * Unload the nitro_enclaves module.
+ *
+ * rmmod nitro_enclaves
+ * lsmod
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/mman.h>
+#include <linux/nitro_enclaves.h>
+#include <linux/vm_sockets.h>
+
+/**
+ * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface.
+ */
+#define NE_DEV_NAME "/dev/nitro_enclaves"
+
+/**
+ * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event.
+ */
+#define NE_POLL_WAIT_TIME (60)
+/**
+ * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event.
+ */
+#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000)
+
+/**
+ * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive.
+ */
+#define NE_SLEEP_TIME (300)
+
+/**
+ * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave.
+ */
+#define NE_DEFAULT_NR_VCPUS (2)
+
+/**
+ * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB.
+ */
+#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024)
+
+/**
+ * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for
+ * an enclave.
+ */
+#define NE_DEFAULT_NR_MEM_REGIONS (256)
+
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_CID (3)
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000)
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7)
+
+/**
+ * struct ne_user_mem_region - User space memory region set for an enclave.
+ * @userspace_addr: Address of the user space memory region.
+ * @memory_size: Size of the user space memory region.
+ */
+struct ne_user_mem_region {
+ void *userspace_addr;
+ size_t memory_size;
+};
+
+/**
+ * ne_create_vm() - Create a slot for the enclave VM.
+ * @ne_dev_fd: The file descriptor of the NE misc device.
+ * @slot_uid: The generated slot uid for the enclave.
+ * @enclave_fd : The generated file descriptor for the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd)
+{
+ int rc = -EINVAL;
+ *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid);
+
+ if (*enclave_fd < 0) {
+ rc = *enclave_fd;
+ switch (errno) {
+ case NE_ERR_NO_CPUS_AVAIL_IN_POOL: {
+ printf("Error in create VM, no CPUs available in the NE CPU pool\n");
+
+ break;
+ }
+
+ default:
+ printf("Error in create VM [%m]\n");
+ }
+
+ return rc;
+ }
+
+ return 0;
+}
+
+
+/**
+ * ne_poll_enclave_fd() - Thread function for polling the enclave fd.
+ * @data: Argument provided for the polling function.
+ *
+ * Context: Process context.
+ * Return:
+ * * NULL on success / failure.
+ */
+void *ne_poll_enclave_fd(void *data)
+{
+ int enclave_fd = *(int *)data;
+ struct pollfd fds[1] = {};
+ int i = 0;
+ int rc = -EINVAL;
+
+ printf("Running from poll thread, enclave fd %d\n", enclave_fd);
+
+ fds[0].fd = enclave_fd;
+ fds[0].events = POLLIN | POLLERR | POLLHUP;
+
+ /* Keep on polling until the current process is terminated. */
+ while (1) {
+ printf("[iter %d] Polling ...\n", i);
+
+ rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS);
+ if (rc < 0) {
+ printf("Error in poll [%m]\n");
+
+ return NULL;
+ }
+
+ i++;
+
+ if (!rc) {
+ printf("Poll: %d seconds elapsed\n",
+ i * NE_POLL_WAIT_TIME);
+
+ continue;
+ }
+
+ printf("Poll received value 0x%x\n", fds[0].revents);
+
+ if (fds[0].revents & POLLHUP) {
+ printf("Received POLLHUP\n");
+
+ return NULL;
+ }
+
+ if (fds[0].revents & POLLNVAL) {
+ printf("Received POLLNVAL\n");
+
+ return NULL;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave.
+ * @ne_user_mem_region: User space memory region allocated using hugetlbfs.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region)
+{
+ /**
+ * Check available hugetlb encodings for different huge page sizes in
+ * include/uapi/linux/mman.h.
+ */
+ ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS |
+ MAP_HUGETLB | MAP_HUGE_2MB, -1, 0);
+ if (ne_user_mem_region->userspace_addr == MAP_FAILED) {
+ printf("Error in mmap memory [%m]\n");
+
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * ne_load_enclave_image() - Place the enclave image in the enclave memory.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @ne_user_mem_regions: User space memory regions allocated for the enclave.
+ * @enclave_image_path : The file path of the enclave image.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[],
+ char *enclave_image_path)
+{
+ unsigned char *enclave_image = NULL;
+ int enclave_image_fd = -1;
+ size_t enclave_image_size = 0;
+ size_t enclave_memory_size = 0;
+ unsigned long i = 0;
+ size_t image_written_bytes = 0;
+ struct ne_image_load_info image_load_info = {
+ .flags = NE_EIF_IMAGE,
+ };
+ struct stat image_stat_buf = {};
+ int rc = -EINVAL;
+ size_t temp_image_offset = 0;
+
+ for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++)
+ enclave_memory_size += ne_user_mem_regions[i].memory_size;
+
+ rc = stat(enclave_image_path, &image_stat_buf);
+ if (rc < 0) {
+ printf("Error in get image stat info [%m]\n");
+
+ return rc;
+ }
+
+ enclave_image_size = image_stat_buf.st_size;
+
+ if (enclave_memory_size < enclave_image_size) {
+ printf("The enclave memory is smaller than the enclave image size\n");
+
+ return -ENOMEM;
+ }
+
+ rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info);
+ if (rc < 0) {
+ switch (errno) {
+ case NE_ERR_NOT_IN_INIT_STATE: {
+ printf("Error in get image load info, enclave not in init state\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_FLAG_VALUE: {
+ printf("Error in get image load info, provided invalid flag\n");
+
+ break;
+ }
+
+ default:
+ printf("Error in get image load info [%m]\n");
+ }
+
+ return rc;
+ }
+
+ printf("Enclave image offset in enclave memory is %lld\n",
+ image_load_info.memory_offset);
+
+ enclave_image_fd = open(enclave_image_path, O_RDONLY);
+ if (enclave_image_fd < 0) {
+ printf("Error in open enclave image file [%m]\n");
+
+ return enclave_image_fd;
+ }
+
+ enclave_image = mmap(NULL, enclave_image_size, PROT_READ,
+ MAP_PRIVATE, enclave_image_fd, 0);
+ if (enclave_image == MAP_FAILED) {
+ printf("Error in mmap enclave image [%m]\n");
+
+ return -1;
+ }
+
+ temp_image_offset = image_load_info.memory_offset;
+
+ for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+ size_t bytes_to_write = 0;
+ size_t memory_offset = 0;
+ size_t memory_size = ne_user_mem_regions[i].memory_size;
+ size_t remaining_bytes = 0;
+ void *userspace_addr = ne_user_mem_regions[i].userspace_addr;
+
+ if (temp_image_offset >= memory_size) {
+ temp_image_offset -= memory_size;
+
+ continue;
+ } else if (temp_image_offset != 0) {
+ memory_offset = temp_image_offset;
+ memory_size -= temp_image_offset;
+ temp_image_offset = 0;
+ }
+
+ remaining_bytes = enclave_image_size - image_written_bytes;
+ bytes_to_write = memory_size < remaining_bytes ?
+ memory_size : remaining_bytes;
+
+ memcpy(userspace_addr + memory_offset,
+ enclave_image + image_written_bytes, bytes_to_write);
+
+ image_written_bytes += bytes_to_write;
+
+ if (image_written_bytes == enclave_image_size)
+ break;
+ }
+
+ munmap(enclave_image, enclave_image_size);
+
+ close(enclave_image_fd);
+
+ return 0;
+}
+
+/**
+ * ne_set_user_mem_region() - Set a user space memory region for the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @ne_user_mem_region : User space memory region to be set for the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region)
+{
+ struct ne_user_memory_region mem_region = {
+ .flags = NE_DEFAULT_MEMORY_REGION,
+ .memory_size = ne_user_mem_region.memory_size,
+ .userspace_addr = (__u64)ne_user_mem_region.userspace_addr,
+ };
+ int rc = -EINVAL;
+
+ rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region);
+ if (rc < 0) {
+ switch (errno) {
+ case NE_ERR_NOT_IN_INIT_STATE: {
+ printf("Error in set user memory region, enclave not in init state\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_MEM_REGION_SIZE: {
+ printf("Error in set user memory region, mem size not multiple of 2 MiB\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_MEM_REGION_ADDR: {
+ printf("Error in set user memory region, invalid user space address\n");
+
+ break;
+ }
+
+ case NE_ERR_UNALIGNED_MEM_REGION_ADDR: {
+ printf("Error in set user memory region, unaligned user space address\n");
+
+ break;
+ }
+
+ case NE_ERR_MEM_REGION_ALREADY_USED: {
+ printf("Error in set user memory region, memory region already used\n");
+
+ break;
+ }
+
+ case NE_ERR_MEM_NOT_HUGE_PAGE: {
+ printf("Error in set user memory region, not backed by huge pages\n");
+
+ break;
+ }
+
+ case NE_ERR_MEM_DIFFERENT_NUMA_NODE: {
+ printf("Error in set user memory region, different NUMA node than CPUs\n");
+
+ break;
+ }
+
+ case NE_ERR_MEM_MAX_REGIONS: {
+ printf("Error in set user memory region, max memory regions reached\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_PAGE_SIZE: {
+ printf("Error in set user memory region, has page not multiple of 2 MiB\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_FLAG_VALUE: {
+ printf("Error in set user memory region, provided invalid flag\n");
+
+ break;
+ }
+
+ default:
+ printf("Error in set user memory region [%m]\n");
+ }
+
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * ne_free_mem_regions() - Unmap all the user space memory regions that were set
+ * aside for the enclave.
+ * @ne_user_mem_regions: The user space memory regions associated with an enclave.
+ *
+ * Context: Process context.
+ */
+static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[])
+{
+ unsigned int i = 0;
+
+ for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++)
+ munmap(ne_user_mem_regions[i].userspace_addr,
+ ne_user_mem_regions[i].memory_size);
+}
+
+/**
+ * ne_add_vcpu() - Add a vCPU to the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @vcpu_id: vCPU id to be set for the enclave, either provided or
+ * auto-generated (if provided vCPU id is 0).
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id)
+{
+ int rc = -EINVAL;
+
+ rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id);
+ if (rc < 0) {
+ switch (errno) {
+ case NE_ERR_NO_CPUS_AVAIL_IN_POOL: {
+ printf("Error in add vcpu, no CPUs available in the NE CPU pool\n");
+
+ break;
+ }
+
+ case NE_ERR_VCPU_ALREADY_USED: {
+ printf("Error in add vcpu, the provided vCPU is already used\n");
+
+ break;
+ }
+
+ case NE_ERR_VCPU_NOT_IN_CPU_POOL: {
+ printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n");
+
+ break;
+ }
+
+ case NE_ERR_VCPU_INVALID_CPU_CORE: {
+ printf("Error in add vcpu, the core id of the provided vCPU is invalid\n");
+
+ break;
+ }
+
+ case NE_ERR_NOT_IN_INIT_STATE: {
+ printf("Error in add vcpu, enclave not in init state\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_VCPU: {
+ printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n");
+
+ break;
+ }
+
+ default:
+ printf("Error in add vcpu [%m]\n");
+
+ }
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * ne_start_enclave() - Start the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info)
+{
+ int rc = -EINVAL;
+
+ rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info);
+ if (rc < 0) {
+ switch (errno) {
+ case NE_ERR_NOT_IN_INIT_STATE: {
+ printf("Error in start enclave, enclave not in init state\n");
+
+ break;
+ }
+
+ case NE_ERR_NO_MEM_REGIONS_ADDED: {
+ printf("Error in start enclave, no memory regions have been added\n");
+
+ break;
+ }
+
+ case NE_ERR_NO_VCPUS_ADDED: {
+ printf("Error in start enclave, no vCPUs have been added\n");
+
+ break;
+ }
+
+ case NE_ERR_FULL_CORES_NOT_USED: {
+ printf("Error in start enclave, enclave has no full cores set\n");
+
+ break;
+ }
+
+ case NE_ERR_ENCLAVE_MEM_MIN_SIZE: {
+ printf("Error in start enclave, enclave memory is less than min size\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_FLAG_VALUE: {
+ printf("Error in start enclave, provided invalid flag\n");
+
+ break;
+ }
+
+ case NE_ERR_INVALID_ENCLAVE_CID: {
+ printf("Error in start enclave, provided invalid enclave CID\n");
+
+ break;
+ }
+
+ default:
+ printf("Error in start enclave [%m]\n");
+ }
+
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * ne_start_enclave_check_booted() - Start the enclave and wait for a hearbeat
+ * from it, on a newly created vsock channel,
+ * to check it has booted.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_start_enclave_check_booted(int enclave_fd)
+{
+ struct sockaddr_vm client_vsock_addr = {};
+ int client_vsock_fd = -1;
+ socklen_t client_vsock_len = sizeof(client_vsock_addr);
+ struct ne_enclave_start_info enclave_start_info = {};
+ struct pollfd fds[1] = {};
+ int rc = -EINVAL;
+ unsigned char recv_buf = 0;
+ struct sockaddr_vm server_vsock_addr = {
+ .svm_family = AF_VSOCK,
+ .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID,
+ .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT,
+ };
+ int server_vsock_fd = -1;
+
+ server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+ if (server_vsock_fd < 0) {
+ rc = server_vsock_fd;
+
+ printf("Error in socket [%m]\n");
+
+ return rc;
+ }
+
+ rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr,
+ sizeof(server_vsock_addr));
+ if (rc < 0) {
+ printf("Error in bind [%m]\n");
+
+ goto out;
+ }
+
+ rc = listen(server_vsock_fd, 1);
+ if (rc < 0) {
+ printf("Error in listen [%m]\n");
+
+ goto out;
+ }
+
+ rc = ne_start_enclave(enclave_fd, &enclave_start_info);
+ if (rc < 0)
+ goto out;
+
+ printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid);
+
+ fds[0].fd = server_vsock_fd;
+ fds[0].events = POLLIN;
+
+ rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS);
+ if (rc < 0) {
+ printf("Error in poll [%m]\n");
+
+ goto out;
+ }
+
+ if (!rc) {
+ printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME);
+
+ rc = -ETIMEDOUT;
+
+ goto out;
+ }
+
+ if ((fds[0].revents & POLLIN) == 0) {
+ printf("Poll received value %d\n", fds[0].revents);
+
+ rc = -EINVAL;
+
+ goto out;
+ }
+
+ rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr,
+ &client_vsock_len);
+ if (rc < 0) {
+ printf("Error in accept [%m]\n");
+
+ goto out;
+ }
+
+ client_vsock_fd = rc;
+
+ /*
+ * Read the heartbeat value that the init process in the enclave sends
+ * after vsock connect.
+ */
+ rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf));
+ if (rc < 0) {
+ printf("Error in read [%m]\n");
+
+ goto out;
+ }
+
+ if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) {
+ printf("Read %d instead of %d\n", recv_buf,
+ NE_IMAGE_LOAD_HEARTBEAT_VALUE);
+
+ goto out;
+ }
+
+ /* Write the heartbeat value back. */
+ rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf));
+ if (rc < 0) {
+ printf("Error in write [%m]\n");
+
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+ close(server_vsock_fd);
+
+ return rc;
+}
+
+int main(int argc, char *argv[])
+{
+ int enclave_fd = -1;
+ unsigned int i = 0;
+ int ne_dev_fd = -1;
+ struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {};
+ unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {};
+ int rc = -EINVAL;
+ pthread_t thread_id = 0;
+ unsigned long slot_uid = 0;
+
+ if (argc != 2) {
+ printf("Usage: %s <path_to_enclave_image>\n", argv[0]);
+
+ exit(EXIT_FAILURE);
+ }
+
+ if (strlen(argv[1]) >= PATH_MAX) {
+ printf("The size of the path to enclave image is higher than max path\n");
+
+ exit(EXIT_FAILURE);
+ }
+
+ ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC);
+ if (ne_dev_fd < 0) {
+ printf("Error in open NE device [%m]\n");
+
+ exit(EXIT_FAILURE);
+ }
+
+ printf("Creating enclave slot ...\n");
+
+ rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd);
+
+ close(ne_dev_fd);
+
+ if (rc < 0)
+ exit(EXIT_FAILURE);
+
+ printf("Enclave fd %d\n", enclave_fd);
+
+ rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd);
+ if (rc < 0) {
+ printf("Error in thread create [%m]\n");
+
+ close(enclave_fd);
+
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+ ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE;
+
+ rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]);
+ if (rc < 0) {
+ printf("Error in alloc userspace memory region, iter %d\n", i);
+
+ goto release_enclave_fd;
+ }
+ }
+
+ rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]);
+ if (rc < 0)
+ goto release_enclave_fd;
+
+ for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+ rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]);
+ if (rc < 0) {
+ printf("Error in set memory region, iter %d\n", i);
+
+ goto release_enclave_fd;
+ }
+ }
+
+ printf("Enclave memory regions were added\n");
+
+ for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) {
+ /*
+ * The vCPU is chosen from the enclave vCPU pool, if the value
+ * of the vcpu_id is 0.
+ */
+ ne_vcpus[i] = 0;
+ rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]);
+ if (rc < 0) {
+ printf("Error in add vcpu, iter %d\n", i);
+
+ goto release_enclave_fd;
+ }
+
+ printf("Added vCPU %d to the enclave\n", ne_vcpus[i]);
+ }
+
+ printf("Enclave vCPUs were added\n");
+
+ rc = ne_start_enclave_check_booted(enclave_fd);
+ if (rc < 0) {
+ printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc);
+
+ goto release_enclave_fd;
+ }
+
+ printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME);
+
+ sleep(NE_SLEEP_TIME);
+
+ close(enclave_fd);
+
+ ne_free_mem_regions(ne_user_mem_regions);
+
+ exit(EXIT_SUCCESS);
+
+release_enclave_fd:
+ close(enclave_fd);
+ ne_free_mem_regions(ne_user_mem_regions);
+
+ exit(EXIT_FAILURE);
+}
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 3cc5e5921682..e03068917273 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -846,7 +846,7 @@ static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount,
0, dmabuf->mode.size, GFP_KERNEL) < 0)
goto err2;
- if (!dma_map_sg(at->dev, sg->sgl, sg->nents, direction))
+ if (dma_map_sgtable(at->dev, sg, direction, 0))
goto err3;
return sg;
@@ -868,6 +868,7 @@ static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+ dma_unmap_sgtable(at->dev, sg, direction, 0);
sg_free_table(sg);
kfree(sg);
}