author | David S. Miller <davem@davemloft.net> | 2021-03-09 18:07:05 -0800
committer | David S. Miller <davem@davemloft.net> | 2021-03-09 18:07:05 -0800
commit | c1acda9807e2bbe1d2026b44f37d959d6d8266c8 (patch)
tree | 6af2137ad95c0303f9b59d11fe7866e8ebfbcd07
parent | 05a59d79793d482f628a31753c671f2e92178a21 (diff)
parent | 32f91529e2bdbe0d92edb3ced41dfba4beffa84a (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2021-03-09
The following pull-request contains BPF updates for your *net-next* tree.
We've added 90 non-merge commits during the last 17 day(s) which contain
a total of 114 files changed, 5158 insertions(+), 1288 deletions(-).
The main changes are:
1) Faster bpf_redirect_map(), from Björn.
2) skmsg cleanup, from Cong.
3) Support for floating point types in BTF, from Ilya.
4) Documentation for sys_bpf commands, from Joe.
5) Support for sk_lookup in bpf_prog_test_run, from Lorenz.
6) Enable task local storage for tracing programs, from Song.
7) bpf_for_each_map_elem() helper, from Yonghong.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
114 files changed, 5158 insertions, 1286 deletions
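Much of this pull is documentation for the bpf(2) subcommands (the new Documentation/userspace-api/ebpf/ pages and the kernel-doc comments added to include/uapi/linux/bpf.h further below). As an illustration only, and not part of the patch itself, a minimal userspace sketch of the documented BPF_MAP_CREATE, BPF_MAP_UPDATE_ELEM and BPF_MAP_LOOKUP_ELEM commands might look like the following; it uses the raw syscall rather than libbpf, and error handling is reduced to perror():

/* Hypothetical sketch: create an array map, update one element, read it back.
 * Illustrates the bpf(2) commands documented in this series; not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 0, value = 42, readback = 0;
	int map_fd;

	/* BPF_MAP_CREATE: returns a new file descriptor referring to the map. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(key);
	attr.value_size  = sizeof(value);
	attr.max_entries = 1;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM: BPF_ANY creates the element or updates it in place. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&value;
	attr.flags  = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		perror("BPF_MAP_UPDATE_ELEM");

	/* BPF_MAP_LOOKUP_ELEM: copies the stored value into 'readback'. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&readback;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
		perror("BPF_MAP_LOOKUP_ELEM");

	printf("value = %u\n", readback);
	close(map_fd);
	return 0;
}

Per the documentation added below, the close-on-exec flag is set automatically on the returned map fd, and closing it deletes the map unless other references (for example a BPF_OBJ_PIN pin) remain.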
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 44dc789de2b4..846354cd2d69 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -84,6 +84,7 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ + #define BTF_KIND_FLOAT 16 /* Floating point */ Note that the type section encodes debug info, not just pure types. ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. @@ -95,8 +96,8 @@ Each type contains the following common data:: /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ @@ -452,6 +453,18 @@ map definition. * ``offset``: the in-section offset of the variable * ``size``: the size of the variable in bytes +2.2.16 BTF_KIND_FLOAT +~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: any valid offset + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FLOAT + * ``info.vlen``: 0 + * ``size``: the size of the float type in bytes: 2, 4, 8, 12 or 16. + +No additional type data follow ``btf_type``. + 3. BTF Kernel API ***************** diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 4f2874b729c3..a702f67dd45f 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -12,9 +12,6 @@ BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. -The primary info for the bpf syscall is available in the `man-pages`_ -for `bpf(2)`_. - BPF Type Format (BTF) ===================== @@ -35,6 +32,12 @@ Two sets of Questions and Answers (Q&A) are maintained. bpf_design_QA bpf_devel_QA +Syscall API +=========== + +The primary info for the bpf syscall is available in the `man-pages`_ +for `bpf(2)`_. For more information about the userspace API, see +Documentation/userspace-api/ebpf/index.rst. Helper functions ================ diff --git a/Documentation/userspace-api/ebpf/index.rst b/Documentation/userspace-api/ebpf/index.rst new file mode 100644 index 000000000000..473dfba78116 --- /dev/null +++ b/Documentation/userspace-api/ebpf/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +eBPF Userspace API +================== + +eBPF is a kernel mechanism to provide a sandboxed runtime environment in the +Linux kernel for runtime extension and instrumentation without changing kernel +source code or loading kernel modules. eBPF programs can be attached to various +kernel subsystems, including networking, tracing and Linux security modules +(LSM). + +For internal kernel documentation on eBPF, see Documentation/bpf/index.rst. + +.. toctree:: + :maxdepth: 1 + + syscall diff --git a/Documentation/userspace-api/ebpf/syscall.rst b/Documentation/userspace-api/ebpf/syscall.rst new file mode 100644 index 000000000000..ea9918084221 --- /dev/null +++ b/Documentation/userspace-api/ebpf/syscall.rst @@ -0,0 +1,24 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +eBPF Syscall +------------ + +:Authors: - Alexei Starovoitov <ast@kernel.org> + - Joe Stringer <joe@wand.net.nz> + - Michael Kerrisk <mtk.manpages@gmail.com> + +The primary info for the bpf syscall is available in the `man-pages`_ +for `bpf(2)`_. + +bpf() subcommand reference +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/uapi/linux/bpf.h + :doc: eBPF Syscall Preamble + +.. kernel-doc:: include/uapi/linux/bpf.h + :doc: eBPF Syscall Commands + +.. Links: +.. _man-pages: https://www.kernel.org/doc/man-pages/ +.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index d29b020e5622..1e2438b7afa0 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ place where this information is gathered. unshare spec_ctrl accelerators/ocxl + ebpf/index ioctl/index iommu media/index diff --git a/MAINTAINERS b/MAINTAINERS index f2e34ede1ab3..ed9fd1fb6932 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3233,6 +3233,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ F: Documentation/networking/filter.rst +F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* F: include/linux/filter.h @@ -3247,6 +3248,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/bpf_doc.py F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 82e520d2cb12..708a8b2ca25b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2973,7 +2973,8 @@ static int virtnet_probe(struct virtio_device *vdev) return -ENOMEM; /* Set up network device as normal. */ - dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE | + IFF_TX_SKB_NO_LINEAR; dev->netdev_ops = &virtnet_netdev; dev->features = NETIF_F_HIGHDMA; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cccaef1088ea..a25730eaa148 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -117,6 +118,9 @@ struct bpf_map_ops { void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + /* Misc helpers.*/ + int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. 
@@ -129,6 +133,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -295,6 +306,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +424,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -506,6 +521,11 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 +/* The maximum number of arguments passed through registers + * a single function may have. + */ +#define MAX_BPF_FUNC_REG_ARGS 5 + struct btf_func_model { u8 ret_size; u8 nr_args; @@ -1380,6 +1400,10 @@ void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, @@ -1429,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_buff; struct sk_buff; +struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1441,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); bool dev_map_can_have_prog(struct bpf_map *map); -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1470,6 +1493,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1499,6 +1525,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum 
bpf_func_id func_id); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1568,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} static inline bool dev_map_can_have_prog(struct bpf_map *map) { return false; @@ -1590,6 +1606,7 @@ static inline void __dev_flush(void) struct xdp_buff; struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -1614,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, return 0; } -static inline -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - static inline void __cpu_map_flush(void) { } @@ -1670,6 +1681,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } @@ -1684,6 +1702,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } + +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -1768,22 +1790,24 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); + +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); #else -static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static inline void bpf_sk_reuseport_detach(struct sock *sk) { - return -EOPNOTSUPP; } +#ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { @@ -1801,20 +1825,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } -#endif /* CONFIG_BPF_STREAM_PARSER */ -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} - -#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { 
@@ -1886,6 +1897,9 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index b2c9463f36a1..b902c580c48d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -126,7 +126,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 0d1c33ace398..479c101546ad 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - if (unlikely(!task->security)) - return NULL; - - return task->security + bpf_lsm_blob_sizes.lbs_task; -} - extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); -void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - return NULL; -} - static inline void bpf_inode_storage_free(struct inode *inode) { } -static inline void bpf_task_storage_free(struct task_struct *task) -{ -} - #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 99f7fd657d87..f883f01a5061 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -103,19 +103,17 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) -#if defined(CONFIG_BPF_STREAM_PARSER) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) -#endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif #ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 
reuseport_array_ops) #endif #endif diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. @@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 3b00fc906ccd..b2b85b2cad8e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,7 +646,8 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; - struct bpf_map *map; + u32 map_id; + enum bpf_map_type map_type; u32 kern_flags; struct bpf_nh_params nh; }; @@ -1472,4 +1473,32 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, + void *lookup_elem(struct bpf_map *map, u32 key)) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) + return XDP_ABORTED; + + ri->tgt_value = lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return flags; + } + + ri->tgt_index = ifindex; + ri->map_id = map->id; + ri->map_type = map->map_type; + + return XDP_REDIRECT; +} + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5b67ea89d5f2..b379d08a12ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1518,6 +1518,8 @@ struct net_device_ops { * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with + * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1551,6 +1553,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, + IFF_TX_SKB_NO_LINEAR = 1<<31, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1577,12 +1580,14 @@ enum netdev_priv_flags { #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED +#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM #define IFF_MACSEC IFF_MACSEC #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { diff --git a/include/linux/sched.h b/include/linux/sched.h index 
ef00bb22164c..e5b7d9054473 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_local_storage; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1351,6 +1352,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..0503c917d773 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -656,6 +656,7 @@ typedef unsigned char *sk_buff_data_t; * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) + * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on @@ -755,6 +756,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8edbbf5f2f93..6c09d94be2e9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -70,12 +70,6 @@ struct sk_psock_link { void *link_raw; }; -struct sk_psock_parser { - struct strparser strp; - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - struct sk_psock_work_state { struct sk_buff *skb; u32 len; @@ -90,7 +84,9 @@ struct sk_psock { u32 eval; struct sk_msg *cork; struct sk_psock_progs progs; - struct sk_psock_parser parser; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + struct strparser strp; +#endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; unsigned long state; @@ -100,6 +96,7 @@ struct sk_psock { void (*saved_unhash)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); + void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -305,9 +302,25 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) struct sk_psock *sk_psock_init(struct sock *sk, int node); +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +#else +static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + return -EOPNOTSUPP; +} + +static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ +} + +static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ +} +#endif + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); @@ -327,8 +340,6 @@ static inline void 
sk_psock_free_link(struct sk_psock_link *link) struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { @@ -389,7 +400,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) return psock; } -void sk_psock_stop(struct sock *sk, struct sk_psock *psock); void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) @@ -400,8 +410,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { - if (psock->parser.enabled) - psock->parser.saved_data_ready(sk); + if (psock->saved_data_ready) + psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } @@ -430,8 +440,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); @@ -440,6 +450,44 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; - return psock->parser.enabled; + return !!psock->saved_data_ready; +} + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. */ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; } +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 963cd86d12dd..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,36 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - void *data_end; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) @@ -2222,25 +2197,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); 
\ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, diff --git a/include/net/udp.h b/include/net/udp.h index a132a02b2f2c..d4d064c59232 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -#endif /* BPF_STREAM_PARSER */ +#endif #endif /* _UDP_H */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..9c0722c6d7ac 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void) { } -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 76a97176ab81..fcad3645a70b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -86,19 +86,15 @@ struct _bpf_dtab_netdev { }; #endif /* __DEVMAP_OBJ_TYPE */ -#define devmap_ifindex(tgt, map) \ - (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? 
\ - ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) - DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), + enum bpf_map_type map_type, + u32 map_id, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) @@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ), TP_fast_assign( + u32 ifindex = 0, map_index = index; + + if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; + } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + ifindex = index; + map_index = 0; + } + __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : - index; - __entry->map_id = map ? map->id : 0; - __entry->map_index = map ? index : 0; + __entry->to_ifindex = ifindex; + __entry->map_id = map_id; + __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" @@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); -#define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to) +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to) +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ - trace_xdp_redirect(dev, xdp, to, 0, map, index) +#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) -#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, map, index) +#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, 
err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -93,7 +93,717 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. 
+ * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. 
+ * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. 
+ * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. 
+ * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. 
+ * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. 
On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, @@ -393,6 +1103,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -720,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 @@ -1765,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -3909,6 +4632,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. 
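A minimal BPF-side sketch of the helper described above, assuming an illustrative BPF_MAP_TYPE_ARRAY named counters, a stack-resident cb_ctx context, and a tracepoint attach point; the count_nonzero() callback is a static function, as required, and flags is 0.

        /* Sketch only: count non-zero values in an array map with
         * bpf_for_each_map_elem(); names and sizes are illustrative.
         */
        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 64);
                __type(key, __u32);
                __type(value, __u64);
        } counters SEC(".maps");

        struct cb_ctx {
                __u64 nonzero;
        };

        /* Return 0 to continue the iteration, 1 to stop early. */
        static long count_nonzero(struct bpf_map *map, const void *key,
                                  void *value, void *ctx)
        {
                struct cb_ctx *c = ctx;

                if (*(__u64 *)value)
                        c->nonzero++;
                return 0;
        }

        SEC("tp/syscalls/sys_enter_getpid")
        int count_entries(void *raw_ctx)
        {
                struct cb_ctx c = { .nonzero = 0 };
                long n;

                /* callback_ctx must point into this program's stack */
                n = bpf_for_each_map_elem(&counters, count_nonzero, &c, 0);
                bpf_printk("visited %ld entries, %llu non-zero", n, c.nonzero);
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";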
+ * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4826,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4168,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { @@ -5205,7 +5958,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/init/Kconfig b/init/Kconfig index 22946fe5ded9..2c9cbd8e368c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1709,6 +1709,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select NET_SOCK_MSG if INET default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..7f33098ca63f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o 
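Stepping outside the diff for a moment: a minimal sketch of the raw type encoding behind the btf.h changes above, assuming uapi headers that already carry BTF_KIND_FLOAT; the name_off value of 1 (pointing at a hypothetical "float" string in the string section) is illustrative.

        /* Sketch only: a raw BTF entry for a 4-byte float.  No vlen, no
         * kind_flag, no data after struct btf_type; the kind now occupies
         * bits 24-28 of info.
         */
        #include <stdio.h>
        #include <linux/btf.h>

        int main(void)
        {
                struct btf_type flt = {
                        .name_off = 1,                 /* offset of "float" in the string section */
                        .info = BTF_KIND_FLOAT << 24,  /* kind_flag = 0, vlen = 0 */
                        .size = 4,                     /* one of 2, 4, 8, 12 or 16 */
                };

                printf("kind=%u vlen=%u size=%u\n",
                       BTF_INFO_KIND(flt.info), BTF_INFO_VLEN(flt.info), flt.size);
                return 0;
        }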
obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1f8453343bf2..463d25e1e67e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; +static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + u32 i, key, num_elems = 0; + struct bpf_array *array; + bool is_percpu; + u64 ret = 0; + void *val; + + if (flags != 0) + return -EINVAL; + + is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + array = container_of(map, struct bpf_array, map); + if (is_percpu) + migrate_disable(); + for (i = 0; i < map->max_entries; i++) { + if (is_percpu) + val = this_cpu_ptr(array->pptrs[i]); + else + val = array->value + array->elem_size * i; + num_elems++; + key = i; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)&key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + break; + } + + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, @@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, .iter_seq_info = &iter_seq_info, @@ -660,6 +698,8 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, .iter_seq_info = &iter_seq_info, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..da753721457c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static int inode_storage_map_btf_id; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 
0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e7..b305270b7a4b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) kfree_rcu(local_storage, rcu); @@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; + unsigned long flags; if (unlikely(!selem_linked_to_map(selem))) /* selem has already be unlinked from smap */ @@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference(SDATA(selem)->smap); b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + unsigned long flags; - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem) @@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, sdata = SDATA(selem); if (cacheit_lockit) { + unsigned long flags; + /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; @@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; + unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } unlock: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return SDATA(selem); unlock_err: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return ERR_PTR(err); } @@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map_bucket *b; @@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + __this_cpu_inc(*busy_counter); + } bpf_selem_unlink(selem); + if (busy_counter) { + __this_cpu_dec(*busy_counter); + migrate_enable(); + } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1622a44d1617..9829f381b51c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e0da0258b732..fd3c74ef608e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -15,21 +15,41 @@ #include <linux/bpf_local_storage.h> #include <linux/filter.h> #include <uapi/linux/btf.h> -#include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> DEFINE_BPF_STORAGE_CACHE(task_cache); +DEFINE_PER_CPU(int, bpf_task_storage_busy); + +static void bpf_task_storage_lock(void) +{ + migrate_disable(); + __this_cpu_inc(bpf_task_storage_busy); +} + +static void bpf_task_storage_unlock(void) +{ + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); +} + +static bool bpf_task_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + 
__this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); + return false; + } + return true; +} + static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if (!bsb) - return NULL; - return &bsb->storage; + return &task->bpf_storage; } static struct bpf_local_storage_data * @@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - struct bpf_storage_blob *bsb; - - bsb = bpf_task(task); - if (!bsb) - return NULL; - task_storage = rcu_dereference(bsb->storage); + task_storage = rcu_dereference(task->bpf_storage); if (!task_storage) return NULL; @@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task) struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct bpf_storage_blob *bsb; struct hlist_node *n; - - bsb = bpf_task(task); - if (!bsb) - return; + unsigned long flags; rcu_read_lock(); - local_storage = rcu_dereference(bsb->storage); + local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) { rcu_read_unlock(); return; @@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_bh(&local_storage->lock); + bpf_task_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task) free_task_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false); } - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); + bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task || !task_storage_ptr(task)) { + if (!task) { err = -ENOENT; goto out; } + bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags); + bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); err = task_storage_delete(task, map); + bpf_task_storage_unlock(); out: put_pid(pid); return err; @@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; - /* explicitly check that the task_storage_ptr is not - * NULL as task_storage_lookup returns NULL in this case and - * bpf_local_storage_update expects the owner to have a - * valid storage pointer. 
- */ - if (!task || !task_storage_ptr(task)) + if (!task) + return (unsigned long)NULL; + + if (!bpf_task_storage_trylock()) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); if (sdata) - return (unsigned long)sdata->data; + goto unlock; - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); - return IS_ERR(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; - } - return (unsigned long)NULL; +unlock: + bpf_task_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + int ret; + if (!task) return -EINVAL; + if (!bpf_task_storage_trylock()) + return -EBUSY; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - return task_storage_delete(task, map); + ret = task_storage_delete(task, map); + bpf_task_storage_unlock(); + return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } static int task_storage_map_btf_id; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b1a76fe046cb..369faeddf1df 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -173,7 +173,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x8f00ffff +#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -280,6 +280,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_type_str(const struct btf_type *t) @@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: return true; } @@ -1704,6 +1706,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: size = type->size; goto resolved; @@ -1849,7 +1852,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env, return -EINVAL; } -/* Used for ptr, array and struct/union type members. +/* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. 
*/ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, @@ -3675,6 +3678,81 @@ static const struct btf_kind_operations datasec_ops = { .show = btf_datasec_show, }; +static s32 btf_float_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && + t->size != 16) { + btf_verifier_log_type(env, t, "Invalid type_size"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_float_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u64 start_offset_bytes; + u64 end_offset_bytes; + u64 misalign_bits; + u64 align_bytes; + u64 align_bits; + + /* Different architectures have different alignment requirements, so + * here we check only for the reasonable minimum. This way we ensure + * that types after CO-RE can pass the kernel BTF verifier. + */ + align_bytes = min_t(u64, sizeof(void *), member_type->size); + align_bits = align_bytes * BITS_PER_BYTE; + div64_u64_rem(member->offset, align_bits, &misalign_bits); + if (misalign_bits) { + btf_verifier_log_member(env, struct_type, member, + "Member is not properly aligned"); + return -EINVAL; + } + + start_offset_bytes = member->offset / BITS_PER_BYTE; + end_offset_bytes = start_offset_bytes + member_type->size; + if (end_offset_bytes > struct_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static void btf_float_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u", t->size); +} + +static const struct btf_kind_operations float_ops = { + .check_meta = btf_float_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_float_check_member, + .check_kflag_member = btf_generic_check_kflag_member, + .log_details = btf_float_log, + .show = btf_df_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -3808,6 +3886,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, + [BTF_KIND_FLOAT] = &float_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4592,8 +4671,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ - nr_args = t ? btf_type_vlen(t) : 5; + /* if (t == NULL) Fall back to default BPF prog with + * MAX_BPF_FUNC_REG_ARGS u64 arguments. + */ + nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -4649,7 +4730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } else { if (!t) - /* Default prog with 5 args */ + /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } @@ -5100,12 +5181,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (!func) { /* BTF function prototype doesn't match the verifier types. 
- * Fall back to 5 u64 args. + * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < 5; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) m->arg_size[i] = 8; m->ret_size = 8; - m->nr_args = 5; + m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); @@ -5328,8 +5409,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, + MAX_BPF_FUNC_REG_ARGS); goto out; } @@ -5458,9 +5540,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", - tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..0cf2791d5099 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries @@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..7a5ad7331c3b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -392,7 +391,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +734,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +756,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +771,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d63912e73ad9..330d721dd2af 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; +static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + u32 roundup_key_size; + int i, num_elems = 0; + void __percpu *pptr; + struct bucket *b; + void *key, *val; + bool is_percpu; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + is_percpu = htab_is_percpu(htab); + + roundup_key_size = round_up(map->key_size, 8); + /* disable migration so percpu value prepared here will be the + * same as the one seen by the bpf program with bpf_map_lookup_elem(). 
+ */ + if (is_percpu) + migrate_disable(); + for (i = 0; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + rcu_read_lock(); + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + key = elem->key; + if (is_percpu) { + /* current cpu value for percpu map */ + pptr = htab_elem_get_ptr(elem, map->key_size); + val = this_cpu_ptr(pptr); + } else { + val = elem->key + roundup_key_size; + } + num_elems++; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } +out: + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, @@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, @@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, @@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c56e3fcb5f1a..f9096b049cd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; 
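A minimal sketch of the two-instruction ld_imm64 pattern that the bpf_pseudo_func() check above recognizes, assuming the updated uapi header that defines BPF_PSEUDO_FUNC; the destination register and the instruction offset of 7 are illustrative.

        /* Sketch only: how a callback reference looks at the instruction
         * level.  insn[0].imm is the instruction offset to the callback;
         * insn[1].imm must be zero at load time (the verifier later stores
         * the subprog index there while checking the program).
         */
        #include <linux/bpf.h>

        static const struct bpf_insn ld_callback[2] = {
                {
                        .code    = BPF_LD | BPF_DW | BPF_IMM,
                        .dst_reg = BPF_REG_2,
                        .src_reg = BPF_PSEUDO_FUNC,
                        .imm     = 7,   /* insn offset to the callback */
                },
                {
                        .imm     = 0,   /* second half of the ld_imm64 */
                },
        };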
@@ -390,6 +397,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, env->prev_linfo = linfo; } +static void verbose_invalid_scalar(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct tnum *range, const char *ctx, + const char *reg_name) +{ + char tn_buf[48]; + + verbose(env, "At %s the register %s ", ctx, reg_name); + if (!tnum_is_unknown(reg->var_off)) { + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + tnum_strn(tn_buf, sizeof(tn_buf), *range); + verbose(env, " should have been in %s\n", tn_buf); +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -409,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -451,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -541,6 +568,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -612,6 +641,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1519,7 +1549,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) } ret = find_subprog(env, off); if (ret >= 0) - return 0; + return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; @@ -1527,7 +1557,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); - return 0; + return env->subprog_cnt - 1; } static int check_subprogs(struct bpf_verifier_env *env) @@ -1544,6 +1574,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. 
The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2295,6 +2338,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2899,6 +2944,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3304,6 +3353,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3405,7 +3457,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3842,7 +3894,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4258,6 +4322,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? 
BPF_WRITE : @@ -4474,6 +4541,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4505,6 +4573,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4517,6 +4586,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4529,6 +4599,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4557,6 +4629,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4738,6 +4812,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5258,13 +5334,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx); + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; - int i, err, subprog, target_insn; + int err; bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { @@ -5273,14 +5355,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -E2BIG; } - target_insn = *insn_idx + insn->imm; - subprog = find_subprog(env, target_insn + 1); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn + 1); - return -EFAULT; - } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", @@ -5335,11 +5409,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err) return err; - /* copy r1 - r5 args that callee can access. 
The copy includes parent - * pointers, which connects us up to the liveness chain - */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - callee->regs[i] = caller->regs[i]; + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; clear_caller_saved_regs(env, caller->regs); @@ -5347,7 +5419,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe++; /* and go analyze first insn of the callee */ - *insn_idx = target_insn; + *insn_idx = env->subprog_info[subprog].start - 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); @@ -5358,6 +5430,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee) +{ + /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, + * void *callback_ctx, u64 flags); + * callback_fn(struct bpf_map *map, void *key, void *value, + * void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* pointer to stack or null */ + callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx) +{ + int i; + + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + int subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. 
No program starts at insn %d\n", + target_insn); + return -EFAULT; + } + + return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); +} + +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5380,8 +5538,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5436,7 +5608,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -5517,15 +5691,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? 
-EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5571,7 +5748,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - for (i = 0; i < 5; i++) { + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -5621,6 +5798,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5874,6 +6058,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. + */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -5904,7 +6101,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, aux->alu_limit != alu_limit)) return -EACCES; - /* Corresponding fixup done in fixup_bpf_calls(). */ + /* Corresponding fixup done in do_misc_fixups(). 
*/ aux->alu_state = alu_state; aux->alu_limit = alu_limit; return 0; @@ -6075,6 +6272,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8254,6 +8452,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8482,17 +8698,7 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { - char tn_buf[48]; - - verbose(env, "At program exit the register R0 "); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); - } - tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been in %s\n", tn_buf); + verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); return -EINVAL; } @@ -8619,6 +8825,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } +static int visit_func_call_insn(int t, int insn_cnt, + struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) +{ + int ret; + + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (visit_callee) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -8629,6 +8856,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. 
*/ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -8639,18 +8869,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret) - return ret; - - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - } - return ret; + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: if (BPF_SRC(insns[t].code) != BPF_K) @@ -9259,6 +9479,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10105,10 +10326,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10537,6 +10757,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10669,9 +10895,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11310,6 +11540,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11439,6 +11675,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11484,6 +11726,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. 
*/ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11548,6 +11795,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); @@ -11560,12 +11815,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -/* fixup insn->imm field of bpf_call instructions - * and inline eligible helpers as explicit sequence of BPF instructions - * - * this function is called after eBPF program passed verification +/* Do various post-verification rewrites in a single program pass. + * These rewrites simplify JIT and interpreter implementations. */ -static int fixup_bpf_calls(struct bpf_verifier_env *env) +static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); @@ -11580,6 +11833,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -11620,6 +11874,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { @@ -11639,6 +11894,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Rewrite pointer arithmetic to mitigate speculation attacks. */ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; @@ -11787,7 +12043,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -11829,6 +12086,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -11855,11 +12115,16 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; } + /* Implement bpf_jiffies64 inline. 
*/ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_jiffies64) { struct bpf_insn ld_jiffies_addr[2] = { @@ -12670,7 +12935,7 @@ skip_full_check: ret = convert_ctx_accesses(env); if (ret == 0) - ret = fixup_bpf_calls(env); + ret = do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..b94391a58708 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/bpf.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -2064,6 +2066,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(p->bpf_storage, NULL); +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1367,6 +1367,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/net/Kconfig b/net/Kconfig index 8cea808ad9e8..0ead7ec0d2bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -317,13 +317,9 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG help - Enabling this allows a stream parser to be used with + Enabling this allows a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. - config NET_FLOW_LIMIT bool depends on RPS diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 58bcb8c849d5..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,20 +10,86 @@ #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/net_namespace.h> #include <linux/error-injection.h> #include <linux/smp.h> +#include <linux/sock_diag.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> +struct bpf_test_timer { + enum { NO_PREEMPT, NO_MIGRATE } mode; + u32 i; + u64 time_start, time_spent; +}; + +static void bpf_test_timer_enter(struct bpf_test_timer *t) + __acquires(rcu) +{ + rcu_read_lock(); + if (t->mode == NO_PREEMPT) + preempt_disable(); + else + migrate_disable(); + + t->time_start = ktime_get_ns(); +} + +static void bpf_test_timer_leave(struct bpf_test_timer *t) + __releases(rcu) +{ + t->time_start = 0; + + if (t->mode == NO_PREEMPT) + preempt_enable(); + else + migrate_enable(); + rcu_read_unlock(); +} + +static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) + __must_hold(rcu) +{ + t->i++; + if (t->i >= repeat) { + /* We're done. 
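Editor's note: the kernel/fork.c and kernel/trace/bpf_trace.c hunks above are the glue that enables task local storage for tracing programs: the per-task storage pointer is initialised at fork, freed in __put_task_struct(), and the task_storage helpers become visible from the tracing proto table. A hedged sketch of program-side usage; the map name, tracepoint and section are illustrative, and vmlinux.h is the usual generated BTF header:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} exec_counts SEC(".maps");

SEC("tp_btf/sched_process_exec")
int BPF_PROG(count_exec, struct task_struct *p, pid_t old_pid,
	     struct linux_binprm *bprm)
{
	__u64 *cnt;

	/* Create the per-task slot on first use; it is freed with the task. */
	cnt = bpf_task_storage_get(&exec_counts, p, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";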
*/ + t->time_spent += ktime_get_ns() - t->time_start; + do_div(t->time_spent, t->i); + *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; + *err = 0; + goto reset; + } + + if (signal_pending(current)) { + /* During iteration: we've been cancelled, abort. */ + *err = -EINTR; + goto reset; + } + + if (need_resched()) { + /* During iteration: we need to reschedule between runs. */ + t->time_spent += ktime_get_ns() - t->time_start; + bpf_test_timer_leave(t); + cond_resched(); + bpf_test_timer_enter(t); + } + + /* Do another round. */ + return true; + +reset: + t->i = 0; + return false; +} + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; - u64 time_start, time_spent = 0; - int ret = 0; - u32 i; + int ret; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -38,40 +104,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (!repeat) repeat = 1; - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { + bpf_test_timer_enter(&t); + do { bpf_cgroup_storage_set(storage); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - cond_resched(); - - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - } - } - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - do_div(time_spent, repeat); - *time = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); @@ -674,18 +716,17 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; - u64 time_start, time_spent = 0; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; - u32 i; if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; @@ -721,48 +762,127 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ctx.data = data; ctx.data_end = (__u8 *)data + size; - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { + bpf_test_timer_enter(&t); + do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); - if (signal_pending(current)) { - preempt_enable(); - rcu_read_unlock(); + if (ret < 0) + goto out; - ret = -EINTR; - goto out; - } + ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), + retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); +out: + kfree(user_ctx); + kfree(data); + return ret; +} - cond_resched(); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - } + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; } - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); - do_div(time_spent, repeat); - duration = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; - ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), - retval, duration); + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); if (!ret) - ret = bpf_ctx_finish(kattr, uattr, user_ctx, - sizeof(struct bpf_flow_keys)); + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: + bpf_prog_array_free(progs); kfree(user_ctx); - kfree(data); return ret; } diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..0c2233c826fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..b6732000d8a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + 
return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3412,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3448,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3466,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); @@ -3577,7 +3583,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3747,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3810,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -3919,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3944,71 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - return NULL; - } -} - -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not 
needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4017,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; - - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst = fwd; + int err; - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4059,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct 
bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; - - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4094,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } @@ -4113,28 +4076,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. 
- */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { @@ -9655,22 +9597,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, @@ -10449,6 +10409,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..07f54015238a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -618,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -645,15 +651,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. 
*/ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -685,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -743,27 +749,12 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - bpf_compute_data_end_sk_skb(skb); - return bpf_prog_run_pin_on_cpu(prog, skb); -} - -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -806,16 +797,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -827,7 +819,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -839,8 +830,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. 
If its not empty we have to @@ -866,6 +856,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -881,11 +889,12 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -899,15 +908,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -923,16 +932,59 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + return strp_init(&psock->strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.stream_parser) + strp_done(&psock->strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -957,11 +1009,12 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = 
READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -984,82 +1037,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..dd53a7771d7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,9 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -148,9 +151,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) 
verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +227,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +252,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +286,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +306,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -657,7 +660,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +669,7 @@ 
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1251,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1260,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1448,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; @@ -1461,11 +1460,13 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index bc7d2a586e18..17c322b875fd 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4faabd1ecfd1..a71ed664da0a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -445,6 +445,97 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct xsk_buff_pool *pool = xs->pool; + u32 hr, len, ts, offset, copy, copied; + struct sk_buff *skb; + struct page *page; + void *buffer; + int err, i; + u64 addr; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + + addr = desc->addr; + len = desc->len; + ts = pool->unaligned ? 
len : pool->chunk_size; + + buffer = xsk_buff_raw_get_data(pool, addr); + offset = offset_in_page(buffer); + addr = buffer - pool->addrs; + + for (copied = 0, i = 0; copied < len; i++) { + page = pool->umem->pgs[addr >> PAGE_SHIFT]; + get_page(page); + + copy = min_t(u32, PAGE_SIZE - offset, len - copied); + skb_fill_page_desc(skb, i, page, offset, copy); + + copied += copy; + addr += copy; + offset = 0; + } + + skb->len += len; + skb->data_len += len; + skb->truesize += ts; + + refcount_add(ts, &xs->sk.sk_wmem_alloc); + + return skb; +} + +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct net_device *dev = xs->dev; + struct sk_buff *skb; + + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { + skb = xsk_build_skb_zerocopy(xs, desc); + if (IS_ERR(skb)) + return skb; + } else { + u32 hr, tr, len; + void *buffer; + int err; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + len = desc->len; + + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + skb_put(skb, len); + + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + return ERR_PTR(err); + } + } + + skb->dev = dev; + skb->priority = xs->sk.sk_priority; + skb->mark = xs->sk.sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; + skb->destructor = xsk_destruct_skb; + + return skb; +} + static int xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); @@ -461,43 +552,30 @@ static int xsk_generic_xmit(struct sock *sk) goto out; while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { - char *buffer; - u64 addr; - u32 len; - if (max_batch-- == 0) { err = -EAGAIN; goto out; } - len = desc.len; - skb = sock_alloc_send_skb(sk, len, 1, &err); - if (unlikely(!skb)) + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); goto out; + } - skb_put(skb, len); - addr = desc.addr; - buffer = xsk_buff_raw_get_data(xs->pool, addr); - err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - skb->dev = xs->dev; - skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; - skb->destructor = xsk_destruct_skb; - err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 2823b7c3302d..2ac3802c2cd7 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -47,19 +47,18 @@ struct xsk_queue { u64 queue_empty_descs; }; -/* The structure of the shared state of the rings are the same as the - * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion - * ring, the kernel is the producer and user space is the consumer. For - * the Tx and fill rings, the kernel is the consumer and user space is - * the producer. 
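Editor's note: the xsk_queue.h hunk just below replaces the explicit smp_wmb()/smp_rmb()/smp_mb() pairs with store-release/load-acquire on the ring pointers. A self-contained toy ring showing the same pairing; the demo_ring type and sizes are invented for illustration, and the letters match the A-D annotations in the updated comment:

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <asm/barrier.h>

struct demo_ring {
	u32 producer;
	u32 consumer;
	u64 entries[64];
};

static bool demo_produce(struct demo_ring *r, u64 val)
{
	u32 prod = READ_ONCE(r->producer);

	/* A: the branch on ->consumer is a control dependency that orders
	 * the data store below after the load, so no barrier is needed.
	 */
	if (prod - READ_ONCE(r->consumer) == ARRAY_SIZE(r->entries))
		return false;				/* ring full */

	r->entries[prod & (ARRAY_SIZE(r->entries) - 1)] = val;
	smp_store_release(&r->producer, prod + 1);	/* B, pairs with C */
	return true;
}

static bool demo_consume(struct demo_ring *r, u64 *val)
{
	u32 cons = READ_ONCE(r->consumer);
	u32 prod = smp_load_acquire(&r->producer);	/* C, pairs with B */

	if (cons == prod)
		return false;				/* ring empty */

	*val = r->entries[cons & (ARRAY_SIZE(r->entries) - 1)];
	smp_store_release(&r->consumer, cons + 1);	/* D, pairs with A */
	return true;
}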
+/* The structure of the shared state of the rings are a simple + * circular buffer, as outlined in + * Documentation/core-api/circular-buffers.rst. For the Rx and + * completion ring, the kernel is the producer and user space is the + * consumer. For the Tx and fill rings, the kernel is the consumer and + * user space is the producer. * * producer consumer * - * if (LOAD ->consumer) { LOAD ->producer - * (A) smp_rmb() (C) + * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->producer STORE ->consumer + * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). @@ -78,7 +77,8 @@ struct xsk_queue { * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no - * room in the buffer to store $data we do not. So no barrier is needed. + * room in the buffer to store $data we do not. The dependency will + * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory @@ -227,15 +227,13 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, static inline void __xskq_cons_release(struct xsk_queue *q) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cached_cons); + smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - smp_rmb(); /* C, matches B */ + q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) @@ -397,9 +395,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); + smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 113fd9017203..67b4ce504852 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - bpf_clear_redirect_map(map); synchronize_net(); bpf_map_area_free(m); } @@ -125,6 +124,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(m->xsk_map[key]); +} + static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -215,6 +224,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); +} + void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry) { @@ -247,4 +261,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "xsk_map", .map_btf_id = &xsk_map_btf_id, + .map_redirect = xsk_map_redirect, }; diff --git a/scripts/bpf_helpers_doc.py 
b/scripts/bpf_doc.py index 867ada23281c..2d94025b38e9 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_doc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-only # # Copyright (C) 2018-2019 Netronome Systems, Inc. +# Copyright (C) 2021 Isovalent, Inc. # In case user attempts to run with Python 2. from __future__ import print_function @@ -13,6 +14,9 @@ import sys, os class NoHelperFound(BaseException): pass +class NoSyscallCommandFound(BaseException): + pass + class ParsingError(BaseException): def __init__(self, line='<line not provided>', reader=None): if reader: @@ -22,18 +26,27 @@ class ParsingError(BaseException): else: BaseException.__init__(self, 'Error parsing line: %s' % line) -class Helper(object): + +class APIElement(object): """ - An object representing the description of an eBPF helper function. - @proto: function prototype of the helper function - @desc: textual description of the helper function - @ret: description of the return value of the helper function + An object representing the description of an aspect of the eBPF API. + @proto: prototype of the API symbol + @desc: textual description of the symbol + @ret: (optional) description of any associated return value """ def __init__(self, proto='', desc='', ret=''): self.proto = proto self.desc = desc self.ret = ret + +class Helper(APIElement): + """ + An object representing the description of an eBPF helper function. + @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ def proto_break_down(self): """ Break down helper function protocol into smaller chunks: return type, @@ -60,6 +73,7 @@ class Helper(object): return res + class HeaderParser(object): """ An object used to parse a file in order to extract the documentation of a @@ -72,6 +86,13 @@ class HeaderParser(object): self.reader = open(filename, 'r') self.line = '' self.helpers = [] + self.commands = [] + + def parse_element(self): + proto = self.parse_symbol() + desc = self.parse_desc() + ret = self.parse_ret() + return APIElement(proto=proto, desc=desc, ret=ret) def parse_helper(self): proto = self.parse_proto() @@ -79,6 +100,18 @@ class HeaderParser(object): ret = self.parse_ret() return Helper(proto=proto, desc=desc, ret=ret) + def parse_symbol(self): + p = re.compile(' \* ?(.+)$') + capture = p.match(self.line) + if not capture: + raise NoSyscallCommandFound + end_re = re.compile(' \* ?NOTES$') + end = end_re.match(self.line) + if end: + raise NoSyscallCommandFound + self.line = self.reader.readline() + return capture.group(1) + def parse_proto(self): # Argument can be of shape: # - "void" @@ -140,16 +173,29 @@ class HeaderParser(object): break return ret - def run(self): - # Advance to start of helper function descriptions. 
- offset = self.reader.read().find('* Start of BPF helper function descriptions:') + def seek_to(self, target, help_message): + self.reader.seek(0) + offset = self.reader.read().find(target) if offset == -1: - raise Exception('Could not find start of eBPF helper descriptions list') + raise Exception(help_message) self.reader.seek(offset) self.reader.readline() self.reader.readline() self.line = self.reader.readline() + def parse_syscall(self): + self.seek_to('* DOC: eBPF Syscall Commands', + 'Could not find start of eBPF syscall descriptions list') + while True: + try: + command = self.parse_element() + self.commands.append(command) + except NoSyscallCommandFound: + break + + def parse_helpers(self): + self.seek_to('* Start of BPF helper function descriptions:', + 'Could not find start of eBPF helper descriptions list') while True: try: helper = self.parse_helper() @@ -157,6 +203,9 @@ class HeaderParser(object): except NoHelperFound: break + def run(self): + self.parse_syscall() + self.parse_helpers() self.reader.close() ############################################################################### @@ -165,10 +214,11 @@ class Printer(object): """ A generic class for printers. Printers should be created with an array of Helper objects, and implement a way to print them in the desired fashion. - @helpers: array of Helper objects to print to standard output + @parser: A HeaderParser with objects to print to standard output """ - def __init__(self, helpers): - self.helpers = helpers + def __init__(self, parser): + self.parser = parser + self.elements = [] def print_header(self): pass @@ -181,19 +231,23 @@ class Printer(object): def print_all(self): self.print_header() - for helper in self.helpers: - self.print_one(helper) + for elem in self.elements: + self.print_one(elem) self.print_footer() + class PrinterRST(Printer): """ - A printer for dumping collected information about helpers as a ReStructured - Text page compatible with the rst2man program, which can be used to - generate a manual page for the helpers. - @helpers: array of Helper objects to print to standard output + A generic class for printers that print ReStructured Text. Printers should + be created with a HeaderParser object, and implement a way to print API + elements in the desired fashion. + @parser: A HeaderParser with objects to print to standard output """ - def print_header(self): - header = '''\ + def __init__(self, parser): + self.parser = parser + + def print_license(self): + license = '''\ .. Copyright (C) All BPF authors and contributors from 2014 to present. .. See git log include/uapi/linux/bpf.h in kernel tree for details. .. @@ -221,9 +275,39 @@ class PrinterRST(Printer): .. .. Please do not edit this file. It was generated from the documentation .. located in file include/uapi/linux/bpf.h of the Linux kernel sources -.. (helpers description), and from scripts/bpf_helpers_doc.py in the same +.. (helpers description), and from scripts/bpf_doc.py in the same .. repository (header and footer). +''' + print(license) + def print_elem(self, elem): + if (elem.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. 
+ for line in re.sub('\n$', '', elem.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (elem.ret): + print('\tReturn') + for line in elem.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + + +class PrinterHelpersRST(PrinterRST): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + def print_header(self): + header = '''\ =========== BPF-HELPERS =========== @@ -264,6 +348,7 @@ kernel at the top). HELPERS ======= ''' + PrinterRST.print_license(self) print(header) def print_footer(self): @@ -380,27 +465,50 @@ SEE ALSO def print_one(self, helper): self.print_proto(helper) + self.print_elem(helper) - if (helper.desc): - print('\tDescription') - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - if (helper.ret): - print('\tReturn') - for line in helper.ret.rstrip().split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) +class PrinterSyscallRST(PrinterRST): + """ + A printer for dumping collected information about the syscall API as a + ReStructured Text page compatible with the rst2man program, which can be + used to generate a manual page for the syscall. + @parser: A HeaderParser with APIElement objects to print to standard + output + """ + def __init__(self, parser): + self.elements = parser.commands + + def print_header(self): + header = '''\ +=== +bpf +=== +------------------------------------------------------------------------------- +Perform a command on an extended BPF object +------------------------------------------------------------------------------- + +:Manual section: 2 + +COMMANDS +======== +''' + PrinterRST.print_license(self) + print(header) + + def print_one(self, command): + print('**%s**' % (command.proto)) + self.print_elem(command) - print('') class PrinterHelpers(Printer): """ A printer for dumping collected information about helpers as C header to be included from BPF program. - @helpers: array of Helper objects to print to standard output + @parser: A HeaderParser with Helper objects to print to standard output """ + def __init__(self, parser): + self.elements = parser.helpers type_fwds = [ 'struct bpf_fib_lookup', @@ -511,7 +619,7 @@ class PrinterHelpers(Printer): def print_header(self): header = '''\ -/* This is auto-generated file. See bpf_helpers_doc.py for details. */ +/* This is auto-generated file. See bpf_doc.py for details. */ /* Forward declarations of BPF structs */''' @@ -589,8 +697,13 @@ script = os.path.abspath(sys.argv[0]) linuxRoot = os.path.dirname(os.path.dirname(script)) bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') +printers = { + 'helpers': PrinterHelpersRST, + 'syscall': PrinterSyscallRST, +} + argParser = argparse.ArgumentParser(description=""" -Parse eBPF header file and generate documentation for eBPF helper functions. +Parse eBPF header file and generate documentation for the eBPF API. The RST-formatted output produced can be turned into a manual page with the rst2man utility. 
""") @@ -601,6 +714,8 @@ if (os.path.isfile(bpfh)): default=bpfh) else: argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +argParser.add_argument('target', nargs='?', default='helpers', + choices=printers.keys(), help='eBPF API target') args = argParser.parse_args() # Parse file. @@ -609,7 +724,9 @@ headerParser.run() # Print formatted output to standard output. if args.header: - printer = PrinterHelpers(headerParser.helpers) + if args.target != 'helpers': + raise NotImplementedError('Only helpers header generation is supported') + printer = PrinterHelpers(headerParser) else: - printer = PrinterRST(headerParser.helpers) + printer = printers[args.target](headerParser) printer.print_all() diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers deleted file mode 100644 index 854d084026dd..000000000000 --- a/tools/bpf/Makefile.helpers +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -ifndef allow-override - include ../scripts/Makefile.include - include ../scripts/utilities.mak -else - # Assume Makefile.helpers is being run from bpftool/Documentation - # subdirectory. Go up two more directories to fetch bpf.h header and - # associated script. - UP2DIR := ../../ -endif - -INSTALL ?= install -RM ?= rm -f -RMDIR ?= rmdir --ignore-fail-on-non-empty - -ifeq ($(V),1) - Q = -else - Q = @ -endif - -prefix ?= /usr/local -mandir ?= $(prefix)/man -man7dir = $(mandir)/man7 - -HELPERS_RST = bpf-helpers.rst -MAN7_RST = $(HELPERS_RST) - -_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) -DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) - -helpers: man7 -man7: $(DOC_MAN7) - -RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) - -$(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ - -$(OUTPUT)%.7: $(OUTPUT)%.rst -ifndef RST2MAN_DEP - $(error "rst2man not found, but required to generate man pages") -endif - $(QUIET_GEN)rst2man $< > $@ - -helpers-clean: - $(call QUIET_CLEAN, eBPF_helpers-manpage) - $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) - -helpers-install: helpers - $(call QUIET_INSTALL, eBPF_helpers-manpage) - $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) - -helpers-uninstall: - $(call QUIET_UNINST, eBPF_helpers-manpage) - $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) - $(Q)$(RMDIR) $(DESTDIR)$(man7dir) - -.PHONY: helpers helpers-clean helpers-install helpers-uninstall diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index a07dfc479270..00e560a17baf 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -1198,7 +1198,7 @@ static int cmd_run(char *num) else return CMD_OK; bpf_reset(); - } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts))); + } while (pcap_next_pkt() && (!has_limit || (++i < pkts))); rl_printf("bpf passes:%u fails:%u\n", pass, fail); diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d48e896be50..dfb7254a24e8 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -185,13 +185,13 @@ ldx | OP_LDXB number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } | OP_LDX number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } 
; @@ -472,7 +472,7 @@ static void bpf_assert_max(void) { if (curr_instr >= BPF_MAXINSNS) { fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS); - exit(0); + exit(1); } } @@ -522,7 +522,7 @@ static int bpf_find_insns_offset(const char *label) if (ret == -ENOENT) { fprintf(stderr, "no such label \'%s\'!\n", label); - exit(0); + exit(1); } return ret; @@ -549,9 +549,11 @@ static uint8_t bpf_encode_jt_jf_offset(int off, int i) { int delta = off - i - 1; - if (delta < 0 || delta > 255) - fprintf(stderr, "warning: insn #%d jumps to insn #%d, " + if (delta < 0 || delta > 255) { + fprintf(stderr, "error: insn #%d jumps to insn #%d, " "which is out of range\n", i, off); + exit(1); + } return (uint8_t) delta; } diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 944cb4b7c95d..05ce4446b780 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -3,7 +3,6 @@ /bootstrap/ /bpftool bpftool*.8 -bpf-helpers.* FEATURE-DUMP.bpftool feature libbpf diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index f33cb02de95c..c49487905ceb 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -16,15 +16,12 @@ prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 -# Load targets for building eBPF helpers man page. -include ../../Makefile.helpers - MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) -man: man8 helpers +man: man8 man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -46,16 +43,16 @@ ifndef RST2MAN_DEP endif $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ -clean: helpers-clean +clean: $(call QUIET_CLEAN, Documentation) $(Q)$(RM) $(DOC_MAN8) -install: man helpers-install +install: man $(call QUIET_INSTALL, Documentation-man) $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) -uninstall: helpers-uninstall +uninstall: $(call QUIET_UNINST, Documentation-man) $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) $(Q)$(RMDIR) $(DESTDIR)$(man8dir) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index fe9e7b3a4b50..985610c3f193 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -36,6 +36,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; struct btf_attach_table { @@ -327,6 +328,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, jsonw_end_array(w); break; } + case BTF_KIND_FLOAT: { + if (json_output) + jsonw_uint_field(w, "size", t->size); + else + printf(" size=%u", t->size); + break; + } default: break; } diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 0e9310727281..7ca54d046362 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -596,6 +596,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off)); break; case BTF_KIND_STRUCT: diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 359960a8f1de..40a88df275f9 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -336,6 +336,10 @@ static 
void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ { "CONFIG_BPF_JIT_ALWAYS_ON", }, + /* Kernel BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF", }, + /* Kernel module BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF_MODULES", }, /* cgroups */ { "CONFIG_CGROUPS", }, diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 8608cd68cdd0..6fc3e6f7f40c 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -196,6 +196,9 @@ static const char *print_imm(void *private_data, else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_FUNC) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "subprog[%+d]", insn->imm); else snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "0x%llx", (unsigned long long)full_imm); diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 9d9fb6209be1..3818ec511fd2 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -16,7 +16,10 @@ CFLAGS := -g -Wall # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../vmlinux /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) @@ -66,12 +69,16 @@ $(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) +ifeq ($(VMLINUX_H),) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." 
>&2; \ exit 1;\ fi $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f044..645530ca7e98 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,20 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, 0); + ptr = bpf_task_storage_get(&start, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + *ptr = bpf_ktime_get_ns(); return 0; } @@ -43,7 +48,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +57,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +75,16 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; + /* For pid mismatch, save a bpf_task_storage_get */ + if (!pid || (targ_pid && targ_pid != pid)) + return 0; + /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(&start, &pid); + tsp = bpf_task_storage_get(&start, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ @@ -91,7 +100,7 @@ int handle__sched_switch(u64 *ctx) bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - bpf_map_delete_elem(&start, &pid); + bpf_task_storage_delete(&start, next); return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -93,7 +93,717 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. 
+ * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). 
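As an illustration of the map commands documented above (BPF_MAP_CREATE, BPF_MAP_UPDATE_ELEM, BPF_MAP_LOOKUP_ELEM), here is a minimal user-space sketch that drives them through the raw bpf() syscall. This is an editorial example, not part of the patch, and error handling is kept to a minimum.

    #include <linux/bpf.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
    {
            return syscall(__NR_bpf, cmd, attr, size);
    }

    int main(void)
    {
            union bpf_attr attr;
            uint32_t key = 1;
            uint64_t val = 42, out = 0;
            int map_fd;

            /* BPF_MAP_CREATE: returns a new fd with close-on-exec set */
            memset(&attr, 0, sizeof(attr));
            attr.map_type = BPF_MAP_TYPE_HASH;
            attr.key_size = sizeof(key);
            attr.value_size = sizeof(val);
            attr.max_entries = 16;
            map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
            if (map_fd < 0)
                    return 1;

            /* BPF_MAP_UPDATE_ELEM with BPF_ANY: create or update */
            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key = (uint64_t)(unsigned long)&key;
            attr.value = (uint64_t)(unsigned long)&val;
            attr.flags = BPF_ANY;
            if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
                    return 1;

            /* BPF_MAP_LOOKUP_ELEM: read the value back */
            attr.value = (uint64_t)(unsigned long)&out;
            if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
                    printf("key %u -> %llu\n", (unsigned)key, (unsigned long long)out);

            close(map_fd);
            return 0;
    }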
+ * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. 
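To make the BPF_PROG_TEST_RUN attributes above concrete, a hedged sketch of a single test run against an already-loaded program fd (editorial example, not from the patch; the packet buffers are purely illustrative):

    #include <linux/bpf.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int prog_test_run(int prog_fd)
    {
            unsigned char pkt_in[64] = { 0 };   /* dummy input frame */
            unsigned char pkt_out[256];          /* room for modified data_out */
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.test.prog_fd = prog_fd;
            attr.test.data_in = (uint64_t)(unsigned long)pkt_in;
            attr.test.data_size_in = sizeof(pkt_in);
            attr.test.data_out = (uint64_t)(unsigned long)pkt_out;
            attr.test.data_size_out = sizeof(pkt_out);
            attr.test.repeat = 1;

            if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)))
                    return -1;      /* e.g. ENOTSUPP for unsupported prog types */

            printf("retval=%u duration=%uns out_len=%u\n",
                   attr.test.retval, attr.test.duration, attr.test.data_size_out);
            return 0;
    }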
+ * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
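The *_GET_NEXT_ID, *_GET_FD_BY_ID and BPF_OBJ_GET_INFO_BY_FD commands described in this span compose into the usual enumeration loop. A minimal sketch (editorial, not part of the patch) that lists the programs currently loaded into the kernel, stopping when BPF_PROG_GET_NEXT_ID fails with ENOENT as documented above:

    #include <errno.h>
    #include <linux/bpf.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    void list_progs(void)
    {
            uint32_t id = 0;
            union bpf_attr attr;

            for (;;) {
                    struct bpf_prog_info info = {0};
                    int fd;

                    /* fetch the next program id after 'id' */
                    memset(&attr, 0, sizeof(attr));
                    attr.start_id = id;
                    if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr))) {
                            if (errno != ENOENT)
                                    perror("BPF_PROG_GET_NEXT_ID");
                            break;          /* no id remains */
                    }
                    id = attr.next_id;

                    /* open an fd for that id */
                    memset(&attr, 0, sizeof(attr));
                    attr.prog_id = id;
                    fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
                    if (fd < 0)
                            continue;

                    /* populate struct bpf_prog_info for the fd */
                    memset(&attr, 0, sizeof(attr));
                    attr.info.bpf_fd = fd;
                    attr.info.info_len = sizeof(info);
                    attr.info.info = (uint64_t)(unsigned long)&info;
                    if (!syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
                            printf("id %u: type %u, name %s\n", id, info.type, info.name);
                    close(fd);
            }
    }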
+ * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
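For the queue/stack case of BPF_MAP_LOOKUP_AND_DELETE_ELEM described above, a short editorial sketch (not part of the patch) of a "pop" from a BPF_MAP_TYPE_QUEUE, with the key left zeroed as the text requires:

    #include <linux/bpf.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Pop the top element of a BPF_MAP_TYPE_QUEUE into *out.
     * Returns 0 on success, -1 with errno set otherwise. */
    int queue_pop(int queue_fd, uint64_t *out)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_fd = queue_fd;
            attr.key = 0;                              /* unused for queue/stack maps */
            attr.value = (uint64_t)(unsigned long)out; /* receives the popped element */

            return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
                           &attr, sizeof(attr));
    }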
+ * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. 
The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. 
+ * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, @@ -393,6 +1103,15 @@ enum bpf_link_type { * is struct/union. 
*/ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -720,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 @@ -1765,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -3909,6 +4632,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4826,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4168,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { @@ -5205,7 +5958,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
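The bpf_for_each_map_elem() documentation above fixes the expected callback signature. The following BPF-side sketch (editorial, not from the patch) sums the values of an array map; it assumes a kernel carrying this series plus libbpf's bpf_helpers.h, and the map and context names are illustrative only:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __uint(max_entries, 16);
            __type(key, __u32);
            __type(value, __u64);
    } vals SEC(".maps");

    struct sum_ctx {
            __u64 total;
    };

    /* Called once per element; returning 0 continues the walk, 1 stops it. */
    static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *value,
                          struct sum_ctx *ctx)
    {
            ctx->total += *value;
            return 0;
    }

    SEC("classifier")
    int sum_all(struct __sk_buff *skb)
    {
            struct sum_ctx sctx = { .total = 0 };

            bpf_for_each_map_elem(&vals, sum_elem, &sctx, 0);
            bpf_printk("sum=%llu", sctx.total);
            return 0;
    }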
*/ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..8170f88e8ea6 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d9c10830d749..3aa58f2ac183 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -291,6 +291,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); @@ -338,6 +339,7 @@ static int btf_bswap_type_rest(struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return 0; case BTF_KIND_INT: *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); @@ -578,6 +580,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: size = t->size; goto done; case BTF_KIND_PTR: @@ -621,6 +624,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: return btf_ptr_sz(btf); @@ -1756,6 +1760,47 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
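A short editorial sketch (not part of the patch) of the new btf__add_float() API whose implementation follows below, together with the btf_is_float() accessor added to btf.h; it assumes a libbpf built from this series, where the symbol is exported in LIBBPF_0.4.0:

    #include <stdio.h>
    #include <bpf/btf.h>

    int main(void)
    {
            struct btf *btf = btf__new_empty();
            int id;

            if (!btf)
                    return 1;

            /* append a 4-byte BTF_KIND_FLOAT named "float" */
            id = btf__add_float(btf, "float", 4);
            if (id < 0) {
                    btf__free(btf);
                    return 1;
            }

            const struct btf_type *t = btf__type_by_id(btf, id);
            printf("added FLOAT id=%d size=%u is_float=%d\n",
                   id, t->size, (int)btf_is_float(t));

            btf__free(btf);
            return 0;
    }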
+ */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return -EINVAL; + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return -EINVAL; + + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + /* it's completely legal to append BTF types with type IDs pointing forward to * types that haven't been appended yet, so we only make sure that id looks * sane, we can't guarantee that ID will always be valid @@ -1883,7 +1928,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 * - *byte_sz* - size of the struct, in bytes; * * Struct initially has no fields in it. Fields can be added by - * btf__add_field() right after btf__add_struct() succeeds. + * btf__add_field() right after btf__add_struct() succeeds. * * Returns: * - >0, type ID of newly added BTF type; @@ -3626,6 +3671,7 @@ static int btf_dedup_prep(struct btf_dedup *d) case BTF_KIND_FWD: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: h = btf_hash_common(t); break; case BTF_KIND_INT: @@ -3722,6 +3768,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) break; case BTF_KIND_FWD: + case BTF_KIND_FLOAT: h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { cand_id = (__u32)(long)hash_entry->value; @@ -3983,6 +4030,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return btf_compat_enum(cand_type, canon_type); case BTF_KIND_FWD: + case BTF_KIND_FLOAT: return btf_equal_common(cand_type, canon_type); case BTF_KIND_CONST: @@ -4479,6 +4527,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) switch (btf_kind(t)) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: break; case BTF_KIND_FWD: diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 1237bcd1dd17..029a9cfc8c2d 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -95,6 +95,7 @@ LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); LIBBPF_API int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems); @@ -294,6 +295,11 @@ static inline bool btf_is_datasec(const struct btf_type *t) return btf_kind(t) == BTF_KIND_DATASEC; } +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..5e957fcceee6 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -279,6 +279,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) case BTF_KIND_INT: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_FLOAT: break; case BTF_KIND_VOLATILE: @@ -453,6 
+454,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) switch (btf_kind(t)) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: tstate->order_state = ORDERED; return 0; @@ -1133,6 +1135,7 @@ skip_mod: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: goto done; default: pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", @@ -1247,6 +1250,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, switch (kind) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: btf_dump_emit_mods(d, decls); name = btf_name_of(d, t->name_off); btf_dump_printf(d, "%s", name); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..2f351d3ad3e7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -178,6 +178,8 @@ enum kern_feature_id { FEAT_PROG_BIND_MAP, /* Kernel support for module BTFs */ FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, __FEAT_CNT, }; @@ -188,6 +190,7 @@ enum reloc_type { RELO_CALL, RELO_DATA, RELO_EXTERN, + RELO_SUBPROG_ADDR, }; struct reloc_desc { @@ -574,6 +577,16 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } +static bool is_ldimm64(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -1935,6 +1948,7 @@ static const char *btf_kind_str(const struct btf_type *t) case BTF_KIND_FUNC_PROTO: return "func_proto"; case BTF_KIND_VAR: return "var"; case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; default: return "unknown"; } } @@ -2384,15 +2398,17 @@ static bool btf_needs_sanitization(struct bpf_object *obj) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); - return !has_func || !has_datasec || !has_func_global; + return !has_func || !has_datasec || !has_func_global || !has_float; } static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); struct btf_type *t; int i, j, vlen; @@ -2445,6 +2461,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) } else if (!has_func_global && btf_is_func(t)) { /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. 
"float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); } } } @@ -2974,6 +2997,23 @@ static bool sym_is_extern(const GElf_Sym *sym) GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; } +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) +{ + int bind = GELF_ST_BIND(sym->st_info); + int type = GELF_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; @@ -3395,7 +3435,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } - if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + if (!is_ldimm64(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -3430,6 +3470,23 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. + */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); @@ -3882,6 +3939,18 @@ static int probe_kern_btf_datasec(void) strs, sizeof(strs))); } +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + static int probe_kern_array_mmap(void) { struct bpf_create_map_attr attr = { @@ -4061,6 +4130,9 @@ static struct kern_feature_desc { [FEAT_MODULE_BTF] = { "module BTF support", probe_module_btf, }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) @@ -5566,11 +5638,6 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ } -static bool is_ldimm64(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static int insn_bpf_size_to_bytes(struct bpf_insn *insn) { switch (BPF_SIZE(insn->code)) { @@ -6172,6 +6239,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } relo->processed = true; break; + case RELO_SUBPROG_ADDR: + insn[0].src_reg = BPF_PSEUDO_FUNC; + /* will be handled as a follow up pass */ + break; case RELO_CALL: /* will be handled as a follow up pass */ break; @@ -6358,11 +6429,11 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; - if (!insn_is_subprog_call(insn)) + if (!insn_is_subprog_call(insn) && 
!insn_is_pseudo_func(insn)) continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type != RELO_CALL) { + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", prog->name, insn_idx, relo->type); return -LIBBPF_ERRNO__RELOC; @@ -6374,8 +6445,22 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * call always has imm = -1, but for static functions * relocation is against STT_SECTION and insn->imm * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. + */ + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. */ - sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; } else { /* if subprogram call is to a static function within * the same ELF section, there won't be any relocation diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 1c0fd2dd233a..ec898f464ab9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -350,3 +350,8 @@ LIBBPF_0.3.0 { xsk_setup_xdp_prog; xsk_socket__update_xskmap; } LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; +} LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 969d0ac592ba..343f6eb05637 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -31,6 +31,8 @@ #define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) #define BTF_PARAM_ENC(name, type) (name), (type) #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index 59c779c5790c..cfbcfc063c81 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -5,6 +5,7 @@ #define __LIBBPF_LIBBPF_UTIL_H #include <stdbool.h> +#include <linux/compiler.h> #ifdef __cplusplus extern "C" { @@ -15,29 +16,56 @@ extern "C" { * application that uses libbpf. */ #if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_rmb() asm volatile("" : : : "memory") -# define libbpf_smp_wmb() asm volatile("" : : : "memory") -# define libbpf_smp_mb() \ - asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc") -/* Hinders stores to be observed before older loads. 
*/ -# define libbpf_smp_rwmb() asm volatile("" : : : "memory") +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) #elif defined(__aarch64__) -# define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#elif defined(__arm__) -/* These are only valid for armv7 and above */ -# define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#else -/* Architecture missing native barrier functions. */ -# define libbpf_smp_rmb() __sync_synchronize() -# define libbpf_smp_wmb() __sync_synchronize() -# define libbpf_smp_mb() __sync_synchronize() -# define libbpf_smp_rwmb() __sync_synchronize() +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + ___p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) #endif #ifdef __cplusplus diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index e9f121f5d129..a9fdea87b5cd 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -96,7 +96,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) * this function. Without this optimization it whould have been * free_entries = r->cached_prod - r->cached_cons + r->size. */ - r->cached_cons = *r->consumer + r->size; + r->cached_cons = libbpf_smp_load_acquire(r->consumer); + r->cached_cons += r->size; return r->cached_cons - r->cached_prod; } @@ -106,7 +107,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) __u32 entries = r->cached_prod - r->cached_cons; if (entries == 0) { - r->cached_prod = *r->producer; + r->cached_prod = libbpf_smp_load_acquire(r->producer); entries = r->cached_prod - r->cached_cons; } @@ -129,9 +130,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. 
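 * The release store on the producer pointer pairs with the kernel's
 * load-acquire of the same pointer, so the descriptor writes above cannot
 * be observed after the new producer value (this is the ordering the old
 * explicit libbpf_smp_wmb() barrier used to provide).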
*/ - libbpf_smp_wmb(); - - *prod->producer += nb; + libbpf_smp_store_release(prod->producer, *prod->producer + nb); } static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) @@ -139,11 +138,6 @@ static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __ __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { - /* Make sure we do not speculatively read the data before - * we have received the packet buffers from the ring. - */ - libbpf_smp_rmb(); - *idx = cons->cached_cons; cons->cached_cons += entries; } @@ -161,9 +155,8 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. */ - libbpf_smp_rwmb(); + libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); - *cons->consumer += nb; } static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c0c48fdb9ac1..4866f6a21901 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +bpf-helpers* +bpf-syscall* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 044bfdcf5b74..c3999587bc23 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,6 +68,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_doc_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ @@ -103,6 +104,7 @@ override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) $(Q)$(MAKE) -C bpf_testmod clean + $(Q)$(MAKE) docs-clean endef include ../lib.mk @@ -180,6 +182,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) +$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -200,11 +203,16 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ CC=$(HOSTCC) LD=$(HOSTLD) \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install - $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation - $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ - -C $(BPFTOOLDIR)/Documentation \ - OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ - prefix= DESTDIR=$(SCRATCH_DIR)/ install + +docs: + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ + +docs-clean: + $(Q)$(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ @@ -382,11 +390,12 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ -# only copy extra resources 
if in flavored build +# non-flavored in-srctree builds receive special treatment, in particular, we +# do not need to copy extra resources (see e.g. test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) -ifneq ($2,) +ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ @@ -476,3 +485,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) + +.PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs new file mode 100644 index 000000000000..ccf260021e83 --- /dev/null +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak + +INSTALL ?= install +RM ?= rm -f +RMDIR ?= rmdir --ignore-fail-on-non-empty + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +prefix ?= /usr/local +mandir ?= $(prefix)/man +man2dir = $(mandir)/man2 +man7dir = $(mandir)/man7 + +SYSCALL_RST = bpf-syscall.rst +MAN2_RST = $(SYSCALL_RST) + +HELPERS_RST = bpf-helpers.rst +MAN7_RST = $(HELPERS_RST) + +_DOC_MAN2 = $(patsubst %.rst,%.2,$(MAN2_RST)) +DOC_MAN2 = $(addprefix $(OUTPUT),$(_DOC_MAN2)) + +_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) +DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) + +DOCTARGETS := helpers syscall + +docs: $(DOCTARGETS) +syscall: man2 +helpers: man7 +man2: $(DOC_MAN2) +man7: $(DOC_MAN7) + +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) + +# Configure make rules for the man page bpf-$1.$2. +# $1 - target for scripts/bpf_doc.py +# $2 - man page section to generate the troff file +define DOCS_RULES = +$(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h + $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ + --filename $$< > $$@ + +$(OUTPUT)%.$2: $(OUTPUT)%.rst +ifndef RST2MAN_DEP + $$(error "rst2man not found, but required to generate man pages") +endif + $$(QUIET_GEN)rst2man $$< > $$@ + +docs-clean-$1: + $$(call QUIET_CLEAN, eBPF_$1-manpage) + $(Q)$(RM) $$(DOC_MAN$2) $(OUTPUT)bpf-$1.rst + +docs-install-$1: docs + $$(call QUIET_INSTALL, eBPF_$1-manpage) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$$(man$2dir) + $(Q)$(INSTALL) -m 644 $$(DOC_MAN$2) $(DESTDIR)$$(man$2dir) + +docs-uninstall-$1: + $$(call QUIET_UNINST, eBPF_$1-manpage) + $(Q)$(RM) $$(addprefix $(DESTDIR)$$(man$2dir)/,$$(_DOC_MAN$2)) + $(Q)$(RMDIR) $(DESTDIR)$$(man$2dir) + +.PHONY: $1 docs-clean-$1 docs-install-$1 docs-uninstall-$1 +endef + +# Create the make targets to generate manual pages by name and section +$(eval $(call DOCS_RULES,helpers,7)) +$(eval $(call DOCS_RULES,syscall,2)) + +docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) +docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) +docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) + +.PHONY: docs docs-clean docs-install docs-uninstall man2 man7 diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index fd148b8410fa..3464161c8eea 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -111,6 +111,45 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk. 
__ https://reviews.llvm.org/D78466 +bpf_verif_scale/loop6.o test failure with Clang 12 +================================================== + +With Clang 12, the following bpf_verif_scale test failed: + * ``bpf_verif_scale/loop6.o`` + +The verifier output looks like + +.. code-block:: c + + R1 type=ctx expected=fp + The sequence of 8193 jumps is too complex. + +The reason is that the compiler generates the following code + +.. code-block:: c + + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 14: 16 05 40 00 00 00 00 00 if w5 == 0 goto +64 <LBB0_6> + 15: bc 51 00 00 00 00 00 00 w1 = w5 + 16: 04 01 00 00 ff ff ff ff w1 += -1 + 17: 67 05 00 00 20 00 00 00 r5 <<= 32 + 18: 77 05 00 00 20 00 00 00 r5 >>= 32 + 19: a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 <LBB0_4> + 20: b7 05 00 00 06 00 00 00 r5 = 6 + 00000000000000a8 <LBB0_4>: + 21: b7 02 00 00 00 00 00 00 r2 = 0 + 22: b7 01 00 00 00 00 00 00 r1 = 0 + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 23: 7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1 + 24: 7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5 + +Note that insn #15 has w1 = w5 and w1 is refined later but +r5(w5) is eventually saved on the stack at insn #24 for later use. +This causes a later verifier failure. The bug has been `fixed`__ in +Clang 13. + +__ https://reviews.llvm.org/D97479 + BPF CO-RE-based tests and Clang version ======================================= @@ -131,3 +170,12 @@ failures: .. _2: https://reviews.llvm.org/D85174 .. _3: https://reviews.llvm.org/D83878 .. _4: https://reviews.llvm.org/D83242 + +Floating-point tests and Clang version +====================================== + +Certain selftests, e.g. core_reloc, require support for the floating-point +types, which was introduced in `Clang 13`__. Older Clang versions will +either crash when compiling these tests or generate incorrect BTF. + +__ https://reviews.llvm.org/D83289 diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index 48f90490f922..b692e6ead9b5 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -23,6 +23,7 @@ static const char * const btf_kind_str_mapping[] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_kind_str(__u16 kind) @@ -173,6 +174,9 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) } break; } + case BTF_KIND_FLOAT: + fprintf(out, " size=%u", t->size); + break; default: break; } diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a0ee87c8e1ea..9dc4e3dfbcf3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,6 +2,44 @@ #include <test_progs.h> #include "test_attach_probe.skel.h" +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +static ssize_t get_offset(ssize_t addr, ssize_t base) +{ + u32 *insn = (u32 *) addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence.
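+ * Skipping it means the uprobe offset computed below lands on the
+ * local entry point, i.e. on an instruction that is executed no matter
+ * which entry point the call comes in through.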
+ * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return addr - base; +} +#else +#define get_offset(addr, base) (addr - base) +#endif + ssize_t get_base_addr() { size_t start, offset; char buf[256]; @@ -36,7 +74,7 @@ void test_attach_probe(void) if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = (size_t)&get_base_addr - base_addr; + uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e698ee6bb6c2..3d002c245d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -76,6 +76,7 @@ void test_bpf_verif_scale(void) { "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "loop4.o", BPF_PROG_TYPE_SCHED_CLS }, { "loop5.o", BPF_PROG_TYPE_SCHED_CLS }, + { "loop6.o", BPF_PROG_TYPE_KPROBE }, /* partial unroll. 19k insn in a loop. * Total program size 20.8k insn. diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 6a7ee7420701..0457ae32b270 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1903,7 +1903,7 @@ static struct btf_raw_test raw_tests[] = { .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x10000000, 4), + BTF_TYPE_ENC(0, 0x20000000, 4), BTF_END_RAW, }, .str_sec = "", @@ -3531,6 +3531,136 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "float test #1, well-formed", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [2] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 8), /* [4] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 12), /* [5] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 16), /* [6] */ + BTF_STRUCT_ENC(NAME_TBD, 5, 48), /* [7] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_MEMBER_ENC(NAME_TBD, 3, 32), + BTF_MEMBER_ENC(NAME_TBD, 4, 64), + BTF_MEMBER_ENC(NAME_TBD, 5, 128), + BTF_MEMBER_ENC(NAME_TBD, 6, 256), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0_Float16\0float\0double\0_Float80\0long_double" + "\0floats\0a\0b\0c\0d\0e"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 48, + .key_type_id = 1, + .value_type_id = 7, + .max_entries = 1, +}, +{ + .descr = "float test #2, invalid vlen", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 1), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "vlen != 0", +}, +{ + .descr = "float test #3, invalid kind_flag", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, 
BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 1, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid btf_info kind_flag", +}, +{ + .descr = "float test #4, member does not fit", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 2), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member exceeds struct_size", +}, +{ + .descr = "float test #5, member is not properly aligned", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 8), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member is not properly aligned", +}, +{ + .descr = "float test #6, invalid size", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 6), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 6, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid type_size", +}, + }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -6281,11 +6411,12 @@ const struct btf_dedup_test dedup_tests[] = { /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [3] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [3] */ BTF_MEMBER_ENC(NAME_NTH(3), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(4), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(5), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(6), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ @@ -6296,39 +6427,43 @@ const struct btf_dedup_test dedup_tests[] = { /* full copy of the above */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [7] */ BTF_TYPE_ARRAY_ENC(7, 7, 16), /* [8] */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [9] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [9] */ BTF_MEMBER_ENC(NAME_NTH(3), 10, 0), BTF_MEMBER_ENC(NAME_NTH(4), 11, 64), BTF_MEMBER_ENC(NAME_NTH(5), 8, 128), BTF_MEMBER_ENC(NAME_NTH(6), 7, 640), + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), BTF_PTR_ENC(9), /* [10] */ BTF_PTR_ENC(12), /* [11] */ BTF_CONST_ENC(7), /* [12] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [13] */ BTF_END_RAW, }, - BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"), + BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0float\0d"), }, .expect = { .raw_types = { /* int */ - BTF_TYPE_INT_ENC(NAME_NTH(4), 
BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(6), 4, 84), /* [3] */ - BTF_MEMBER_ENC(NAME_NTH(5), 4, 0), /* struct s *next; */ + BTF_STRUCT_ENC(NAME_NTH(8), 5, 88), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(7), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(1), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(2), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(3), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(4), 7, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ BTF_PTR_ENC(6), /* [5] */ /* const -> [1] int */ BTF_CONST_ENC(1), /* [6] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [7] */ BTF_END_RAW, }, - BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"), + BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"), }, .opts = { .dont_resolve_fwds = false, @@ -6449,9 +6584,10 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .expect = { .raw_types = { @@ -6474,16 +6610,17 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .opts = { .dont_resolve_fwds = false, }, }, { - .descr = "dedup: no int duplicates", + .descr = "dedup: no int/float duplicates", .input = { .raw_types = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8), @@ -6498,9 +6635,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .expect = { .raw_types = { @@ -6516,9 +6659,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .opts = { .dont_resolve_fwds = false, @@ -6630,6 +6779,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 06eb956ff7bb..d94dcead72e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ 
b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -266,6 +266,7 @@ static int duration = 0; .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \ .ptr_sz = 8, /* always 8-byte pointer for BPF */ \ .enum_sz = sizeof(((type *)0)->enum_field), \ + .float_sz = sizeof(((type *)0)->float_field), \ } #define SIZE_CASE(name) { \ diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c new file mode 100644 index 000000000000..68eb12a287d4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "for_each_hash_map_elem.skel.h" +#include "for_each_array_map_elem.skel.h" + +static unsigned int duration; + +static void test_hash_map(void) +{ + int i, err, hashmap_fd, max_entries, percpu_map_fd; + struct for_each_hash_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u32 key, num_cpus, retval; + __u64 val; + + skel = for_each_hash_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load")) + return; + + hashmap_fd = bpf_map__fd(skel->maps.hashmap); + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + err = bpf_map_update_elem(hashmap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 1; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output"); + ASSERT_EQ(skel->bss->hashmap_elems, max_entries, "hashmap_elems"); + + key = 1; + err = bpf_map_lookup_elem(hashmap_fd, &key, &val); + ASSERT_ERR(err, "hashmap_lookup"); + + ASSERT_EQ(skel->bss->percpu_called, 1, "percpu_called"); + ASSERT_LT(skel->bss->cpu, num_cpus, "num_cpus"); + ASSERT_EQ(skel->bss->percpu_map_elems, 1, "percpu_map_elems"); + ASSERT_EQ(skel->bss->percpu_key, 1, "percpu_key"); + ASSERT_EQ(skel->bss->percpu_val, skel->bss->cpu + 1, "percpu_val"); + ASSERT_EQ(skel->bss->percpu_output, 100, "percpu_output"); +out: + free(percpu_valbuf); + for_each_hash_map_elem__destroy(skel); +} + +static void test_array_map(void) +{ + __u32 key, num_cpus, max_entries, retval; + int i, arraymap_fd, percpu_map_fd, err; + struct for_each_array_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u64 val, expected_total; + + skel = for_each_array_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load")) + return; + + arraymap_fd = bpf_map__fd(skel->maps.arraymap); + expected_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + /* skip the last iteration for expected total */ + if (i != max_entries - 1) + expected_total += val; + err = bpf_map_update_elem(arraymap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } 
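+	/* check_array_elem() in the BPF program stops the walk after key 1,
+	 * so the value stored at the last index never reaches arraymap_output;
+	 * expected_total above skips it for the same reason.
+	 */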
+ + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 0; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output"); + ASSERT_EQ(skel->bss->cpu + 1, skel->bss->percpu_val, "percpu_val"); + +out: + free(percpu_valbuf); + for_each_array_map_elem__destroy(skel); +} + +void test_for_each(void) +{ + if (test__start_subtest("hash_map")) + test_hash_map(); + if (test__start_subtest("array_map")) + test_array_map(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 935a294f049a..131d7f7eeb42 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -2,12 +2,31 @@ #include <test_progs.h> #include <network_helpers.h> -void test_prog_run_xattr(void) +#include "test_pkt_access.skel.h" + +static const __u32 duration; + +static void check_run_cnt(int prog_fd, __u64 run_cnt) { - const char *file = "./test_pkt_access.o"; - struct bpf_object *obj; - char buf[10]; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd)) + return; + + CHECK(run_cnt != info.run_cnt, "run_cnt", + "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt); +} + +void test_prog_run_xattr(void) +{ + struct test_pkt_access *skel; + int err, stats_fd = -1; + char buf[10] = {}; + __u64 run_cnt = 0; + struct bpf_prog_test_run_attr tattr = { .repeat = 1, .data_in = &pkt_v4, @@ -16,12 +35,15 @@ void test_prog_run_xattr(void) .data_size_out = 5, }; - err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, - &tattr.prog_fd); - if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno)) return; - memset(buf, 0, sizeof(buf)); + skel = test_pkt_access__open_and_load(); + if (CHECK_ATTR(!skel, "open_and_load", "failed\n")) + goto cleanup; + + tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access); err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run", @@ -34,8 +56,12 @@ void test_prog_run_xattr(void) CHECK_ATTR(buf[5] != 0, "overflow", "BPF_PROG_TEST_RUN ignored size hint\n"); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + tattr.data_out = NULL; tattr.data_size_out = 0; + tattr.repeat = 2; errno = 0; err = bpf_prog_test_run_xattr(&tattr); @@ -46,5 +72,12 @@ void test_prog_run_xattr(void) err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err); - bpf_object__close(obj); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + +cleanup: + if (skel) + test_pkt_access__destroy(skel); + if (stats_fd != -1) + close(stats_fd); } diff --git 
a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9ff0412e1fd3..45c82db3c58c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -241,6 +241,48 @@ fail: return -1; } +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + "getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port, + const char *remote_ip, __u16 remote_port) +{ + void *local, *remote; + int err; + + memset(ctx, 0, sizeof(*ctx)); + ctx->local_port = local_port; + ctx->remote_port = htons(remote_port); + + if (is_ipv6(local_ip)) { + ctx->family = AF_INET6; + local = &ctx->local_ip6[0]; + remote = &ctx->remote_ip6[0]; + } else { + ctx->family = AF_INET; + local = &ctx->local_ip4; + remote = &ctx->remote_ip4; + } + + err = inet_pton(ctx->family, local_ip, local); + if (CHECK(err != 1, "inet_pton", "local_ip failed\n")) + return 1; + + err = inet_pton(ctx->family, remote_ip, remote); + if (CHECK(err != 1, "inet_pton", "remote_ip failed\n")) + return 1; + + return 0; +} + static int send_byte(int fd) { ssize_t n; @@ -1009,18 +1051,27 @@ static void test_drop_on_reuseport(struct test_sk_lookup *skel) static void run_sk_assign(struct test_sk_lookup *skel, struct bpf_program *lookup_prog, - const char *listen_ip, const char *connect_ip) + const char *remote_ip, const char *local_ip) { - int client_fd, peer_fd, server_fds[MAX_SERVERS] = { -1 }; - struct bpf_link *lookup_link; + int server_fds[MAX_SERVERS] = { -1 }; + struct bpf_sk_lookup ctx; + __u64 server_cookie; int i, err; - lookup_link = attach_lookup_prog(lookup_prog); - if (!lookup_link) + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = &ctx, + .ctx_size_in = sizeof(ctx), + .ctx_out = &ctx, + .ctx_size_out = sizeof(ctx), + ); + + if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT)) return; + ctx.protocol = IPPROTO_TCP; + for (i = 0; i < ARRAY_SIZE(server_fds); i++) { - server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL); + server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL); if (server_fds[i] < 0) goto close_servers; @@ -1030,23 +1081,25 @@ static void run_sk_assign(struct test_sk_lookup *skel, goto close_servers; } - client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT); - if (client_fd < 0) + server_cookie = socket_cookie(server_fds[SERVER_B]); + if (!server_cookie) + return; + + err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts); + if (CHECK(err, "test_run", "failed with error %d\n", errno)) + goto close_servers; + + if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n")) goto close_servers; - peer_fd = accept(server_fds[SERVER_B], NULL, NULL); - if (CHECK(peer_fd < 0, "accept", "failed\n")) - goto close_client; + CHECK(ctx.cookie != server_cookie, "ctx.cookie", + "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie); - close(peer_fd); -close_client: - close(client_fd); close_servers: for (i = 0; i < ARRAY_SIZE(server_fds); i++) { if (server_fds[i] != -1) close(server_fds[i]); } - bpf_link__destroy(lookup_link); } static void run_sk_assign_v4(struct test_sk_lookup *skel, diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 
d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c new file mode 100644 index 000000000000..035c263aab1b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include <unistd.h> +#include <sys/syscall.h> /* For SYS_xxx definitions */ +#include <sys/types.h> +#include <test_progs.h> +#include "task_local_storage.skel.h" +#include "task_local_storage_exit_creds.skel.h" +#include "task_ls_recursion.skel.h" + +static void test_sys_enter_exit(void) +{ + struct task_local_storage *skel; + int err; + + skel = task_local_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + skel->bss->target_pid = syscall(SYS_gettid); + + err = task_local_storage__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + syscall(SYS_gettid); + syscall(SYS_gettid); + + /* 3x syscalls: 1x attach and 2x gettid */ + ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt"); + ASSERT_EQ(skel->bss->exit_cnt, 3, "exit_cnt"); + ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); +out: + task_local_storage__destroy(skel); +} + +static void test_exit_creds(void) +{ + struct task_local_storage_exit_creds *skel; + int err; + + skel = task_local_storage_exit_creds__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_local_storage_exit_creds__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger at least one exit_creds() */ + if (CHECK_FAIL(system("ls > /dev/null"))) + goto out; + + /* sync rcu to make sure exit_creds() is called for "ls" */ + kern_sync_rcu(); + ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count"); + ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count"); +out: + task_local_storage_exit_creds__destroy(skel); +} + +static void test_recursion(void) +{ + struct task_ls_recursion *skel; + int err; + + skel = task_ls_recursion__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_ls_recursion__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger sys_enter, make sure it does not cause deadlock */ + syscall(SYS_gettid); + +out: + 
task_ls_recursion__destroy(skel); +} + +void test_task_local_storage(void) +{ + if (test__start_subtest("sys_enter_exit")) + test_sys_enter_exit(); + if (test__start_subtest("exit_creds")) + test_exit_creds(); + if (test__start_subtest("recursion")) + test_recursion(); +} diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..12b40dc81e14 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -205,6 +205,12 @@ struct struct_with_embedded_stuff { int t[11]; }; +struct float_struct { + float f; + const double *d; + volatile long double *ld; +}; + struct root_struct { enum e1 _1; enum e2 _2; @@ -219,6 +225,7 @@ struct root_struct { union_fwd_t *_12; union_fwd_ptr_t _13; struct struct_with_embedded_stuff _14; + struct float_struct _15; }; /* ------ END-EXPECTED-OUTPUT ------ */ diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9a2850850121..9982eb969048 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -807,6 +807,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -816,6 +817,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; struct core_reloc_size___diff_sz { @@ -825,6 +827,7 @@ struct core_reloc_size___diff_sz { char arr_field[10]; void *ptr_field; enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field; + double float_field; }; /* Error case of two candidates with the fields (int_field) at the same @@ -839,6 +842,7 @@ struct core_reloc_size___err_ambiguous1 { int arr_field[4]; void *ptr_field; enum { VALUE___1 = 123 } enum_field; + float float_field; }; struct core_reloc_size___err_ambiguous2 { @@ -850,6 +854,7 @@ struct core_reloc_size___err_ambiguous2 { int arr_field[4]; void *ptr_field; enum { VALUE___2 = 123 } enum_field; + float float_field; }; /* diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c new file mode 100644 index 000000000000..75e8e1069fe7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + int output; +}; + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + if (*key == 1) + return 1; /* stop the iteration */ + return 0; +} + +__u32 cpu = 0; +__u64 percpu_val = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + cpu = bpf_get_smp_processor_id(); + percpu_val = *val; + return 0; +} + +u32 arraymap_output = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + 
bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0); + arraymap_output = data.output; + + bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c new file mode 100644 index 000000000000..913dd91aafff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + struct __sk_buff *ctx; + int input; + int output; +}; + +static __u64 +check_hash_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + struct __sk_buff *skb = data->ctx; + __u32 k; + __u64 v; + + if (skb) { + k = *key; + v = *val; + if (skb->len == 10000 && k == 10 && v == 10) + data->output = 3; /* impossible path */ + else + data->output = 4; + } else { + data->output = data->input; + bpf_map_delete_elem(map, key); + } + + return 0; +} + +__u32 cpu = 0; +__u32 percpu_called = 0; +__u32 percpu_key = 0; +__u64 percpu_val = 0; +int percpu_output = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *unused) +{ + struct callback_ctx data; + + percpu_called++; + cpu = bpf_get_smp_processor_id(); + percpu_key = *key; + percpu_val = *val; + + data.ctx = 0; + data.input = 100; + data.output = 0; + bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + percpu_output = data.output; + + return 0; +} + +int hashmap_output = 0; +int hashmap_elems = 0; +int percpu_map_elems = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.ctx = skb; + data.input = 10; + data.output = 0; + hashmap_elems = bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + hashmap_output = data.output; + + percpu_map_elems = bpf_for_each_map_elem(&percpu_map, check_percpu_elem, + (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c new file mode 100644 index 000000000000..38de0331e6b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ptrace.h> +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +/* typically virtio scsi has max SGs of 6 */ +#define VIRTIO_MAX_SGS 6 + +/* Verifier will fail with SG_MAX = 128. The failure can be + * workarounded with a smaller SG_MAX, e.g. 10. 
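+ * (Presumably the partially unrolled nested loops exceed the verifier's
+ * complexity limit at the larger bound; a separate Clang 12 code
+ * generation issue with this test is described in README.rst.)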
+ */ +#define WORKAROUND +#ifdef WORKAROUND +#define SG_MAX 10 +#else +/* typically virtio blk has max SEG of 128 */ +#define SG_MAX 128 +#endif + +#define SG_CHAIN 0x01UL +#define SG_END 0x02UL + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; +}; + +#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) +#define sg_is_last(sg) ((sg)->page_link & SG_END) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) + +static inline struct scatterlist *__sg_next(struct scatterlist *sgp) +{ + struct scatterlist sg; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_last(&sg)) + return NULL; + + sgp++; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_chain(&sg)) + sgp = sg_chain_ptr(&sg); + + return sgp; +} + +static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) +{ + struct scatterlist *sgp; + + bpf_probe_read_kernel(&sgp, sizeof(sgp), sgs + i); + return sgp; +} + +int config = 0; +int result = 0; + +SEC("kprobe/virtqueue_add_sgs") +int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) +{ + struct scatterlist *sgp = NULL; + __u64 length1 = 0, length2 = 0; + unsigned int i, n, len; + + if (config != 0) + return 0; + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length1 += len; + n++; + } + } + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < in_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length2 += len; + n++; + } + } + + config = 1; + result = length2 - length1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c new file mode 100644 index 000000000000..80a0a20db88d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} enter_id SEC(".maps"); + +#define MAGIC_VALUE 0xabcd1234 + +pid_t target_pid = 0; +int mismatch_cnt = 0; +int enter_cnt = 0; +int exit_cnt = 0; + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&enter_cnt, 1); + *ptr = MAGIC_VALUE + enter_cnt; + + return 0; +} + +SEC("tp_btf/sys_exit") +int BPF_PROG(on_exit, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&exit_cnt, 1); + if (*ptr != MAGIC_VALUE + exit_cnt) + __sync_fetch_and_add(&mismatch_cnt, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c 
b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c new file mode 100644 index 000000000000..81758c0aef99 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} task_storage SEC(".maps"); + +int valid_ptr_count = 0; +int null_ptr_count = 0; + +SEC("fentry/exit_creds") +int BPF_PROG(trace_exit_creds, struct task_struct *task) +{ + __u64 *ptr; + + ptr = bpf_task_storage_get(&task_storage, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + __sync_fetch_and_add(&valid_ptr_count, 1); + else + __sync_fetch_and_add(&null_ptr_count, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c new file mode 100644 index 000000000000..564583dca7c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_a SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_b SEC(".maps"); + +SEC("fentry/bpf_local_storage_lookup") +int BPF_PROG(on_lookup) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + bpf_task_storage_delete(&map_a, task); + bpf_task_storage_delete(&map_b, task); + return 0; +} + +SEC("fentry/bpf_local_storage_update") +int BPF_PROG(on_update) +{ + struct task_struct *task = bpf_get_current_task_btf(); + long *ptr; + + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 200; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 100; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index d7fb6cfc7891..7b2d576aeea1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -21,6 +21,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -30,6 +31,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; SEC("raw_tracepoint/sys_enter") @@ -45,6 +47,7 @@ int test_core_size(void *ctx) out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]); out->ptr_sz = bpf_core_field_size(in->ptr_field); out->enum_sz = bpf_core_field_size(in->enum_field); + 
out->float_sz = bpf_core_field_size(in->float_field); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c index 1032b292af5b..ac6f7f205e25 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c @@ -64,6 +64,10 @@ static const int PROG_DONE = 1; static const __u32 KEY_SERVER_A = SERVER_A; static const __u32 KEY_SERVER_B = SERVER_B; +static const __u16 SRC_PORT = bpf_htons(8008); +static const __u32 SRC_IP4 = IP4(127, 0, 0, 2); +static const __u32 SRC_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000002); + static const __u16 DST_PORT = 7007; /* Host byte order */ static const __u32 DST_IP4 = IP4(127, 0, 0, 1); static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001); @@ -398,11 +402,12 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) if (LSW(ctx->protocol, 0) != IPPROTO_TCP) return SK_DROP; - /* Narrow loads from remote_port field. Expect non-0 value. */ - if (LSB(ctx->remote_port, 0) == 0 && LSB(ctx->remote_port, 1) == 0 && - LSB(ctx->remote_port, 2) == 0 && LSB(ctx->remote_port, 3) == 0) + /* Narrow loads from remote_port field. Expect SRC_PORT. */ + if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) || + LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) || + LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0) return SK_DROP; - if (LSW(ctx->remote_port, 0) == 0) + if (LSW(ctx->remote_port, 0) != SRC_PORT) return SK_DROP; /* Narrow loads from local_port field. Expect DST_PORT. */ @@ -415,11 +420,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv4 fields */ if (v4) { - /* Expect non-0.0.0.0 in remote_ip4 */ - if (LSB(ctx->remote_ip4, 0) == 0 && LSB(ctx->remote_ip4, 1) == 0 && - LSB(ctx->remote_ip4, 2) == 0 && LSB(ctx->remote_ip4, 3) == 0) + /* Expect SRC_IP4 in remote_ip4 */ + if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) || + LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) || + LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) || + LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip4, 0) == 0 && LSW(ctx->remote_ip4, 1) == 0) + if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) || + LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP4 in local_ip4 */ @@ -448,20 +456,32 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv6 fields */ if (!v4) { - /* Expect non-:: IP in remote_ip6 */ - if (LSB(ctx->remote_ip6[0], 0) == 0 && LSB(ctx->remote_ip6[0], 1) == 0 && - LSB(ctx->remote_ip6[0], 2) == 0 && LSB(ctx->remote_ip6[0], 3) == 0 && - LSB(ctx->remote_ip6[1], 0) == 0 && LSB(ctx->remote_ip6[1], 1) == 0 && - LSB(ctx->remote_ip6[1], 2) == 0 && LSB(ctx->remote_ip6[1], 3) == 0 && - LSB(ctx->remote_ip6[2], 0) == 0 && LSB(ctx->remote_ip6[2], 1) == 0 && - LSB(ctx->remote_ip6[2], 2) == 0 && LSB(ctx->remote_ip6[2], 3) == 0 && - LSB(ctx->remote_ip6[3], 0) == 0 && LSB(ctx->remote_ip6[3], 1) == 0 && - LSB(ctx->remote_ip6[3], 2) == 0 && LSB(ctx->remote_ip6[3], 3) == 0) + /* Expect SRC_IP6 in remote_ip6 */ + if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) || + LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) || + LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) || + LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) || + LSB(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xff) || + LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) || + 
LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) || + LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) || + LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) || + LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) || + LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) || + LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) || + LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) || + LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) || + LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) || + LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip6[0], 0) == 0 && LSW(ctx->remote_ip6[0], 1) == 0 && - LSW(ctx->remote_ip6[1], 0) == 0 && LSW(ctx->remote_ip6[1], 1) == 0 && - LSW(ctx->remote_ip6[2], 0) == 0 && LSW(ctx->remote_ip6[2], 1) == 0 && - LSW(ctx->remote_ip6[3], 0) == 0 && LSW(ctx->remote_ip6[3], 1) == 0) + if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP6 in local_ip6 */ if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) || diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 @@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 37bce7a7c394..84cd63259554 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -24,14 +24,29 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) + #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 #define ETH_OVER_UDP_PORT 7777 +#define VXLAN_UDP_PORT 8472 + +#define EXTPROTO_VXLAN 0x1 + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) +#define VXLAN_FLAGS 0x8 +#define VXLAN_VNI 1 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +} __attribute__((packed)); + struct gre_hdr { __be16 flags; __be16 protocol; @@ -45,13 +60,13 @@ union l4hdr { struct v4hdr { struct iphdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... 
*/ } __attribute__((packed)); struct v6hdr { struct ipv6hdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -69,14 +84,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, - __u16 l2_proto) +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; int tcp_off; __u64 flags; @@ -141,7 +157,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -171,14 +191,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; + break; } olen += l2_len; @@ -214,14 +246,21 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto) { + return __encap_ipv4(skb, encap_proto, l2_proto, 0); +} + +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) +{ __u16 udp_dst = UDP_PORT; struct ipv6hdr iph_inner; struct v6hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; __u16 tot_len; __u64 flags; @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + - sizeof(h_outer.l4hdr.udp); + sizeof(h_outer.l4hdr.udp) + l2_len; h_outer.l4hdr.udp.check = 0; h_outer.l4hdr.udp.len = bpf_htons(tot_len); break; @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch 
(l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; break; } @@ -309,6 +363,12 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) +{ + return __encap_ipv6(skb, encap_proto, l2_proto, 0); +} + SEC("encap_ipip_none") int __encap_ipip_none(struct __sk_buff *skb) { @@ -372,6 +432,17 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_vxlan_eth") +int __encap_vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return __encap_ipv4(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + SEC("encap_sit_none") int __encap_sit_none(struct __sk_buff *skb) { @@ -444,6 +515,17 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ip6vxlan_eth") +int __encap_ip6vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return __encap_ipv6(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { char buf[sizeof(struct v6hdr)]; @@ -479,6 +561,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) case ETH_OVER_UDP_PORT: olen += ETH_HLEN; break; + case VXLAN_UDP_PORT: + olen += ETH_HLEN + sizeof(struct vxlanhdr); + break; } break; default: diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 2db3c60e1e61..ac349a5cea7e 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,23 +85,6 @@ make_with_tmpdir() { echo } -make_doc_and_clean() { - echo -e "\$PWD: $PWD" - echo -e "command: make -s $* doc >/dev/null" - RST2MAN_OPTS="--exit-status=1" make $J -s $* doc - if [ $? -ne 0 ] ; then - ERROR=1 - printf "FAILURE: Errors or warnings when building documentation\n" - fi - ( - if [ $# -ge 1 ] ; then - cd ${@: -1} - fi - make -s doc-clean - ) - echo -} - echo "Trying to build bpftool" echo -e "... 
through kbuild\n" @@ -162,7 +145,3 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O - -echo -e "Checking documentation build\n" -# From tools/bpf/bpftool -make_doc_and_clean diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 2023725f1962..e2394eea4b7f 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -66,4 +66,7 @@ #define BTF_FUNC_ENC(name, func_proto) \ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) + #endif /* _TEST_BTF_H */ diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh new file mode 100755 index 000000000000..7eb940a7b2eb --- /dev/null +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +# Assume script is located under tools/testing/selftests/bpf/. We want to start +# build attempts from the top of kernel repository. +SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) +KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +cd $KDIR_ROOT_DIR + +for tgt in docs docs-clean; do + make -s -C $PWD/$SCRIPT_REL_DIR $tgt; +done diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f7c2fd89d01a..e87c8546230e 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -152,6 +152,17 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act < ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld >= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 427ca00a3217..eefd445b96fc 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -732,7 +732,7 @@ static int sendmsg_test(struct sockmap_options *opt) * socket is not a valid test. So in this case lets not * enable kTLS but still run the test. 
*/ - if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) { + if (!txmsg_redir || txmsg_ingress) { err = sockmap_init_ktls(opt->verbose, rx_fd); if (err) return err; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7c76b841b17b..c9dde9b9d987 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -44,8 +44,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 sleep 1 @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then echo "sit" $0 ipv6 sit none 100 + echo "ip4 vxlan" + $0 ipv4 vxlan eth 2000 + + echo "ip6 vxlan" + $0 ipv6 ip6vxlan eth 2000 + for mac in none mpls eth ; do echo "ip gre $mac" $0 ipv4 gre $mac 100 @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $dport" elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ttype=$gretaptype +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then + ttype="vxlan" + targs="id 1 dstport 8472 udp6zerocsumrx" else ttype=$tuntype targs="" @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then # No support for TEB fou tunnel; expect failure. expect_tun_fail=1 -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then # Share ethernet address between tunnel/veth2 so L2 decap works. ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ awk '/ether/ { print $2 }') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 58b5a349d3ba..1512092e1e68 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -105,7 +105,7 @@ struct bpf_test { enum bpf_prog_type prog_type; uint8_t flags; void (*fill_helper)(struct bpf_test *self); - uint8_t runs; + int runs; #define bpf_testdata_struct_t \ struct { \ uint32_t retval, retval_unpriv; \ @@ -1165,7 +1165,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, run_errs = 0; run_successes = 0; - if (!alignment_prevented_execution && fd_prog >= 0) { + if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { uint32_t expected_val; int i; diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 88a7483eaae4..56d4474e2c83 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -71,13 +71,21 @@ # # Run (full output without color-coding): # sudo ./test_xsk.sh +# +# Run with verbose output: +# sudo ./test_xsk.sh -v +# +# Run and dump packet contents: +# sudo ./test_xsk.sh -D . 
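Annotation on the route MTU clamp just above (a back-of-envelope sketch using standard header sizes, not taken from the patch): the new vxlan cases prepend an outer IP header, a UDP header, a VXLAN header and an inner Ethernet header, which is presumably why the clamp tightens from 1458/1438 to 1450/1430.

/* Assumed header sizes for the worst-case (vxlan) encapsulation. */
#define OUTER_IPV4_HLEN	20
#define OUTER_IPV6_HLEN	40
#define OUTER_UDP_HLEN	8
#define VXLAN_HLEN	8	/* struct vxlanhdr added in test_tc_tunnel.c */
#define INNER_ETH_HLEN	14

/* IPv4: 1500 - (20 + 8 + 8 + 14) = 1450  ->  "mtu 1450" above */
/* IPv6: 1500 - (40 + 8 + 8 + 14) = 1430  ->  "mtu 1430" above */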
xsk_prereqs.sh -while getopts c flag +while getopts "cvD" flag do case "${flag}" in c) colorconsole=1;; + v) verbose=1;; + D) dump_pkts=1;; esac done @@ -95,13 +103,17 @@ NS1=af_xdp${VETH1_POSTFIX} MTU=1500 setup_vethPairs() { - echo "setting up ${VETH0}: namespace: ${NS0}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH0}: namespace: ${NS0}" + fi ip netns add ${NS1} ip link add ${VETH0} type veth peer name ${VETH1} if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi - echo "setting up ${VETH1}: namespace: ${NS1}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH1}: namespace: ${NS1}" + fi ip link set ${VETH1} netns ${NS1} ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} @@ -125,121 +137,24 @@ echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} validate_veth_spec_file -echo "Spec file created: ${SPECFILE}" - -test_status $retval "${TEST_NAME}" - -## START TESTS - -statusList=() - -### TEST 1 -TEST_NAME="XSK KSELFTEST FRAMEWORK" - -echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -retval=$? -if [ $retval -eq 0 ]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" - vethXDPnative ${VETH0} ${VETH1} ${NS1} +if [[ $verbose -eq 1 ]]; then + echo "Spec file created: ${SPECFILE}" + VERBOSE_ARG="-v" fi -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 2 -TEST_NAME="SKB NOPOLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 3 -TEST_NAME="SKB POLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 4 -TEST_NAME="DRV NOPOLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 5 -TEST_NAME="DRV POLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 6 -TEST_NAME="SKB SOCKET TEARDOWN" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 7 -TEST_NAME="DRV SOCKET TEARDOWN" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-T") -execxdpxceiver params +if [[ $dump_pkts -eq 1 ]]; then + DUMP_PKTS_ARG="-D" +fi -retval=$? test_status $retval "${TEST_NAME}" -statusList+=($retval) -### TEST 8 -TEST_NAME="SKB BIDIRECTIONAL SOCKETS" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-B") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) +## START TESTS -### TEST 9 -TEST_NAME="DRV BIDIRECTIONAL SOCKETS" +statusList=() -vethXDPnative ${VETH0} ${VETH1} ${NS1} +TEST_NAME="XSK KSELFTESTS" -params=("-N" "-B") -execxdpxceiver params +execxdpxceiver retval=$? 
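Annotation: the script above collapses to a single execxdpxceiver call because the test matrix now lives in the binary. A hedged paraphrase of the xdpxceiver.c main() change that appears later in this patch (the identifiers are the ones introduced there):

/* Sketch only: xdpxceiver now walks every mode/type combination itself. */
for (int mode = 0; mode < TEST_MODE_MAX; mode++)		/* SKB, DRV */
	for (int type = 0; type < TEST_TYPE_MAX; type++)	/* nopoll, poll, teardown, bidi, stats */
		run_pkt_test(mode, type);			/* 2 x 5 = 10 kselftest results */

This is also why ksft_set_plan() moves from 1 to TEST_MODE_MAX * TEST_TYPE_MAX further down.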
test_status $retval "${TEST_NAME}" diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index fb13ca2d5606..d78627be060f 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -239,6 +239,7 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .runs = -1, }, /* invalid 8-byte reads from a 4-byte fields in bpf_sk_lookup */ { diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 26ae8d0b6ce3..22554894db99 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -17,6 +17,9 @@ KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vm KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config" INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX" NUM_COMPILE_JOBS="$(nproc)" +LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" +LOG_FILE="${LOG_FILE_BASE}.log" +EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { @@ -146,7 +149,6 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" - local log_file="$2" mount_image @@ -163,11 +165,16 @@ EOF sudo bash -c "cat >${init_script}" <<EOF #!/bin/bash +# Have a default value in the exit status file +# incase the VM is forcefully stopped. +echo "130" > "/root/${EXIT_STATUS_FILE}" + { cd /root/bpf echo ${command} stdbuf -oL -eL ${command} -} 2>&1 | tee /root/${log_file} + echo "\$?" > "/root/${EXIT_STATUS_FILE}" +} 2>&1 | tee "/root/${LOG_FILE}" poweroff -f EOF @@ -221,10 +228,12 @@ EOF copy_logs() { local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" - local log_file="${mount_dir}/root/$1" + local log_file="${mount_dir}/root/${LOG_FILE}" + local exit_status_file="${mount_dir}/root/${EXIT_STATUS_FILE}" mount_image sudo cp ${log_file} "${OUTPUT_DIR}" + sudo cp ${exit_status_file} "${OUTPUT_DIR}" sudo rm -f ${log_file} unmount_image } @@ -263,7 +272,6 @@ main() { local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" local kernel_checkout=$(realpath "${script_dir}"/../../../../) - local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")" # By default the script searches for the kernel in the checkout directory but # it also obeys environment variables O= and KBUILD_OUTPUT= local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" @@ -347,19 +355,23 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" "${log_file}" + update_init_script "${command}" run_vm "${kernel_bzimage}" - copy_logs "${log_file}" - echo "Logs saved in ${OUTPUT_DIR}/${log_file}" + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" } catch() { local exit_code=$1 + local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" # This is just a cleanup and the directory may # have already been unmounted. So, don't let this # clobber the error code we intend to return. 
unmount_image || true + if [[ -f "${exit_status_file}" ]]; then + exit_code="$(cat ${exit_status_file})" + fi exit ${exit_code} } diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index f4a96d5ff524..8b0f7fdd9003 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -18,12 +18,7 @@ * These selftests test AF_XDP SKB and Native/DRV modes using veth * Virtual Ethernet interfaces. * - * The following tests are run: - * - * 1. AF_XDP SKB mode - * Generic mode XDP is driver independent, used when the driver does - * not have support for XDP. Works on any netdevice using sockets and - * generic XDP path. XDP hook from netif_receive_skb(). + * For each mode, the following tests are run: * a. nopoll - soft-irq processing * b. poll - using poll() syscall * c. Socket Teardown @@ -33,19 +28,21 @@ * Configure sockets as bi-directional tx/rx sockets, sets up fill and * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used + * e. Statistics + * Trigger some error conditions and ensure that the appropriate statistics + * are incremented. Within this test, the following statistics are tested: + * i. rx dropped + * Increase the UMEM frame headroom to a value which results in + * insufficient space in the rx buffer for both the packet and the headroom. + * ii. tx invalid + * Set the 'len' field of tx descriptors to an invalid value (umem frame + * size + 1). + * iii. rx ring full + * Reduce the size of the RX ring to a fraction of the fill ring size. + * iv. fill queue empty + * Do not populate the fill queue and then try to receive pkts. * - * 2. AF_XDP DRV/Native mode - * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes - * packets before SKB allocation. Provides better performance than SKB. Driver - * hook available just after DMA of buffer descriptor. - * a. nopoll - * b. poll - * c. Socket Teardown - * d. Bi-directional sockets - * - Only copy mode is supported because veth does not currently support - * zero-copy mode - * - * Total tests: 8 + * Total tests: 10 * * Flow: * ----- @@ -58,7 +55,7 @@ * - Rx thread verifies if all 10k packets were received and delivered in-order, * and have the right content * - * Enable/disable debug mode: + * Enable/disable packet dump mode: * -------------------------- * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") @@ -98,17 +95,24 @@ typedef __u16 __sum16; static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); + if (configured_mode == TEST_MODE_UNCONFIGURED) { + ksft_exit_fail_msg + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + } else { + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); + } } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ - "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ - opt_bidi ? "Bi-directional Sockets" : "")) + (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ + test_type == TEST_TYPE_POLL ? 
"POLL" : "NOPOLL",\ + test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ + test_type == TEST_TYPE_STATS ? "Stats" : "")) static void pthread_init_mutex(void) { @@ -270,13 +274,20 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) { int ret; + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; data->umem = calloc(1, sizeof(struct xsk_umem_info)); if (!data->umem) exit_with_error(errno); ret = xsk_umem__create(&data->umem->umem, buffer, size, - &data->umem->fq, &data->umem->cq, NULL); + &data->umem->fq, &data->umem->cq, &cfg); if (ret) exit_with_error(ret); @@ -308,13 +319,13 @@ static int xsk_configure_socket(struct ifobject *ifobject) exit_with_error(errno); ifobject->xsk->umem = ifobject->umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; + cfg.xdp_flags = xdp_flags; + cfg.bind_flags = xdp_bind_flags; - if (!opt_bidi) { + if (test_type != TEST_TYPE_BIDI) { rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; } else { @@ -334,13 +345,8 @@ static int xsk_configure_socket(struct ifobject *ifobject) static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", optional_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"copy", no_argument, 0, 'c'}, - {"tear-down", no_argument, 0, 'T'}, - {"bidi", optional_argument, 0, 'B'}, - {"debug", optional_argument, 0, 'D'}, + {"dump-pkts", optional_argument, 0, 'D'}, + {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -352,13 +358,8 @@ static void usage(const char *prog) " Options:\n" " -i, --interface Use interface\n" " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP SKB mode\n" - " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" - " -c, --copy Force copy mode\n" - " -T, --tear-down Tear down sockets by repeatedly recreating them\n" - " -B, --bidi Bi-directional sockets test\n" - " -D, --debug Debug mode - dump packets L2 - L5\n" + " -D, --dump-pkts Dump packets L2 - L5\n" + " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); } @@ -392,7 +393,7 @@ static void *nsswitchthread(void *args) ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", __func__, ifdict[targs->idx]->ifname); } else { - ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname); + print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); targs->retptr = true; } } @@ -422,7 +423,7 @@ static int validate_interfaces(void) pthread_join(ns_thread, NULL); if (targs->retptr) - ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); + print_verbose("NS switched: %s\n", ifdict[i]->nsname); free(targs); } else { @@ -432,7 +433,7 @@ static int validate_interfaces(void) ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); ret = false; } else { - ksft_print_msg("Interface 
found: %s\n", ifdict[i]->ifname); + print_verbose("Interface found: %s\n", ifdict[i]->ifname); } } } @@ -446,7 +447,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); if (c == -1) break; @@ -469,40 +470,26 @@ static void parse_command_line(int argc, char **argv) case 'q': opt_queue = atoi(optarg); break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_SKB; - break; - case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_DRV; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'T': - opt_teardown = 1; - break; - case 'B': - opt_bidi = 1; - break; case 'D': debug_pkt_dump = 1; break; case 'C': opt_pkt_count = atoi(optarg); break; + case 'v': + opt_verbose = 1; + break; default: usage(basename(argv[0])); ksft_exit_xfail(); } } + if (!opt_pkt_count) { + print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT); + opt_pkt_count = DEFAULT_PKT_CNT; + } + if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); @@ -599,6 +586,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { u32 idx; unsigned int i; + bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; + u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) complete_tx_only(xsk, batch_size); @@ -607,11 +596,16 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = PKT_SIZE; + tx_desc->len = len; } xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->outstanding_tx += batch_size; + if (!tx_invalid_test) { + xsk->outstanding_tx += batch_size; + } else { + if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + } *frameptr += batch_size; *frameptr %= num_frames; complete_tx_only(xsk, batch_size); @@ -654,7 +648,7 @@ static void tx_only_all(struct ifobject *ifobject) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -714,7 +708,7 @@ static void worker_pkt_dump(void) int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); if (payload == EOT) { - ksft_print_msg("End-of-transmission frame received\n"); + print_verbose("End-of-transmission frame received\n"); fprintf(stdout, "---------------------------------------\n"); break; } @@ -723,6 +717,48 @@ static void worker_pkt_dump(void) } } +static void worker_stats_validate(struct ifobject *ifobject) +{ + struct xdp_statistics stats; + socklen_t optlen; + int err; + struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? 
+ ifdict[!ifobject->ifdict_index]->xsk->xsk : + ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; + + sigvar = 0; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) + return; + + if (optlen == sizeof(struct xdp_statistics)) { + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + xsk_stat = stats.rx_dropped; + break; + case STAT_TEST_TX_INVALID: + xsk_stat = stats.tx_invalid_descs; + break; + case STAT_TEST_RX_FULL: + xsk_stat = stats.rx_ring_full; + expected_stat -= RX_FULL_RXQSIZE; + break; + case STAT_TEST_RX_FILL_EMPTY: + xsk_stat = stats.rx_fill_ring_empty_descs; + break; + default: + break; + } + + if (xsk_stat == expected_stat) + sigvar = 1; + } +} + static void worker_pkt_validate(void) { u32 payloadseqnum = -2; @@ -746,7 +782,7 @@ static void worker_pkt_validate(void) } if (payloadseqnum == EOT) { - ksft_print_msg("End-of-transmission frame received: PASS\n"); + print_verbose("End-of-transmission frame received: PASS\n"); sigvar = 1; break; } @@ -836,7 +872,7 @@ static void *worker_testapp_validate(void *arg) usleep(USLEEP_MAX); } - ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname); + print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); for (int i = 0; i < num_frames; i++) { /*send EOT frame */ if (i == (num_frames - 1)) @@ -850,7 +886,7 @@ static void *worker_testapp_validate(void *arg) gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - ksft_print_msg("Sending %d packets on interface %s\n", + print_verbose("Sending %d packets on interface %s\n", (opt_pkt_count - 1), ifobject->ifname); tx_only_all(ifobject); } else if (ifobject->fv.vector == rx) { @@ -860,8 +896,9 @@ static void *worker_testapp_validate(void *arg) if (!bidi_pass) thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname); - xsk_populate_fill_ring(ifobject->umem); + print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) + xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); if (debug_pkt_dump) { @@ -878,26 +915,32 @@ static void *worker_testapp_validate(void *arg) pthread_mutex_unlock(&sync_mutex); while (1) { - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; } - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); + + if (test_type != TEST_TYPE_STATS) { + rx_pkt(ifobject->xsk, fds); + worker_pkt_validate(); + } else { + worker_stats_validate(ifobject); + } if (sigvar) break; } - ksft_print_msg("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + if (test_type != TEST_TYPE_STATS) + print_verbose("Received %d packets on interface %s\n", + pkt_counter, ifobject->ifname); - if (opt_teardown) - ksft_print_msg("Destroying socket\n"); + if (test_type == TEST_TYPE_TEARDOWN) + print_verbose("Destroying socket\n"); } - if (!opt_bidi || bidi_pass) { + if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { xsk_socket__delete(ifobject->xsk->xsk); (void)xsk_umem__delete(ifobject->umem->umem); } @@ -907,14 +950,15 @@ static void *worker_testapp_validate(void *arg) static void testapp_validate(void) { struct timespec max_wait = { 0, 0 }; + bool bidi = test_type == TEST_TYPE_BIDI; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); - if (opt_bidi && bidi_pass) { + if ((test_type == TEST_TYPE_BIDI) && bidi_pass) { 
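Annotation: the statistics validation above reduces to one getsockopt() call per AF_XDP socket. A self-contained sketch of the same query (assumes libbpf's xsk.h wrapper; the optlen check mirrors the one in worker_stats_validate(), and the helper name is illustrative):

#include <errno.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <bpf/xsk.h>

#ifndef SOL_XDP
#define SOL_XDP 283	/* not exported by all libc headers */
#endif

/* Illustrative helper: fetch per-socket AF_XDP counters. */
static int xsk_read_stats(struct xsk_socket *xsk, struct xdp_statistics *stats)
{
	socklen_t optlen = sizeof(*stats);
	int fd = xsk_socket__fd(xsk);

	if (getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen))
		return -errno;

	/* Older kernels return a shorter struct without the ring-full and
	 * fill-empty counters; only trust those fields when the full size
	 * came back, as the test above does.
	 */
	return optlen == sizeof(*stats) ? 0 : -EOPNOTSUPP;
}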
pthread_init_mutex(); if (!switching_notify) { - ksft_print_msg("Switching Tx/Rx vectors\n"); + print_verbose("Switching Tx/Rx vectors\n"); switching_notify++; } } @@ -922,10 +966,10 @@ static void testapp_validate(void) pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[0]->fv.vector = rx; if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) @@ -942,10 +986,10 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[1]->fv.vector = tx; if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) @@ -964,19 +1008,46 @@ static void testapp_validate(void) free(pkt_buf); } - if (!opt_teardown && !opt_bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } static void testapp_sockets(void) { - for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) { + for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); + i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; - ksft_print_msg("Creating socket\n"); + print_verbose("Creating socket\n"); + testapp_validate(); + test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass; + } + + print_ksft_result(); +} + +static void testapp_stats(void) +{ + for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) { + stat_test_type = i; + + /* reset defaults */ + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + frame_headroom = XSK_UMEM__DEFAULT_FRAME_SIZE - + XDP_PACKET_HEADROOM - 1; + break; + case STAT_TEST_RX_FULL: + rxqsize = RX_FULL_RXQSIZE; + break; + default: + break; + } testapp_validate(); - opt_bidi ? bidi_pass++ : bidi_pass; } print_ksft_result(); @@ -1003,6 +1074,104 @@ static void init_iface_config(struct ifaceconfigobj *ifaceconfig) ifdict[1]->src_port = ifaceconfig->dst_port; } +static void *nsdisablemodethread(void *args) +{ + struct targs *targs = args; + + targs->retptr = false; + + if (switch_namespace(targs->idx)) { + targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); + } else { + targs->retptr = errno; + print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); + } + + pthread_exit(NULL); +} + +static void disable_xdp_mode(int mode) +{ + int err = 0; + __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; + char *mode_str = mode & XDP_FLAGS_SKB_MODE ? 
"skb" : "drv"; + + for (int i = 0; i < MAX_INTERFACES; i++) { + if (strcmp(ifdict[i]->nsname, "")) { + struct targs *targs; + + targs = malloc(sizeof(*targs)); + memset(targs, 0, sizeof(*targs)); + if (!targs) + exit_with_error(errno); + + targs->idx = i; + targs->flags = flags; + if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) + exit_with_error(errno); + + pthread_join(ns_thread, NULL); + err = targs->retptr; + free(targs); + } else { + err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); + } + + if (err) { + print_verbose("Failed to disable %s mode on interface %s\n", + mode_str, ifdict[i]->ifname); + exit_with_error(err); + } + + print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); + configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB; + } +} + +static void run_pkt_test(int mode, int type) +{ + test_type = type; + + /* reset defaults after potential previous test */ + xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + pkt_counter = 0; + switching_notify = 0; + bidi_pass = 0; + prev_pkt = -1; + ifdict[0]->fv.vector = tx; + ifdict[1]->fv.vector = rx; + sigvar = 0; + stat_test_type = -1; + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + switch (mode) { + case (TEST_MODE_SKB): + if (configured_mode == TEST_MODE_DRV) + disable_xdp_mode(XDP_FLAGS_DRV_MODE); + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case (TEST_MODE_DRV): + if (configured_mode == TEST_MODE_SKB) + disable_xdp_mode(XDP_FLAGS_SKB_MODE); + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + break; + } + + pthread_init_mutex(); + + if (test_type == TEST_TYPE_STATS) + testapp_stats(); + else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + testapp_validate(); + else + testapp_sockets(); + + pthread_destroy_mutex(); +} + int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; @@ -1016,6 +1185,7 @@ int main(int argc, char **argv) const char *IP2 = "192.168.100.161"; u16 UDP_DST_PORT = 2020; u16 UDP_SRC_PORT = 2121; + int i, j; ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); @@ -1041,24 +1211,18 @@ int main(int argc, char **argv) init_iface_config(ifaceconfig); - pthread_init_mutex(); + disable_xdp_mode(XDP_FLAGS_DRV_MODE); - ksft_set_plan(1); + ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - if (!opt_teardown && !opt_bidi) { - testapp_validate(); - } else if (opt_teardown && opt_bidi) { - ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n"); - ksft_exit_xfail(); - } else { - testapp_sockets(); + for (i = 0; i < TEST_MODE_MAX; i++) { + for (j = 0; j < TEST_TYPE_MAX; j++) + run_pkt_test(i, j); } for (int i = 0; i < MAX_INTERFACES; i++) free(ifdict[i]); - pthread_destroy_mutex(); - ksft_exit_pass(); return 0; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 0e9f9b7e61c2..30314ef305c2 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -41,33 +41,59 @@ #define BATCH_SIZE 64 #define POLL_TMOUT 1000 #define NEED_WAKEUP true +#define DEFAULT_PKT_CNT 10000 +#define RX_FULL_RXQSIZE 32 + +#define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; -enum TESTS { - ORDER_CONTENT_VALIDATE_XDP_SKB = 0, - ORDER_CONTENT_VALIDATE_XDP_DRV = 1, +enum TEST_MODES { + TEST_MODE_UNCONFIGURED = -1, + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_MAX +}; + +enum TEST_TYPES { + TEST_TYPE_NOPOLL, + TEST_TYPE_POLL, + TEST_TYPE_TEARDOWN, + TEST_TYPE_BIDI, + TEST_TYPE_STATS, + TEST_TYPE_MAX }; -u8 uut; -u8 debug_pkt_dump; -u32 num_frames; -u8 switching_notify; -u8 bidi_pass; +enum STAT_TEST_TYPES { + STAT_TEST_RX_DROPPED, + STAT_TEST_TX_INVALID, + STAT_TEST_RX_FULL, + STAT_TEST_RX_FILL_EMPTY, + STAT_TEST_TYPE_MAX +}; + +static int configured_mode = TEST_MODE_UNCONFIGURED; +static u8 debug_pkt_dump; +static u32 num_frames; +static u8 switching_notify; +static u8 bidi_pass; +static int test_type; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int opt_queue; static int opt_pkt_count; -static int opt_poll; -static int opt_teardown; -static int opt_bidi; -static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u8 opt_verbose; + +static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; -static u32 prev_pkt = -1; +static long prev_pkt = -1; static int sigvar; +static int stat_test_type; +static u32 rxqsize; +static u32 frame_headroom; struct xsk_umem_info { struct xsk_ring_prod fq; @@ -137,8 +163,9 @@ pthread_t t0, t1, ns_thread; pthread_attr_t attr; struct targs { - bool retptr; + u8 retptr; int idx; + u32 flags; }; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 9d54c4645127..dac1c5f78752 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -82,24 +82,21 @@ clear_configs() { if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; } - echo "removing ns $3" + { ip netns exec $3 ip link del $2; } ip netns del $3 fi #Once we delete a veth pair node, the entire veth pair is removed, #this is just to be cautious just incase the NS does not exist then #veth node inside NS won't get removed so we explicitly remove it [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1"; ip link del $1; } + { ip link del $1; } if [ -f ${SPECFILE} ]; then - echo "removing spec file:" ${SPECFILE} rm -f ${SPECFILE} fi } cleanup_exit() { - echo "cleaning up..." clear_configs $1 $2 $3 } @@ -108,28 +105,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; } } -vethXDPgeneric() -{ - ip link set dev $1 xdpdrv off - ip netns exec $3 ip link set dev $2 xdpdrv off -} - -vethXDPnative() -{ - ip link set dev $1 xdpgeneric off - ip netns exec $3 ip link set dev $2 xdpgeneric off -} - execxdpxceiver() { - local -a 'paramkeys=("${!'"$1"'[@]}")' copy - paramkeysstr=${paramkeys[*]} - - for index in $paramkeysstr; - do - current=$1"[$index]" - copy[$index]=${!current} - done - - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } |
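One more worked number, as an annotation on the STAT_TEST_RX_DROPPED setup in testapp_stats() above (hedged; it relies on the usual defaults rather than anything new in this series):

/* Assumptions: XSK_UMEM__DEFAULT_FRAME_SIZE = 2048, XDP_PACKET_HEADROOM = 256. */
/* frame_headroom = 2048 - 256 - 1 = 1791                                       */
/* room left for packet data: 2048 - 256 - 1791 = 1 byte                        */
/* so every received frame is too large to copy into its UMEM slot and shows up */
/* in the rx_dropped counter that the stats test then reads via XDP_STATISTICS. */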