From 0a09a2f933c73dc76ab0b72da6855f44342a8903 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 21 Feb 2023 21:06:42 +0100 Subject: bpf: Annotate data races in bpf_local_storage There are a few cases where hlist_node is checked to be unhashed without holding the lock protecting its modification. In this case, one must use hlist_unhashed_lockless to avoid load tearing and KCSAN reports. Fix this by using lockless variant in places not protected by the lock. Since this is not prompted by any actual KCSAN reports but only from code review, I have not included a fixes tag. Cc: Martin KaFai Lau Cc: KP Singh Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230221200646.2500777-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 35f4138a54dc..58da17ae5124 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner) return map->ops->map_owner_storage_ptr(owner); } +static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed_lockless(&selem->snode); +} + static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); } +static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed_lockless(&selem->map_node); +} + static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); @@ -174,7 +184,7 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool free_local_storage = false; unsigned long flags; - if (unlikely(!selem_linked_to_storage(selem))) + if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ return; @@ -208,7 +218,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map_bucket *b; unsigned long flags; - if (unlikely(!selem_linked_to_map(selem))) + if (unlikely(!selem_linked_to_map_lockless(selem))) /* selem has already be unlinked from smap */ return; @@ -420,7 +430,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, err = check_flags(old_sdata, map_flags); if (err) return ERR_PTR(err); - if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { + if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) { copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; -- cgit v1.2.3-58-ga151 From 521d3c0a1730c29c96870919a7a115577e17f8c7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 21 Feb 2023 21:06:43 +0100 Subject: bpf: Remove unused MEM_ALLOC | PTR_TRUSTED checks The plan is to supposedly tag everything with PTR_TRUSTED eventually, however those changes should bring in their respective code, instead of leaving it around right now. It is arguable whether PTR_TRUSTED is required for all types, when it's only use case is making PTR_TO_BTF_ID a bit stronger, while all other types are trusted by default. Hence, just drop the two instances which do not occur in the verifier for now to avoid reader confusion. 
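As background for the hlist_unhashed_lockless() conversion in the bpf_local_storage patch above, here is a minimal sketch of the pattern it follows (the struct and function names are illustrative, not kernel code): the plain check is only safe under the lock that serializes hashing and unhashing, while a reader outside that lock needs the READ_ONCE()-based variant so the pprev load cannot tear and KCSAN stays quiet:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item {
            struct hlist_node node;         /* hashed/unhashed under ->lock */
            spinlock_t lock;
    };

    /* Caller holds ->lock: the plain check is fine. */
    static bool item_linked(struct item *it)
    {
            lockdep_assert_held(&it->lock);
            return !hlist_unhashed(&it->node);
    }

    /* Caller does not hold ->lock and may race with hashing/unhashing on
     * another CPU: hlist_unhashed_lockless() loads node.pprev with
     * READ_ONCE(), avoiding load tearing and KCSAN reports.
     */
    static bool item_linked_lockless(struct item *it)
    {
            return !hlist_unhashed_lockless(&it->node);
    }
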
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230221200646.2500777-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..477c22c9bbd7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6656,7 +6656,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset @@ -9211,7 +9210,6 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->map_ptr; break; case PTR_TO_BTF_ID | MEM_ALLOC: - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: ptr = reg->btf; break; default: -- cgit v1.2.3-58-ga151 From da03e43a8c500fcfb11ac5eeb03c1b4a9c1dd958 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 21 Feb 2023 21:06:44 +0100 Subject: bpf: Fix check_reg_type for PTR_TO_BTF_ID The current code does type matching for the case where reg->type is PTR_TO_BTF_ID or has the PTR_TRUSTED flag. However, this only needs to occur for non-MEM_ALLOC and non-MEM_PERCPU cases, but will include both as per the current code. The MEM_ALLOC case with or without PTR_TRUSTED needs to be handled specially by the code for type_is_alloc case, while MEM_PERCPU case must be ignored. Hence, to restore correct behavior and for clarity, explicitly list out the handled PTR_TO_BTF_ID types which should be handled for each case using a switch statement. Helpers currently only take: PTR_TO_BTF_ID PTR_TO_BTF_ID | PTR_TRUSTED PTR_TO_BTF_ID | MEM_RCU PTR_TO_BTF_ID | MEM_ALLOC PTR_TO_BTF_ID | MEM_PERCPU PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED This fix was also described (for the MEM_ALLOC case) in [0]. [0]: https://lore.kernel.org/bpf/20221121160657.h6z7xuvedybp5y7s@apollo Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230221200646.2500777-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 477c22c9bbd7..062fd4a05234 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6527,7 +6527,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { + if (base_type(reg->type) != PTR_TO_BTF_ID) + return 0; + + switch ((int)reg->type) { + case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: + { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. 
@@ -6563,13 +6570,23 @@ found: return -EACCES; } } - } else if (type_is_alloc(reg->type)) { + break; + } + case PTR_TO_BTF_ID | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } + /* Handled by helper specific checks */ + break; + case PTR_TO_BTF_ID | MEM_PERCPU: + case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: + /* Handled by helper specific checks */ + break; + default: + verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + return -EFAULT; } - return 0; } -- cgit v1.2.3-58-ga151 From dbd8d22863e83ee2834642e4cfd3bdacb8a1c975 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 21 Feb 2023 21:06:45 +0100 Subject: bpf: Wrap register invalidation with a helper Typically, verifier should use env->allow_ptr_leaks when invaliding registers for users that don't have CAP_PERFMON or CAP_SYS_ADMIN to avoid leaking the pointer value. This is similar in spirit to c67cae551f0d ("bpf: Tighten ptr_to_btf_id checks."). In a lot of the existing checks, we know the capabilities are present, hence we don't do the check. Instead of being inconsistent in the application of the check, wrap the action of invalidating a register into a helper named 'mark_invalid_reg' and use it in a uniform fashion to replace open coded invalidation operations, so that the check is always made regardless of the call site and we don't have to remember whether it needs to be done or not for each case. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230221200646.2500777-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 062fd4a05234..741cb5107536 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -895,6 +895,14 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); +static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, reg); + else + __mark_reg_unknown(env, reg); +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { @@ -934,12 +942,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) continue; - if (dreg->dynptr_id == dynptr_id) { - if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, dreg); - else - __mark_reg_unknown(env, dreg); - } + if (dreg->dynptr_id == dynptr_id) + mark_reg_invalid(env, dreg); })); /* Do not release reference state, we are destroying dynptr on stack, @@ -7384,7 +7388,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(env, reg); + mark_reg_invalid(env, reg); })); } @@ -7429,12 +7433,8 @@ static int release_reference(struct bpf_verifier_env *env, return err; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, reg); - 
else - __mark_reg_unknown(env, reg); - } + if (reg->ref_obj_id == ref_obj_id) + mark_reg_invalid(env, reg); })); return 0; @@ -7447,7 +7447,7 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (type_is_non_owning_ref(reg->type)) - __mark_reg_unknown(env, reg); + mark_reg_invalid(env, reg); })); } -- cgit v1.2.3-58-ga151 From 5d5de3a431d87ac51d43da8d796891d014975ab7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 16 Feb 2023 10:48:21 +0800 Subject: bpf: Only allocate one bpf_mem_cache for bpf_cpumask_ma The size of bpf_cpumask is fixed, so there is no need to allocate many bpf_mem_caches for bpf_cpumask_ma, just one bpf_mem_cache is enough. Also add comments for bpf_mem_alloc_init() in bpf_mem_alloc.h to prevent future miuse. Signed-off-by: Hou Tao Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230216024821.2202916-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 +++++++ kernel/bpf/cpumask.c | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 3e164b8efaa9..a7104af61ab4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,13 @@ struct bpf_mem_alloc { struct work_struct work; }; +/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. + * Alloc and free are done with bpf_mem_cache_{alloc,free}(). + * + * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. + * Alloc and free are done with bpf_mem_{alloc,free}() and the size of + * the returned object is given by the size argument of bpf_mem_alloc(). + */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 52b981512a35..2b3fbbfebdc5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -55,7 +55,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0); - cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); + cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma); if (!cpumask) return NULL; @@ -123,7 +123,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) if (refcount_dec_and_test(&cpumask->usage)) { migrate_disable(); - bpf_mem_free(&bpf_cpumask_ma, cpumask); + bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); migrate_enable(); } } @@ -468,7 +468,7 @@ static int __init cpumask_kfunc_init(void) }, }; - ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); + ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, -- cgit v1.2.3-58-ga151 From df2ccc180a2e6f6e4343ebee99dcfab4f8af2816 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 20 Feb 2023 17:37:56 +0100 Subject: bpf: Check for helper calls in check_subprogs() The condition src_reg != BPF_PSEUDO_CALL && imm == BPF_FUNC_tail_call may be satisfied by a kfunc call. This would lead to unnecessarily setting has_tail_call. Use src_reg == 0 instead. 
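For reference, a sketch of the instruction-level distinction this fix relies on (the wrapper function below is illustrative, not verifier code): BPF_JMP | BPF_CALL covers helper calls (src_reg == 0, imm is the helper ID), calls to BPF subprograms (src_reg == BPF_PSEUDO_CALL), and kfunc calls (src_reg == BPF_PSEUDO_KFUNC_CALL, imm is a BTF ID that can numerically collide with BPF_FUNC_tail_call):

    #include <linux/bpf.h>

    /* Sketch only: classify a BPF_JMP | BPF_CALL instruction. Helper
     * calls have src_reg == 0 and the helper ID in imm; subprogram
     * calls use src_reg == BPF_PSEUDO_CALL; kfunc calls use
     * src_reg == BPF_PSEUDO_KFUNC_CALL with a BTF ID in imm, which can
     * equal BPF_FUNC_tail_call even though no tail call is involved.
     */
    static bool insn_is_helper_tail_call(const struct bpf_insn *insn)
    {
            if (insn->code != (BPF_JMP | BPF_CALL))
                    return false;
            return insn->src_reg == 0 && insn->imm == BPF_FUNC_tail_call;
    }
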
Signed-off-by: Ilya Leoshkevich Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230220163756.753713-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 741cb5107536..5cb8b623f639 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2479,8 +2479,8 @@ static int check_subprogs(struct bpf_verifier_env *env) u8 code = insn[i].code; if (code == (BPF_JMP | BPF_CALL) && - insn[i].imm == BPF_FUNC_tail_call && - insn[i].src_reg != BPF_PSEUDO_CALL) + insn[i].src_reg == 0 && + insn[i].imm == BPF_FUNC_tail_call) subprog[cur_subprog].has_tail_call = true; if (BPF_CLASS(code) == BPF_LD && (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) -- cgit v1.2.3-58-ga151 From 332ea1f697be148bd5e66475d82b5ecc5084da65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2023 15:29:12 -1000 Subject: bpf: Add bpf_cgroup_from_id() kfunc cgroup ID is an userspace-visible 64bit value uniquely identifying a given cgroup. As the IDs are used widely, it's useful to be able to look up the matching cgroups. Add bpf_cgroup_from_id(). v2: Separate out selftest into its own patch as suggested by Alexei. Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/Y/bBaG96t0/gQl9/@slm.duckdns.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 10 +++++++--- kernel/bpf/helpers.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index ca96ef3f6896..226313747be5 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -583,13 +583,17 @@ Here's an example of how it can be used: ---- -Another kfunc available for interacting with ``struct cgroup *`` objects is -bpf_cgroup_ancestor(). This allows callers to access the ancestor of a cgroup, -and return it as a cgroup kptr. +Other kfuncs available for interacting with ``struct cgroup *`` objects are +bpf_cgroup_ancestor() and bpf_cgroup_from_id(), allowing callers to access +the ancestor of a cgroup and find a cgroup by its ID, respectively. Both +return a cgroup kptr. .. kernel-doc:: kernel/bpf/helpers.c :identifiers: bpf_cgroup_ancestor +.. kernel-doc:: kernel/bpf/helpers.c + :identifiers: bpf_cgroup_from_id + Eventually, BPF should be updated to allow this to happen with a normal memory load in the program itself. This is currently not possible without more work in the verifier. bpf_cgroup_ancestor() can be used as follows: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5b278a38ae58..a784be6f8bac 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2101,6 +2101,23 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) cgroup_get(ancestor); return ancestor; } + +/** + * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this + * kfunc which is not subsequently stored in a map, must be released by calling + * bpf_cgroup_release(). + * @cgrp: The cgroup for which we're performing a lookup. + * @level: The level of ancestor to look up. 
+ */ +__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) +{ + struct cgroup *cgrp; + + cgrp = cgroup_get_from_id(cgid); + if (IS_ERR(cgrp)) + return NULL; + return cgrp; +} #endif /* CONFIG_CGROUPS */ /** @@ -2167,6 +2184,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_SET8_END(generic_btf_ids) -- cgit v1.2.3-58-ga151 From 30a2d8328d8ac1bb0a6bf73f4f4cf03f4f5977cc Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 28 Feb 2023 09:28:45 -0600 Subject: bpf: Fix bpf_cgroup_from_id() doxygen header In commit 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc"), a new bpf_cgroup_from_id() kfunc was added which allows a BPF program to lookup and acquire a reference to a cgroup from a cgroup id. The commit's doxygen comment seems to have copy-pasted fields, which causes BPF kfunc helper documentation to fail to render: /helpers.c:2114: warning: Excess function parameter 'cgrp'... /helpers.c:2114: warning: Excess function parameter 'level'... /helpers.c:2114: warning: Excess function parameter 'level'... This patch fixes the doxygen header. Fixes: 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc") Signed-off-by: David Vernet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230228152845.294695-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a784be6f8bac..abdcc52f90a6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2106,8 +2106,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). - * @cgrp: The cgroup for which we're performing a lookup. - * @level: The level of ancestor to look up. + * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { -- cgit v1.2.3-58-ga151 From 2f46439346700a2b41cf0fa9432f110f42fd8821 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:44 -0800 Subject: bpf: Support "sk_buff" and "xdp_buff" as valid kfunc arg types The bpf mirror of the in-kernel sk_buff and xdp_buff data structures are __sk_buff and xdp_md. Currently, when we pass in the program ctx to a kfunc where the program ctx is a skb or xdp buffer, we reject the program if the in-kernel definition is sk_buff/xdp_buff instead of __sk_buff/xdp_md. This change allows "sk_buff <--> __sk_buff" and "xdp_buff <--> xdp_md" to be recognized as valid matches. The user program may pass in their program ctx as a __sk_buff or xdp_md, and the in-kernel definition of the kfunc may define this arg as a sk_buff or xdp_buff. 
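A BPF-C sketch of the case this enables, using bpf_dynptr_from_skb() (added later in this series) as the running example; the extern declaration follows the selftest style and is an assumption here, not part of this patch:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    /* Declared with the uapi ctx type __sk_buff; the in-kernel kfunc
     * definition takes struct sk_buff *.
     */
    extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
                                   struct bpf_dynptr *ptr__uninit) __ksym;

    SEC("tc")
    int pass_ctx_to_kfunc(struct __sk_buff *ctx)
    {
            struct bpf_dynptr ptr;

            /* ctx is __sk_buff in BPF C, sk_buff in the kernel prototype;
             * with this patch the verifier accepts the pairing.
             */
            if (bpf_dynptr_from_skb(ctx, 0, &ptr))
                    return 1;
            return 0;       /* TC_ACT_OK */
    }

    char _license[] SEC("license") = "GPL";
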
Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-2-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fa22ec79ac0e..84cca8473873 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5683,6 +5683,10 @@ again: * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ + if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) + return ctx_type; + if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) + return ctx_type; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again -- cgit v1.2.3-58-ga151 From 7e0dac2807e6c4ae8c56941d74971fdb0763b4f9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:45 -0800 Subject: bpf: Refactor process_dynptr_func This change cleans up process_dynptr_func's flow to be more intuitive and updates some comments with more context. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-3-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 --- kernel/bpf/verifier.c | 62 ++++++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cf1bb1cf4a7b..b26ff2a8f63b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -struct bpf_call_arg_meta; -int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5cb8b623f639..e0e00509846b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -959,39 +959,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, return 0; } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi) +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + int spi; + if (reg->type == CONST_PTR_TO_DYNPTR) return false; - /* For -ERANGE (i.e. spi not falling into allocated stack slots), we - * will do check_mem_access to check and update stack bounds later, so - * return true for that case. + spi = dynptr_get_spi(env, reg); + + /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an + * error because this just means the stack state hasn't been updated yet. + * We will do check_mem_access to check and update stack bounds later. */ - if (spi < 0) - return spi == -ERANGE; - /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see - * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to - * ensure dynptr objects at the slots we are touching are completely - * destructed before we reinitialize them for a new one. 
For referenced - * ones, destroy_if_dynptr_stack_slot returns an error early instead of - * delaying it until the end where the user will get "Unreleased + if (spi < 0 && spi != -ERANGE) + return false; + + /* We don't need to check if the stack slots are marked by previous + * dynptr initializations because we allow overwriting existing unreferenced + * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls + * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are + * touching are completely destructed before we reinitialize them for a new + * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early + * instead of delaying it until the end where the user will get "Unreleased * reference" error. */ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int i; + int i, spi; - /* This already represents first slot of initialized bpf_dynptr */ + /* This already represents first slot of initialized bpf_dynptr. + * + * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic, so we don't need to check its + * offset and alignment. + */ if (reg->type == CONST_PTR_TO_DYNPTR) return true; + spi = dynptr_get_spi(env, reg); if (spi < 0) return false; if (!state->stack[spi].spilled_ptr.dynptr.first_slot) @@ -6215,11 +6225,10 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument * type, and declare it as 'const struct bpf_dynptr *' in their prototype. */ -int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +static int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - int spi = 0; /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -6228,15 +6237,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); return -EFAULT; } - /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to - * check_func_arg_reg_off's logic. We only need to check offset - * and its alignment for PTR_TO_STACK. - */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 && spi != -ERANGE) - return spi; - } /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. @@ -6254,7 +6254,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, * to. 
*/ if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg, spi)) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } @@ -6277,7 +6277,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg, spi)) { + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", regno); -- cgit v1.2.3-58-ga151 From 1d18feb2c915c5ad0a9a61d04b8560e8efb78ce8 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:46 -0800 Subject: bpf: Allow initializing dynptrs in kfuncs This change allows kfuncs to take in an uninitialized dynptr as a parameter. Before this change, only helper functions could successfully use uninitialized dynptrs. This change moves the memory access check (including stack state growing and slot marking) into process_dynptr_func(), which both helpers and kfuncs call into. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-4-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 67 +++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0e00509846b..82e39fc5ed05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -268,7 +268,6 @@ struct bpf_call_arg_meta { u32 ret_btf_id; u32 subprogno; struct btf_field *kptr_field; - u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -6225,10 +6224,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument * type, and declare it as 'const struct bpf_dynptr *' in their prototype. */ -static int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, + enum bpf_arg_type arg_type) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int err; /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -6254,23 +6254,23 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, * to. */ if (arg_type & MEM_UNINIT) { + int i; + if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. 
- */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; } - meta->uninit_dynptr_regno = regno; + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx); } else /* MEM_RDONLY and None case from above */ { - int err; - /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); @@ -6306,10 +6306,8 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, } err = mark_dynptr_read(env, reg); - if (err) - return err; } - return 0; + return err; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -6719,7 +6717,8 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, - const struct bpf_func_proto *fn) + const struct bpf_func_proto *fn, + int insn_idx) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -6932,7 +6931,7 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, arg_type, meta); + err = process_dynptr_func(env, regno, insn_idx, arg_type); if (err) return err; break; @@ -8218,7 +8217,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - err = check_func_arg(env, i, &meta, fn); + err = check_func_arg(env, i, &meta, fn, insn_idx); if (err) return err; } @@ -8243,30 +8242,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr - * is safe to do directly. 
- */ - if (meta.uninit_dynptr_regno) { - if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); - return -EFAULT; - } - /* we write BPF_DW bits (8 bytes) at a time */ - for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, - i, BPF_DW, BPF_WRITE, -1, false); - if (err) - return err; - } - - err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], - fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], - insn_idx); - if (err) - return err; - } - if (meta.release_regno) { err = -EINVAL; /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot @@ -9475,7 +9450,8 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } -static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + int insn_idx) { const char *func_name = meta->func_name, *ref_tname; const struct btf *btf = meta->btf; @@ -9672,7 +9648,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); + ret = process_dynptr_func(env, regno, insn_idx, + ARG_PTR_TO_DYNPTR | MEM_RDONLY); if (ret < 0) return ret; break; @@ -9880,7 +9857,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } /* Check the arguments */ - err = check_kfunc_args(env, &meta); + err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) return err; /* In case of release function, we get register number of refcounted -- cgit v1.2.3-58-ga151 From 485ec51ef9764c0f67d35cabba0a963936b9126e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:48 -0800 Subject: bpf: Refactor verifier dynptr into get_dynptr_arg_reg This commit refactors the logic for determining which register in a function is the dynptr into "get_dynptr_arg_reg". This will be used in the future when the dynptr reg for BPF_FUNC_dynptr_write will need to be obtained in order to support writes for skb dynptrs. 
Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-6-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 80 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82e39fc5ed05..8fd2f26a8977 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6689,6 +6689,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, } } +static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, + const struct bpf_func_proto *fn, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *state = NULL; + int i; + + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (state) { + verbose(env, "verifier internal error: multiple dynptr args\n"); + return NULL; + } + state = ®s[BPF_REG_1 + i]; + } + + if (!state) + verbose(env, "verifier internal error: no dynptr arg found\n"); + + return state; +} + static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); @@ -8326,43 +8348,41 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_dynptr_data: - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - if (arg_type_is_dynptr(fn->arg_type[i])) { - struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; - int id, ref_obj_id; - - if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); - return -EFAULT; - } - - if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); - return -EFAULT; - } + { + struct bpf_reg_state *reg; + int id, ref_obj_id; - id = dynptr_id(env, reg); - if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); - return id; - } + reg = get_dynptr_arg_reg(env, fn, regs); + if (!reg) + return -EFAULT; - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); - return ref_obj_id; - } - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - break; - } + if (meta.dynptr_id) { + verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + return -EFAULT; } - if (i == MAX_BPF_FUNC_REG_ARGS) { - verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + if (meta.ref_obj_id) { + verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } + + id = dynptr_id(env, reg); + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } + + ref_obj_id = dynptr_ref_obj_id(env, reg); + if (ref_obj_id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + return ref_obj_id; + } + + meta.dynptr_id = id; + meta.ref_obj_id = ref_obj_id; + break; + } case BPF_FUNC_user_ringbuf_drain: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_user_ringbuf_callback_state); -- cgit v1.2.3-58-ga151 From d96d937d7c5c12237dce1f14bf0fc9900cabba09 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:49 -0800 Subject: bpf: Add __uninit kfunc annotation This patch adds __uninit as a kfunc annotation. This will be useful for scenarios such as for example in dynptrs, indicating whether the dynptr should be checked by the verifier as an initialized or an uninitialized dynptr. 
Without this annotation, the alternative would be needing to hard-code in the verifier the specific kfunc to indicate that arg should be treated as an uninitialized arg. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-7-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 17 +++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 226313747be5..9a78533d25ac 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -100,6 +100,23 @@ Hence, whenever a constant scalar argument is accepted by a kfunc which is not a size parameter, and the value of the constant matters for program safety, __k suffix should be used. +2.2.2 __uninit Annotation +-------------------- + +This annotation is used to indicate that the argument will be treated as +uninitialized. + +An example is given below:: + + __bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit) + { + ... + } + +Here, the dynptr will be treated as an uninitialized dynptr. Without this +annotation, the verifier will reject the program if the dynptr passed in is +not initialized. + .. _BPF_kfunc_nodef: 2.3 Using an existing kernel function diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8fd2f26a8977..d052aa5800de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8727,6 +8727,11 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param return __kfunc_param_match_suffix(btf, arg, "__alloc"); } +static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__uninit"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -9662,17 +9667,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_DYNPTR: + { + enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; + if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - ret = process_dynptr_func(env, regno, insn_idx, - ARG_PTR_TO_DYNPTR | MEM_RDONLY); + if (reg->type == CONST_PTR_TO_DYNPTR) + dynptr_arg_type |= MEM_RDONLY; + + if (is_kfunc_arg_uninit(btf, &args[i])) + dynptr_arg_type |= MEM_UNINIT; + + ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); if (ret < 0) return ret; break; + } case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { -- cgit v1.2.3-58-ga151 From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:50 -0800 Subject: bpf: Add skb dynptrs Add skb dynptrs, which are dynptrs whose underlying pointer points to a skb. The dynptr acts on skb data. skb dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of skb->data and skb->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). 
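A rough BPF-C sketch of the ergonomics argument, assuming the selftest-style extern declaration of bpf_dynptr_from_skb(); bounds are enforced by bpf_dynptr_read() rather than by hand-written data/data_end checks:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
                                   struct bpf_dynptr *ptr__uninit) __ksym;

    SEC("tc")
    int parse_eth_with_dynptr(struct __sk_buff *ctx)
    {
            struct bpf_dynptr ptr;
            struct ethhdr eth;

            if (bpf_dynptr_from_skb(ctx, 0, &ptr))
                    return 0;
            /* No hand-written skb->data/data_end bounds check: the read
             * fails if the range is out of bounds, and it also works when
             * the bytes live in non-linear (paged) skb data.
             */
            if (bpf_dynptr_read(&eth, sizeof(eth), &ptr, 0, 0))
                    return 0;
            bpf_printk("h_proto=0x%x", bpf_ntohs(eth.h_proto));
            return 0;
    }

    char _license[] SEC("license") = "GPL";
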
For bpf prog types that don't support writes on skb data, the dynptr is read-only (bpf_dynptr_write() will return an error) For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write() interfaces, reading and writing from/to data in the head as well as from/to non-linear paged buffers is supported. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used. For examples of how skb dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++- include/linux/filter.h | 18 ++++++++++ include/uapi/linux/bpf.h | 13 ++++++-- kernel/bpf/btf.c | 18 ++++++++++ kernel/bpf/helpers.c | 76 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/verifier.c | 61 +++++++++++++++++++++++++++++++++ net/core/filter.c | 67 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 13 ++++++-- 8 files changed, 261 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 296841a31749..e7436d7615b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,11 +607,14 @@ enum bpf_type_flag { */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to sk_buff */ + DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1146,6 +1149,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, + /* Underlying data is a sk_buff */ + BPF_DYNPTR_TYPE_SKB, }; int bpf_dynptr_check_size(u32 size); @@ -2846,6 +2851,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); +int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, @@ -2867,6 +2874,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, { return 0; } +static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr) +{ + return -EOPNOTSUPP; +} #endif #ifdef CONFIG_INET diff --git a/include/linux/filter.h b/include/linux/filter.h index 1727898f1641..de18e844d15e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1542,4 +1542,22 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index return XDP_REDIRECT; } +#ifdef CONFIG_NET +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags); +#else /* CONFIG_NET */ +static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, + void *to, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h 
b/include/uapi/linux/bpf.h index 62ce1f5d1b1d..d0351d30e551 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5325,11 +5325,17 @@ union bpf_attr { * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. - * *flags* is currently unused. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr or if *flags* is not 0. + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description @@ -5337,6 +5343,9 @@ union bpf_attr { * * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. + * + * skb type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 84cca8473873..ef2d8969ed1f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -207,6 +207,11 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_FMODRET, + BTF_KFUNC_HOOK_CGROUP_SKB, + BTF_KFUNC_HOOK_SCHED_ACT, + BTF_KFUNC_HOOK_SK_SKB, + BTF_KFUNC_HOOK_SOCKET_FILTER, + BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_MAX, }; @@ -7708,6 +7713,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; + case BPF_PROG_TYPE_CGROUP_SKB: + return BTF_KFUNC_HOOK_CGROUP_SKB; + case BPF_PROG_TYPE_SCHED_ACT: + return BTF_KFUNC_HOOK_SCHED_ACT; + case BPF_PROG_TYPE_SK_SKB: + return BTF_KFUNC_HOOK_SK_SKB; + case BPF_PROG_TYPE_SOCKET_FILTER: + return BTF_KFUNC_HOOK_SOCKET_FILTER; + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + return BTF_KFUNC_HOOK_LWT; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index abdcc52f90a6..e8e2414d1587 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1420,11 +1420,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) return ptr->size & DYNPTR_RDONLY_BIT; } +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) +{ + ptr->size |= DYNPTR_RDONLY_BIT; +} + static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } +static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) +{ + return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; +} + u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; @@ -1497,6 +1507,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { + enum bpf_dynptr_type type; int err; if (!src->data || flags) @@ -1506,13 +1517,23 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern if (err) return err; - /* 
Source and destination may possibly overlap, hence use memmove to - * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr - * pointing to overlapping PTR_TO_MAP_VALUE regions. - */ - memmove(dst, src->data + src->offset + offset, len); + type = bpf_dynptr_get_type(src); - return 0; + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); + return 0; + case BPF_DYNPTR_TYPE_SKB: + return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); + default: + WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); + return -EFAULT; + } } static const struct bpf_func_proto bpf_dynptr_read_proto = { @@ -1529,22 +1550,36 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { + enum bpf_dynptr_type type; int err; - if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; - /* Source and destination may possibly overlap, hence use memmove to - * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr - * pointing to overlapping PTR_TO_MAP_VALUE regions. - */ - memmove(dst->data + dst->offset + offset, src, len); + type = bpf_dynptr_get_type(dst); - return 0; + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + if (flags) + return -EINVAL; + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. 
+ */ + memmove(dst->data + dst->offset + offset, src, len); + return 0; + case BPF_DYNPTR_TYPE_SKB: + return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, + flags); + default: + WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); + return -EFAULT; + } } static const struct bpf_func_proto bpf_dynptr_write_proto = { @@ -1560,6 +1595,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { + enum bpf_dynptr_type type; int err; if (!ptr->data) @@ -1572,7 +1608,19 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 if (bpf_dynptr_is_rdonly(ptr)) return 0; - return (unsigned long)(ptr->data + ptr->offset + offset); + type = bpf_dynptr_get_type(ptr); + + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + return (unsigned long)(ptr->data + ptr->offset + offset); + case BPF_DYNPTR_TYPE_SKB: + /* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ + return 0; + default: + WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); + return 0; + } } static const struct bpf_func_proto bpf_dynptr_data_proto = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d052aa5800de..4f5fce16543b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -750,6 +750,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_LOCAL; case DYNPTR_TYPE_RINGBUF: return BPF_DYNPTR_TYPE_RINGBUF; + case DYNPTR_TYPE_SKB: + return BPF_DYNPTR_TYPE_SKB; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -6295,6 +6297,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn case DYNPTR_TYPE_RINGBUF: err_extra = "ringbuf"; break; + case DYNPTR_TYPE_SKB: + err_extra = "skb "; + break; default: err_extra = ""; break; @@ -6737,6 +6742,24 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state return state->stack[spi].spilled_ptr.ref_obj_id; } +static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->dynptr.type; + + spi = __get_spi(reg->off); + if (spi < 0) { + verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); + return BPF_DYNPTR_TYPE_INVALID; + } + + return state->stack[spi].spilled_ptr.dynptr.type; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -8383,6 +8406,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } + case BPF_FUNC_dynptr_write: + { + enum bpf_dynptr_type dynptr_type; + struct bpf_reg_state *reg; + + reg = get_dynptr_arg_reg(env, fn, regs); + if (!reg) + return -EFAULT; + + dynptr_type = dynptr_get_type(env, reg); + if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) + return -EFAULT; + + if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + /* this will trigger clear_all_pkt_pointers(), which will + * invalidate all dynptr slices associated with the skb + */ + changes_data = true; + + break; + } case BPF_FUNC_user_ringbuf_drain: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_user_ringbuf_callback_state); @@ -8898,6 +8942,7 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add, KF_bpf_rbtree_first, + KF_bpf_dynptr_from_skb, }; 
BTF_SET_START(special_kfunc_set) @@ -8912,6 +8957,7 @@ BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8928,6 +8974,7 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -9682,6 +9729,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) + dynptr_arg_type |= DYNPTR_TYPE_SKB; + ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); if (ret < 0) return ret; @@ -16356,6 +16406,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + bool seen_direct_write = env->seen_direct_write; + bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); + + if (is_rdonly) + insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly); + + /* restore env->seen_direct_write to its original value, since + * may_access_direct_pkt_data mutates it + */ + env->seen_direct_write = seen_direct_write; } return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index 1d6f165923bf..f3afa31a9b10 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1721,6 +1721,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags) +{ + return ____bpf_skb_store_bytes(skb, offset, from, len, flags); +} + BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { @@ -1751,6 +1757,11 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) +{ + return ____bpf_skb_load_bytes(skb, offset, to, len); +} + BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) @@ -11621,3 +11632,59 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) return func; } + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr__uninit) +{ + if (flags) { + bpf_dynptr_set_null(ptr__uninit); + return -EINVAL; + } + + bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); + + return 0; +} +__diag_pop(); + +int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr__uninit) +{ + int err; + + err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); + if (err) + return err; + + bpf_dynptr_set_rdonly(ptr__uninit); + + return 0; +} + +BTF_SET8_START(bpf_kfunc_check_set_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_SET8_END(bpf_kfunc_check_set_skb) + +static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb, +}; + +static int __init bpf_kfunc_init(void) +{ + int 
ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); +} +late_initcall(bpf_kfunc_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 62ce1f5d1b1d..d0351d30e551 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5325,11 +5325,17 @@ union bpf_attr { * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. - * *flags* is currently unused. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr or if *flags* is not 0. + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description @@ -5337,6 +5343,9 @@ union bpf_attr { * * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. + * + * skb type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length -- cgit v1.2.3-58-ga151 From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:51 -0800 Subject: bpf: Add xdp dynptrs Add xdp dynptrs, which are dynptrs whose underlying pointer points to a xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of xdp->data and xdp->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For reads and writes on the dynptr, this includes reading/writing from/to and across fragments. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() should be used. For examples of how xdp dynptrs can be used, please see the attached selftests. 
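A rough BPF-C sketch of an xdp dynptr write, assuming the selftest-style extern declaration of bpf_dynptr_from_xdp(); the offset and length are illustrative:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags,
                                   struct bpf_dynptr *ptr__uninit) __ksym;

    SEC("xdp")
    int zero_src_mac(struct xdp_md *ctx)
    {
            struct bpf_dynptr ptr;
            __u8 zero[6] = {};

            if (bpf_dynptr_from_xdp(ctx, 0, &ptr))
                    return XDP_PASS;
            /* Overwrite the six source-MAC bytes at offset 6; flags must
             * be 0 for xdp dynptrs, and the write may span into fragments
             * of a multi-buffer frame.
             */
            bpf_dynptr_write(&ptr, 6, zero, sizeof(zero), 0);
            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";
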
Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++++++- include/linux/filter.h | 14 ++++++++++++++ include/uapi/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 9 ++++++++- kernel/bpf/verifier.c | 10 ++++++++++ net/core/filter.c | 37 +++++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 2 +- 7 files changed, 76 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7436d7615b0..23ec684e660d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -610,11 +610,15 @@ enum bpf_type_flag { /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to xdp_buff */ + DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ + | DYNPTR_TYPE_XDP) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1151,6 +1155,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, + /* Underlying data is a xdp_buff */ + BPF_DYNPTR_TYPE_XDP, }; int bpf_dynptr_check_size(u32 size); diff --git a/include/linux/filter.h b/include/linux/filter.h index de18e844d15e..3f6992261ec5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1546,6 +1546,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1558,6 +1560,18 @@ static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, { return -EOPNOTSUPP; } + +static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d0351d30e551..faa304c926cf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5344,7 +5344,7 @@ union bpf_attr { * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. * - * skb type dynptrs may not use bpf_dynptr_data. They should + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. 
* Return * Pointer to the underlying dynptr data, NULL if the dynptr is diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e8e2414d1587..114a875a05b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1530,6 +1530,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_XDP: + return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1576,6 +1578,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); + case BPF_DYNPTR_TYPE_XDP: + if (flags) + return -EINVAL; + return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1615,7 +1621,8 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: - /* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ + case BPF_DYNPTR_TYPE_XDP: + /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f5fce16543b..5e42946e53ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -752,6 +752,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_RINGBUF; case DYNPTR_TYPE_SKB: return BPF_DYNPTR_TYPE_SKB; + case DYNPTR_TYPE_XDP: + return BPF_DYNPTR_TYPE_XDP; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -6300,6 +6302,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn case DYNPTR_TYPE_SKB: err_extra = "skb "; break; + case DYNPTR_TYPE_XDP: + err_extra = "xdp "; + break; default: err_extra = ""; break; @@ -8943,6 +8948,7 @@ enum special_kfunc_type { KF_bpf_rbtree_add, KF_bpf_rbtree_first, KF_bpf_dynptr_from_skb, + KF_bpf_dynptr_from_xdp, }; BTF_SET_START(special_kfunc_set) @@ -8958,6 +8964,7 @@ BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8975,6 +8982,7 @@ BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -9731,6 +9739,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) dynptr_arg_type |= DYNPTR_TYPE_SKB; + else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) + dynptr_arg_type |= DYNPTR_TYPE_XDP; ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); if (ret < 0) diff --git a/net/core/filter.c b/net/core/filter.c index f3afa31a9b10..c692046fa7f6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3839,7 +3839,7 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; 
-BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } @@ -3999,6 +3999,11 @@ static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return ____bpf_xdp_load_bytes(xdp, offset, buf, len); +} + BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { @@ -4026,6 +4031,11 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return ____bpf_xdp_store_bytes(xdp, offset, buf, len); +} + static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -11648,6 +11658,19 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, return 0; } + +__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags, + struct bpf_dynptr_kern *ptr__uninit) +{ + if (flags) { + bpf_dynptr_set_null(ptr__uninit); + return -EINVAL; + } + + bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); + + return 0; +} __diag_pop(); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, @@ -11668,11 +11691,20 @@ BTF_SET8_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_SET8_END(bpf_kfunc_check_set_skb) +BTF_SET8_START(bpf_kfunc_check_set_xdp) +BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_SET8_END(bpf_kfunc_check_set_xdp) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_xdp, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -11685,6 +11717,7 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); } late_initcall(bpf_kfunc_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d0351d30e551..faa304c926cf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5344,7 +5344,7 @@ union bpf_attr { * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. * - * skb type dynptrs may not use bpf_dynptr_data. They should + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is -- cgit v1.2.3-58-ga151 From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:52 -0800 Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr. 
The user must pass in a buffer to store the contents of the data slice if a direct pointer to the data cannot be obtained. For skb and xdp type dynptrs, these two APIs are the only way to obtain a data slice. However, for other types of dynptrs, there is no difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data. For skb type dynptrs, the data is copied into the user provided buffer if any of the data is not in the linear portion of the skb. For xdp type dynptrs, the data is copied into the user provided buffer if the data is between xdp frags. If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then the skb will be uncloned (see bpf_unclone_prologue()). Please note that any bpf_dynptr_write() automatically invalidates any prior data slices of the skb dynptr. This is because the skb may be cloned or may need to pull its paged buffer into the head. As such, any bpf_dynptr_write() will automatically have its prior data slices invalidated, even if the write is to data in the skb head of an uncloned skb. Please note as well that any other helper calls that change the underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data slices of the skb dynptr as well, for the same reasons. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 14 +++++ include/uapi/linux/bpf.h | 5 ++ kernel/bpf/helpers.c | 138 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 127 +++++++++++++++++++++++++++++++++++-- net/core/filter.c | 6 +- tools/include/uapi/linux/bpf.h | 5 ++ 6 files changed, 288 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3f6992261ec5..efa5d4a1677e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1548,6 +1548,9 @@ int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1572,6 +1575,17 @@ static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, { return -EOPNOTSUPP; } + +static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + return NULL; +} + +static inline void *bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, + unsigned long len, bool flush) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index faa304c926cf..c9699304aed2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5329,6 +5329,11 @@ union bpf_attr { * *flags* must be 0 except for skb-type dynptrs. * * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * * * For *flags*, please see the flags accepted by * **bpf_skb_store_bytes**\ (). 
* Return diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 114a875a05b1..648b29e78b84 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2193,6 +2193,142 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) return p; } +/** + * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. + * + * If the intention is to write to the data slice, please use + * bpf_dynptr_slice_rdwr. + * + * The user must check that the returned pointer is not null before using it. + * + * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. + * + * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, + void *buffer, u32 buffer__szk) +{ + enum bpf_dynptr_type type; + u32 len = buffer__szk; + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + type = bpf_dynptr_get_type(ptr); + + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + case BPF_DYNPTR_TYPE_RINGBUF: + return ptr->data + ptr->offset + offset; + case BPF_DYNPTR_TYPE_SKB: + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer); + case BPF_DYNPTR_TYPE_XDP: + { + void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); + if (xdp_ptr) + return xdp_ptr; + + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false); + return buffer; + } + default: + WARN_ONCE(true, "unknown dynptr type %d\n", type); + return 0; + } +} + +/** + * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. + * + * The returned pointer is writable and may point to either directly the dynptr + * data at the requested offset or to the buffer if unable to obtain a direct + * data pointer to (example: the requested slice is to the paged area of an skb + * packet). In the case where the returned pointer is to the buffer, the user + * is responsible for persisting writes through calling bpf_dynptr_write(). This + * usually looks something like this pattern: + * + * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); + * if (!eth) + * return TC_ACT_SHOT; + * + * // mutate eth header // + * + * if (eth == buffer) + * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); + * + * Please note that, as in the example above, the user must check that the + * returned pointer is not null before using it. 
+ * + * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. + * + * @returns: NULL if the call failed (eg invalid dynptr), pointer to a + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, + void *buffer, u32 buffer__szk) +{ + if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) + return 0; + + /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. + * + * For skb-type dynptrs, it is safe to write into the returned pointer + * if the bpf program allows skb data writes. There are two possiblities + * that may occur when calling bpf_dynptr_slice_rdwr: + * + * 1) The requested slice is in the head of the skb. In this case, the + * returned pointer is directly to skb data, and if the skb is cloned, the + * verifier will have uncloned it (see bpf_unclone_prologue()) already. + * The pointer can be directly written into. + * + * 2) Some portion of the requested slice is in the paged buffer area. + * In this case, the requested data will be copied out into the buffer + * and the returned pointer will be a pointer to the buffer. The skb + * will not be pulled. To persist the write, the user will need to call + * bpf_dynptr_write(), which will pull the skb and commit the write. + * + * Similarly for xdp programs, if the requested slice is not across xdp + * fragments, then a direct pointer will be returned, otherwise the data + * will be copied out into the buffer and the user will need to call + * bpf_dynptr_write() to commit changes. 
+ */ + return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2262,6 +2398,8 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e42946e53ab..a856896e835a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -759,6 +759,22 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) } } +static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return DYNPTR_TYPE_LOCAL; + case BPF_DYNPTR_TYPE_RINGBUF: + return DYNPTR_TYPE_RINGBUF; + case BPF_DYNPTR_TYPE_SKB: + return DYNPTR_TYPE_SKB; + case BPF_DYNPTR_TYPE_XDP: + return DYNPTR_TYPE_XDP; + default: + return 0; + } +} + static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF; @@ -1681,6 +1697,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) reg->type == PTR_TO_PACKET_END; } +static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) +{ + return base_type(reg->type) == PTR_TO_MEM && + (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); +} + /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, enum bpf_reg_type which) @@ -7429,6 +7451,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. + * + * This also applies to dynptr slices belonging to skb and xdp dynptrs, + * since these slices point to packet data. 
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -7436,7 +7461,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) struct bpf_reg_state *reg; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg_is_pkt_pointer_any(reg)) + if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg)) mark_reg_invalid(env, reg); })); } @@ -8688,6 +8713,11 @@ struct bpf_kfunc_call_arg_meta { struct { struct btf_field *field; } arg_rbtree_root; + struct { + enum bpf_dynptr_type type; + u32 id; + } initialized_dynptr; + u64 mem_size; }; static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) @@ -8761,6 +8791,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return __kfunc_param_match_suffix(btf, arg, "__sz"); } +static bool is_kfunc_arg_const_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + return __kfunc_param_match_suffix(btf, arg, "__szk"); +} + static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return __kfunc_param_match_suffix(btf, arg, "__k"); @@ -8949,6 +8992,8 @@ enum special_kfunc_type { KF_bpf_rbtree_first, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_slice, + KF_bpf_dynptr_slice_rdwr, }; BTF_SET_START(special_kfunc_set) @@ -8965,6 +9010,8 @@ BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8983,6 +9030,8 @@ BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -9062,7 +9111,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) + + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) arg_mem_size = true; /* This is the catch all argument type of register types supported by @@ -9745,6 +9797,18 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); if (ret < 0) return ret; + + if (!(dynptr_arg_type & MEM_UNINIT)) { + int id = dynptr_id(env, reg); + + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } + meta->initialized_dynptr.id = id; + meta->initialized_dynptr.type = dynptr_get_type(env, reg); + } + break; } case KF_ARG_PTR_TO_LIST_HEAD: @@ -9840,14 +9904,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_MEM_SIZE: - ret = check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1); + { + struct bpf_reg_state *size_reg = ®s[regno + 1]; + const struct btf_param *size_arg = &args[i + 1]; + + ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair 
leads to invalid memory access\n", i, i + 1); return ret; } - /* Skip next '__sz' argument */ + + if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { + if (meta->arg_constant.found) { + verbose(env, "verifier internal error: only one constant argument permitted\n"); + return -EFAULT; + } + if (!tnum_is_const(size_reg->var_off)) { + verbose(env, "R%d must be a known constant\n", regno + 1); + return -EINVAL; + } + meta->arg_constant.found = true; + meta->arg_constant.value = size_reg->var_off.value; + } + + /* Skip next '__sz' or '__szk' argument */ i++; break; + } case KF_ARG_PTR_TO_CALLBACK: meta->subprogno = reg->subprogno; break; @@ -10082,6 +10165,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].btf_id = meta.arg_constant.value; + } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta.arg_constant.found) { + verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta.arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta.initialized_dynptr.id) { + verbose(env, "verifier internal error: no dynptr id\n"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ } else { verbose(env, "kernel function %s unhandled dynamic return type\n", meta.func_name); diff --git a/net/core/filter.c b/net/core/filter.c index c692046fa7f6..8f3124e06133 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3894,8 +3894,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; -static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, - void *buf, unsigned long len, bool flush) +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; @@ -3941,7 +3941,7 @@ static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, } } -static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); u32 size = xdp->data_end - xdp->data; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index faa304c926cf..c9699304aed2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5329,6 +5329,11 @@ union bpf_attr { * *flags* must be 0 except for skb-type dynptrs. * * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). 
This is + * because writing may pull the skb and change the + * underlying packet buffer. + * * * For *flags*, please see the flags accepted by * **bpf_skb_store_bytes**\ (). * Return -- cgit v1.2.3-58-ga151 From 65334e64a493c6a0976de7ad56bf8b7a9ff04b4a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:08 +0100 Subject: bpf: Support kptrs in percpu hashmap and percpu LRU hashmap Enable support for kptrs in percpu BPF hashmap and percpu BPF LRU hashmap by wiring up the freeing of these kptrs from percpu map elements. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 59 ++++++++++++++++++++++++++++++++-------------------- kernel/bpf/syscall.c | 2 ++ 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5dfcb5ad0d06..653aeb481c79 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + if (htab_is_percpu(htab)) { + void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); + int cpu; + + for_each_possible_cpu(cpu) { + bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } else { + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } cond_resched(); } } @@ -759,9 +770,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + if (htab_is_percpu(htab)) { + void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); + int cpu; - bpf_obj_free_fields(htab->map.record, map_value); + for_each_possible_cpu(cpu) + bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + } else { + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + bpf_obj_free_fields(htab->map.record, map_value); + } } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -858,9 +877,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { + check_and_free_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); - check_and_free_fields(htab, l); bpf_mem_cache_free(&htab->ma, l); } @@ -918,14 +937,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, { if (!onallcpus) { /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + copy_map_value(&htab->map, this_cpu_ptr(pptr), value); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); + copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); off += size; } } @@ -940,16 +958,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, * (onallcpus=false always when coming from bpf prog). 
*/ if (!onallcpus) { - u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; for_each_possible_cpu(cpu) { if (cpu == current_cpu) - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, - size); - else - memset(per_cpu_ptr(pptr, cpu), 0, size); + copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); + else /* Since elem is preallocated, we cannot touch special fields */ + zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { pcpu_copy_value(htab, pptr, value, onallcpus); @@ -1575,9 +1591,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, pptr = htab_elem_get_ptr(l, key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(pptr, cpu), - roundup_value_size); + copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, value + off); off += roundup_value_size; } } else { @@ -1772,8 +1787,8 @@ again_nocopy: pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(dst_val + off, - per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); off += size; } } else { @@ -2046,9 +2061,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - roundup_value_size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += roundup_value_size; } ctx.value = info->percpu_value_buf; @@ -2292,8 +2307,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } ret = 0; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3fcdc9836a6..da117a2a83b2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1059,7 +1059,9 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { ret = -EOPNOTSUPP; -- cgit v1.2.3-58-ga151 From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:09 +0100 Subject: bpf: Support kptrs in local storage maps Enable support for kptrs in local storage maps by wiring up the freeing of these kptrs from map value. Freeing of bpf_local_storage_map is only delayed in case there are special fields, therefore bpf_selem_free_* path can also only dereference smap safely in that case. This is recorded using a bool utilizing a hole in bpF_local_storage_elem. It could have been tagged in the pointer value smap using the lowest bit (since alignment > 1), but since there was already a hole I went with the simpler option. 
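As an aside, a rough sketch of what this enables on the program side (map and program
names are invented for the example, and it uses the __kptr_ref tag as it is spelled at
this point in the series; a later patch renames it to __kptr): a task local storage
value can now embed a referenced kptr and exchange it with bpf_kptr_xchg():

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct task_struct;
  extern struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
  extern void bpf_task_release(struct task_struct *p) __ksym;

  struct map_value {
          struct task_struct __kptr_ref *task;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
          __uint(map_flags, BPF_F_NO_PREALLOC);
          __type(key, int);
          __type(value, struct map_value);
  } task_store SEC(".maps");

  SEC("tp_btf/sys_enter")
  int stash_current(void *ctx)
  {
          struct task_struct *cur = bpf_get_current_task_btf();
          struct task_struct *acquired, *old;
          struct map_value *v;

          v = bpf_task_storage_get(&task_store, cur, 0,
                                   BPF_LOCAL_STORAGE_GET_F_CREATE);
          if (!v)
                  return 0;

          acquired = bpf_task_acquire(cur);
          /* the map element now owns the reference; it is dropped
           * when the element or the map itself is freed
           */
          old = bpf_kptr_xchg(&v->task, acquired);
          if (old)
                  bpf_task_release(old);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";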
Only the map structure freeing is delayed using RCU barriers, as the buckets aren't used when selem is being freed, so they can be freed once all readers of the bucket lists can no longer access it. Cc: Martin KaFai Lau Cc: KP Singh Cc: Paul E. McKenney Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 +++++ kernel/bpf/bpf_local_storage.c | 48 +++++++++++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 6 ++++- kernel/bpf/verifier.c | 12 ++++++---- 4 files changed, 63 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..0fe92986412b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,6 +74,12 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; + bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU + * callbacks? bpf_local_storage_map_free only + * executes rcu_barrier when there are special + * fields, this field remembers that to ensure we + * don't access already freed smap in sdata. + */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 58da17ae5124..2bdd722fe293 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { if (value) copy_map_value(&smap->map, SDATA(selem)->data, value); + /* No need to call check_and_init_map_value as memory is zero init */ return selem; } @@ -113,10 +114,25 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* The can_use_smap bool is set whenever we need to free additional + * fields in selem data before freeing selem. bpf_local_storage_map_free + * only executes rcu_barrier to wait for RCU callbacks when it has + * special fields, hence we can only conditionally dereference smap, as + * by this time the map might have already been freed without waiting + * for our call_rcu callback if it did not have any special fields. + */ + if (selem->can_use_smap) + bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data); + kfree(selem); +} + +static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu) +{ + /* Free directly if Tasks Trace RCU GP also implies RCU GP */ if (rcu_trace_implies_rcu_gp()) - kfree(selem); + bpf_selem_free_rcu(rcu); else - kfree_rcu(selem, rcu); + call_rcu(rcu, bpf_selem_free_rcu); } /* local_storage->lock must be held and selem->local_storage == local_storage. 
@@ -170,9 +186,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); if (use_trace_rcu) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu); else - kfree_rcu(selem, rcu); + call_rcu(&selem->rcu, bpf_selem_free_rcu); return free_local_storage; } @@ -240,6 +256,11 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + /* If our data will have special fields, smap will wait for us to use + * its record in bpf_selem_free_* RCU callbacks before freeing itself. + */ + selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) @@ -723,6 +744,25 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); + /* Only delay freeing of smap, buckets are not needed anymore */ kvfree(smap->buckets); + + /* When local storage has special fields, callbacks for + * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using + * the map BTF record, we need to execute an RCU barrier to wait for + * them as the record will be freed right after our map_free callback. + */ + if (!IS_ERR_OR_NULL(smap->map.record)) { + rcu_barrier_tasks_trace(); + /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() + * is true, because while call_rcu invocation is skipped in that + * case in bpf_selem_free_tasks_trace_rcu (and all local storage + * maps pass use_trace_rcu = true), there can be call_rcu + * callbacks based on use_trace_rcu = false in the earlier while + * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock + * called from owner's free path. 
+ */ + rcu_barrier(); + } bpf_map_area_free(smap); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index da117a2a83b2..eb50025b03c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1063,7 +1063,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a856896e835a..bf580f246a01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7222,22 +7222,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_MAP_TYPE_SK_STORAGE: if (func_id != BPF_FUNC_sk_storage_get && - func_id != BPF_FUNC_sk_storage_delete) + func_id != BPF_FUNC_sk_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_INODE_STORAGE: if (func_id != BPF_FUNC_inode_storage_get && - func_id != BPF_FUNC_inode_storage_delete) + func_id != BPF_FUNC_inode_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_TASK_STORAGE: if (func_id != BPF_FUNC_task_storage_get && - func_id != BPF_FUNC_task_storage_delete) + func_id != BPF_FUNC_task_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_CGRP_STORAGE: if (func_id != BPF_FUNC_cgrp_storage_get && - func_id != BPF_FUNC_cgrp_storage_delete) + func_id != BPF_FUNC_cgrp_storage_delete && + func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_BLOOM_FILTER: -- cgit v1.2.3-58-ga151 From 7ce60b110eece1d7b3d5c322fd11f6d41a29d17b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 1 Mar 2023 13:49:09 -0600 Subject: bpf: Fix doxygen comments for dynptr slice kfuncs In commit 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr"), the bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() kfuncs were added to BPF. These kfuncs included doxygen headers, but unfortunately those headers are not properly formatted according to [0], and causes the following warnings during the docs build: ./kernel/bpf/helpers.c:2225: warning: \ Excess function parameter 'returns' description in 'bpf_dynptr_slice' ./kernel/bpf/helpers.c:2303: warning: \ Excess function parameter 'returns' description in 'bpf_dynptr_slice_rdwr' ... This patch fixes those doxygen comments. [0]: https://docs.kernel.org/doc-guide/kernel-doc.html#function-documentation Fixes: 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr") Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230301194910.602738-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 648b29e78b84..58431a92bb65 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2194,7 +2194,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) } /** - * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data. + * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. 
+ * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. @@ -2209,13 +2214,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program. * - * @ptr: The dynptr whose data slice to retrieve - * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. - * - * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only + * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) @@ -2258,7 +2257,12 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset } /** - * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data. + * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + * requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. @@ -2287,13 +2291,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. * - * @ptr: The dynptr whose data slice to retrieve - * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. - * - * @returns: NULL if the call failed (eg invalid dynptr), pointer to a + * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) -- cgit v1.2.3-58-ga151 From c45eac537bd8b4977d335c123212140bc5257670 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 21:30:14 -0800 Subject: bpf: Fix bpf_dynptr_slice{_rdwr} to return NULL instead of 0 Change bpf_dynptr_slice and bpf_dynptr_slice_rdwr to return NULL instead of 0, in accordance with the codebase guidelines. 
Fixes: 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr") Reported-by: kernel test robot Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230302053014.1726219-1-joannelkoong@gmail.com --- kernel/bpf/helpers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 58431a92bb65..de9ef8476e29 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2227,11 +2227,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset int err; if (!ptr->data) - return 0; + return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) - return 0; + return NULL; type = bpf_dynptr_get_type(ptr); @@ -2252,7 +2252,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset } default: WARN_ONCE(true, "unknown dynptr type %d\n", type); - return 0; + return NULL; } } @@ -2300,7 +2300,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o void *buffer, u32 buffer__szk) { if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) - return 0; + return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * -- cgit v1.2.3-58-ga151 From c501bf55c88b834adefda870c7c092ec9052a437 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Mar 2023 09:42:59 -1000 Subject: bpf: Make bpf_get_current_[ancestor_]cgroup_id() available for all program types These helpers are safe to call from any context and there's no reason to restrict access to them. Remove them from bpf_trace and filter lists and add to bpf_base_func_proto() under perfmon_capable(). v2: After consulting with Andrii, relocated in bpf_base_func_proto() so that they require bpf_capable() but not perfomon_capable() as it doesn't read from or affect others on the system. 
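A rough sketch of the effect (section and variable names are made up for the example):
a sched_cls program, which previously failed to load when calling these helpers, now
resolves them through bpf_base_func_proto(), provided the loading task is bpf_capable()
and the kernel is built with CONFIG_CGROUPS:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  __u64 last_cgid;

  SEC("tc")
  int record_cgid(struct __sk_buff *skb)
  {
          /* safe from any context; "current" is whichever task is running */
          last_cgid = bpf_get_current_cgroup_id();
          return 0; /* TC_ACT_OK */
  }

  char LICENSE[] SEC("license") = "GPL";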
Signed-off-by: Tejun Heo Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/ZAD8QyoszMZiTzBY@slm.duckdns.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 4 ---- kernel/bpf/helpers.c | 4 ++++ kernel/trace/bpf_trace.c | 4 ---- net/core/filter.c | 6 ------ 4 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf2fdb33fb31..a4ae422b8f12 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2529,10 +2529,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index de9ef8476e29..6fc0d6c44e4c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1748,6 +1748,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; #endif default: break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e8da032bb6fc..bcf91bc7bf71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1453,10 +1453,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: diff --git a/net/core/filter.c b/net/core/filter.c index 8f3124e06133..a2dc44e70ea0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8165,12 +8165,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; -- cgit v1.2.3-58-ga151 From f71f8530494bb5ab43d3369ef0ce8373eb1ee077 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 2 Mar 2023 13:46:13 +0200 Subject: bpf: Add support for absolute value BPF timers Add a new flag BPF_F_TIMER_ABS that can be passed to bpf_timer_start() to start an absolute value timer instead of the default relative value. This makes the timer expire at an exact point in time, instead of a time with latencies induced by both the BPF and timer subsystems. 
Suggested-by: Artem Bityutskiy Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20230302114614.2985072-2-tero.kristo@linux.intel.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/helpers.c | 11 +++++++++-- tools/include/uapi/linux/bpf.h | 15 +++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c9699304aed2..976b194eb775 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4969,6 +4969,12 @@ union bpf_attr { * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier @@ -7097,4 +7103,13 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6fc0d6c44e4c..12f12e879bcf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1264,10 +1264,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla { struct bpf_hrtimer *t; int ret = 0; + enum hrtimer_mode mode; if (in_nmi()) return -EOPNOTSUPP; - if (flags) + if (flags > BPF_F_TIMER_ABS) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1275,7 +1276,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla ret = -EINVAL; goto out; } - hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); + + if (flags & BPF_F_TIMER_ABS) + mode = HRTIMER_MODE_ABS_SOFT; + else + mode = HRTIMER_MODE_REL_SOFT; + + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c9699304aed2..976b194eb775 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4969,6 +4969,12 @@ union bpf_attr { * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier @@ -7097,4 +7103,13 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3-58-ga151 From 03b77e17aeb22a5935ea20d585ca6a1f2947e62b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:41 -0800 Subject: bpf: Rename __kptr_ref -> __kptr and __kptr -> __kptr_untrusted. __kptr meant to store PTR_UNTRUSTED kernel pointers inside bpf maps. 
The concept felt useful, but didn't get much traction, since bpf_rdonly_cast() was added soon after and bpf programs received a simpler way to access PTR_UNTRUSTED kernel pointers without going through restrictive __kptr usage. Rename __kptr_ref -> __kptr and __kptr -> __kptr_untrusted to indicate its intended usage. The main goal of __kptr_untrusted was to read/write such pointers directly while bpf_kptr_xchg was a mechanism to access refcnted kernel pointers. The next patch will allow RCU protected __kptr access with direct read. At that point __kptr_untrusted will be deprecated. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-2-alexei.starovoitov@gmail.com --- Documentation/bpf/bpf_design_QA.rst | 4 ++-- Documentation/bpf/cpumasks.rst | 4 ++-- Documentation/bpf/kfuncs.rst | 2 +- kernel/bpf/btf.c | 4 ++-- tools/lib/bpf/bpf_helpers.h | 2 +- tools/testing/selftests/bpf/progs/cb_refs.c | 2 +- .../selftests/bpf/progs/cgrp_kfunc_common.h | 2 +- tools/testing/selftests/bpf/progs/cpumask_common.h | 2 +- tools/testing/selftests/bpf/progs/jit_probe_mem.c | 2 +- tools/testing/selftests/bpf/progs/lru_bug.c | 2 +- tools/testing/selftests/bpf/progs/map_kptr.c | 4 ++-- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 6 +++--- .../selftests/bpf/progs/task_kfunc_common.h | 2 +- tools/testing/selftests/bpf/test_verifier.c | 22 +++++++++++----------- 14 files changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index bfff0e7e37c2..38372a956d65 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -314,7 +314,7 @@ Q: What is the compatibility story for special BPF types in map values? Q: Users are allowed to embed bpf_spin_lock, bpf_timer fields in their BPF map values (when using BTF support for BPF maps). This allows to use helpers for such objects on these fields inside map values. Users are also allowed to embed -pointers to some kernel types (with __kptr and __kptr_ref BTF tags). Will the +pointers to some kernel types (with __kptr_untrusted and __kptr BTF tags). Will the kernel preserve backwards compatibility for these features? A: It depends. For bpf_spin_lock, bpf_timer: YES, for kptr and everything else: @@ -324,7 +324,7 @@ For struct types that have been added already, like bpf_spin_lock and bpf_timer, the kernel will preserve backwards compatibility, as they are part of UAPI. For kptrs, they are also part of UAPI, but only with respect to the kptr -mechanism. The types that you can use with a __kptr and __kptr_ref tagged +mechanism. The types that you can use with a __kptr_untrusted and __kptr tagged pointer in your struct are NOT part of the UAPI contract. The supported types can and will change across kernel releases. However, operations like accessing kptr fields and bpf_kptr_xchg() helper will continue to be supported across kernel diff --git a/Documentation/bpf/cpumasks.rst b/Documentation/bpf/cpumasks.rst index 24bef9cbbeee..75344cd230e5 100644 --- a/Documentation/bpf/cpumasks.rst +++ b/Documentation/bpf/cpumasks.rst @@ -51,7 +51,7 @@ For example: .. 
code-block:: c struct cpumask_map_value { - struct bpf_cpumask __kptr_ref * cpumask; + struct bpf_cpumask __kptr * cpumask; }; struct array_map { @@ -128,7 +128,7 @@ Here is an example of a ``struct bpf_cpumask *`` being retrieved from a map: /* struct containing the struct bpf_cpumask kptr which is stored in the map. */ struct cpumasks_kfunc_map_value { - struct bpf_cpumask __kptr_ref * bpf_cpumask; + struct bpf_cpumask __kptr * bpf_cpumask; }; /* The map containing struct cpumasks_kfunc_map_value entries. */ diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 9d85bbc3b771..b5d9b0d446bc 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -544,7 +544,7 @@ Here's an example of how it can be used: /* struct containing the struct task_struct kptr which is actually stored in the map. */ struct __cgroups_kfunc_map_value { - struct cgroup __kptr_ref * cgroup; + struct cgroup __kptr * cgroup; }; /* The map containing struct __cgroups_kfunc_map_value entries. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ef2d8969ed1f..c5e1d6955491 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3288,9 +3288,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; else return -EINVAL; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 5ec1871acb2f..7d12d3e620cc 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -174,8 +174,8 @@ enum libbpf_tristate { #define __kconfig __attribute__((section(".kconfig"))) #define __ksym __attribute__((section(".ksyms"))) +#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) #define __kptr __attribute__((btf_type_tag("kptr"))) -#define __kptr_ref __attribute__((btf_type_tag("kptr_ref"))) #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c index 7653df1bc787..ce96b33e38d6 100644 --- a/tools/testing/selftests/bpf/progs/cb_refs.c +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -4,7 +4,7 @@ #include struct map_value { - struct prog_test_ref_kfunc __kptr_ref *ptr; + struct prog_test_ref_kfunc __kptr *ptr; }; struct { diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 2f8de933b957..d0b7cd0d09d7 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -10,7 +10,7 @@ #include struct __cgrps_kfunc_map_value { - struct cgroup __kptr_ref * cgrp; + struct cgroup __kptr * cgrp; }; struct hash_map { diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index ad34f3b602be..65e5496ca1b2 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -10,7 +10,7 @@ int err; struct __cpumask_map_value { - struct bpf_cpumask __kptr_ref * cpumask; + struct bpf_cpumask __kptr * cpumask; }; struct array_map { diff --git 
a/tools/testing/selftests/bpf/progs/jit_probe_mem.c b/tools/testing/selftests/bpf/progs/jit_probe_mem.c index 2d2e61470794..13f00ca2ed0a 100644 --- a/tools/testing/selftests/bpf/progs/jit_probe_mem.c +++ b/tools/testing/selftests/bpf/progs/jit_probe_mem.c @@ -4,7 +4,7 @@ #include #include -static struct prog_test_ref_kfunc __kptr_ref *v; +static struct prog_test_ref_kfunc __kptr *v; long total_sum = -1; extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym; diff --git a/tools/testing/selftests/bpf/progs/lru_bug.c b/tools/testing/selftests/bpf/progs/lru_bug.c index 687081a724b3..ad73029cb1e3 100644 --- a/tools/testing/selftests/bpf/progs/lru_bug.c +++ b/tools/testing/selftests/bpf/progs/lru_bug.c @@ -4,7 +4,7 @@ #include struct map_value { - struct task_struct __kptr *ptr; + struct task_struct __kptr_untrusted *ptr; }; struct { diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index a24d17bc17eb..3fe7cde4cbfd 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -4,8 +4,8 @@ #include struct map_value { - struct prog_test_ref_kfunc __kptr *unref_ptr; - struct prog_test_ref_kfunc __kptr_ref *ref_ptr; + struct prog_test_ref_kfunc __kptr_untrusted *unref_ptr; + struct prog_test_ref_kfunc __kptr *ref_ptr; }; struct array_map { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 760e41e1a632..e19e2a5f38cf 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -7,9 +7,9 @@ struct map_value { char buf[8]; - struct prog_test_ref_kfunc __kptr *unref_ptr; - struct prog_test_ref_kfunc __kptr_ref *ref_ptr; - struct prog_test_member __kptr_ref *ref_memb_ptr; + struct prog_test_ref_kfunc __kptr_untrusted *unref_ptr; + struct prog_test_ref_kfunc __kptr *ref_ptr; + struct prog_test_member __kptr *ref_memb_ptr; }; struct array_map { diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index c0ffd171743e..4c2a4b0e3a25 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -10,7 +10,7 @@ #include struct __tasks_kfunc_map_value { - struct task_struct __kptr_ref * task; + struct task_struct __kptr * task; }; struct hash_map { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 8b9949bb833d..49a70d9beb0b 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -699,13 +699,13 @@ static int create_cgroup_storage(bool percpu) * struct bpf_timer t; * }; * struct btf_ptr { + * struct prog_test_ref_kfunc __kptr_untrusted *ptr; * struct prog_test_ref_kfunc __kptr *ptr; - * struct prog_test_ref_kfunc __kptr_ref *ptr; - * struct prog_test_member __kptr_ref *ptr; + * struct prog_test_member __kptr *ptr; * } */ static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l\0bpf_timer\0timer\0t" - "\0btf_ptr\0prog_test_ref_kfunc\0ptr\0kptr\0kptr_ref" + "\0btf_ptr\0prog_test_ref_kfunc\0ptr\0kptr\0kptr_untrusted" "\0prog_test_member"; static __u32 btf_raw_types[] = { /* int */ @@ -724,20 +724,20 @@ static __u32 btf_raw_types[] = { BTF_MEMBER_ENC(41, 4, 0), /* struct bpf_timer t; */ /* struct prog_test_ref_kfunc */ /* [6] */ BTF_STRUCT_ENC(51, 0, 0), - BTF_STRUCT_ENC(89, 0, 0), /* [7] */ + 
BTF_STRUCT_ENC(95, 0, 0), /* [7] */ + /* type tag "kptr_untrusted" */ + BTF_TYPE_TAG_ENC(80, 6), /* [8] */ /* type tag "kptr" */ - BTF_TYPE_TAG_ENC(75, 6), /* [8] */ - /* type tag "kptr_ref" */ - BTF_TYPE_TAG_ENC(80, 6), /* [9] */ - BTF_TYPE_TAG_ENC(80, 7), /* [10] */ + BTF_TYPE_TAG_ENC(75, 6), /* [9] */ + BTF_TYPE_TAG_ENC(75, 7), /* [10] */ BTF_PTR_ENC(8), /* [11] */ BTF_PTR_ENC(9), /* [12] */ BTF_PTR_ENC(10), /* [13] */ /* struct btf_ptr */ /* [14] */ BTF_STRUCT_ENC(43, 3, 24), - BTF_MEMBER_ENC(71, 11, 0), /* struct prog_test_ref_kfunc __kptr *ptr; */ - BTF_MEMBER_ENC(71, 12, 64), /* struct prog_test_ref_kfunc __kptr_ref *ptr; */ - BTF_MEMBER_ENC(71, 13, 128), /* struct prog_test_member __kptr_ref *ptr; */ + BTF_MEMBER_ENC(71, 11, 0), /* struct prog_test_ref_kfunc __kptr_untrusted *ptr; */ + BTF_MEMBER_ENC(71, 12, 64), /* struct prog_test_ref_kfunc __kptr *ptr; */ + BTF_MEMBER_ENC(71, 13, 128), /* struct prog_test_member __kptr *ptr; */ }; static char bpf_vlog[UINT_MAX >> 8]; -- cgit v1.2.3-58-ga151 From 8d093b4e95a2a16a2cfcd36869b348a17112fabe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:42 -0800 Subject: bpf: Mark cgroups and dfl_cgrp fields as trusted. bpf programs sometimes do: bpf_cgrp_storage_get(&map, task->cgroups->dfl_cgrp, ...); It is safe to do, because the cgroups->dfl_cgrp pointer is set during init and never changes. The task->cgroups is also never NULL. It is also set during init and will change when the task switches cgroups. For any trusted task pointer, dereference of cgroups and dfl_cgrp should yield trusted pointers. The verifier wasn't aware of this. Hence in gcc compiled kernels task->cgroups dereference was producing PTR_TO_BTF_ID without modifiers while in clang compiled kernels the verifier recognizes the __rcu tag in the cgroups field and produces PTR_TO_BTF_ID | MEM_RCU | MAYBE_NULL. Tag cgroups and dfl_cgrp as trusted to equalize clang and gcc behavior. When GCC supports btf_type_tag such tagging will be done directly in the type. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: David Vernet Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20230303041446.3630-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf580f246a01..b834f3d2d81a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5046,6 +5046,11 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) BTF_TYPE_SAFE_NESTED(struct task_struct) { const cpumask_t *cpus_ptr; + struct css_set __rcu *cgroups; +}; + +BTF_TYPE_SAFE_NESTED(struct css_set) { + struct cgroup *dfl_cgrp; }; static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, @@ -5057,6 +5062,7 @@ static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, return false; BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct css_set)); return btf_nested_type_is_trusted(&env->log, reg, off); } -- cgit v1.2.3-58-ga151 From 20c09d92faeefb8536f705d3a4629e0dc314c8a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:43 -0800 Subject: bpf: Introduce kptr_rcu. The lifetime of certain kernel structures like 'struct cgroup' is protected by RCU. Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps. The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
Dereference of other kptrs returns PTR_UNTRUSTED. For example: struct map_value { struct cgroup __kptr *cgrp; }; SEC("tp_btf/cgroup_mkdir") int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path) { struct cgroup *cg, *cg2; cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0 bpf_kptr_xchg(&v->cgrp, cg); cg2 = v->cgrp; // This is the new feature introduced by this patch. // cg2 is PTR_MAYBE_NULL | MEM_RCU. // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero if (cg2) bpf_cgroup_ancestor(cg2, level); // safe to do. } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com --- Documentation/bpf/kfuncs.rst | 12 +++-- include/linux/btf.h | 2 +- kernel/bpf/helpers.c | 6 ++- kernel/bpf/verifier.c | 55 ++++++++++++++++++---- net/bpf/test_run.c | 3 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 2 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 4 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- tools/testing/selftests/bpf/verifier/map_kptr.c | 2 +- 9 files changed, 65 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index b5d9b0d446bc..69eccf6f98ef 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -249,11 +249,13 @@ added later. 2.4.8 KF_RCU flag ----------------- -The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument. -When used together with KF_ACQUIRE, it indicates the kfunc should have a -single argument which must be a trusted argument or a MEM_RCU pointer. -The argument may have reference count of 0 and the kfunc must take this -into consideration. +The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with +KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees +that the objects are valid and there is no use-after-free. The pointers are not +NULL, but the object's refcount could have reached zero. The kfuncs need to +consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE +pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely +also be KF_RET_NULL. .. _KF_deprecated_flag: diff --git a/include/linux/btf.h b/include/linux/btf.h index 49e0fe6d8274..556b3e2e7471 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,7 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ -#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ +#define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ /* * Tag marking a kernel function as a kfunc.
This is meant to minimize the diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 12f12e879bcf..637ac4e92e75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2163,8 +2163,10 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) if (level > cgrp->level || level < 0) return NULL; + /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; - cgroup_get(ancestor); + if (!cgroup_tryget(ancestor)) + return NULL; return ancestor; } @@ -2382,7 +2384,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b834f3d2d81a..a095055d7ef4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4218,7 +4218,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ @@ -4285,6 +4285,34 @@ bad_type: return -EINVAL; } +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() + * can dereference RCU protected pointers and result is PTR_TRUSTED. + */ +static bool in_rcu_cs(struct bpf_verifier_env *env) +{ + return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; +} + +/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ +BTF_SET_START(rcu_protected_types) +BTF_ID(struct, prog_test_ref_kfunc) +BTF_ID(struct, cgroup) +BTF_SET_END(rcu_protected_types) + +static bool rcu_protected_object(const struct btf *btf, u32 btf_id) +{ + if (!btf_is_kernel(btf)) + return false; + return btf_id_set_contains(&rcu_protected_types, btf_id); +} + +static bool rcu_safe_kptr(const struct btf_field *field) +{ + const struct btf_field_kptr *kptr = &field->kptr; + + return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -4319,7 +4347,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, + rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? + PTR_MAYBE_NULL | MEM_RCU : + PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -5163,10 +5194,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * An RCU-protected pointer can also be deemed trusted if we are in an * RCU read region. This case is handled below. 
*/ - if (nested_ptr_is_trusted(env, reg, off)) + if (nested_ptr_is_trusted(env, reg, off)) { flag |= PTR_TRUSTED; - else + /* + * task->cgroups is trusted. It provides a stronger guarantee + * than __rcu tag on 'cgroups' field in 'struct task_struct'. + * Clear MEM_RCU in such case. + */ + flag &= ~MEM_RCU; + } else { flag &= ~PTR_TRUSTED; + } if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by @@ -5175,11 +5213,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since * it could be null in some cases. */ - if (!env->cur_state->active_rcu_lock || - !(is_trusted_reg(reg) || is_rcu_reg(reg))) - flag &= ~MEM_RCU; - else + if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg))) flag |= PTR_MAYBE_NULL; + else + flag &= ~MEM_RCU; } else if (reg->type & MEM_RCU) { /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. @@ -9676,7 +9713,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if (is_kfunc_trusted_args(meta) && + if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type))) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6f3d654b3339..6a8b33a103a4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) { + /* p != NULL, but p->cnt could be 0 */ } __bpf_kfunc void bpf_kfunc_call_test_destructive(void) @@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_SET8_END(test_sk_check_kfunc_ids) diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 4ad7fe24966d..b42291ed9586 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -205,7 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket") +__failure __msg("expects refcounted") int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value *v; diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index e19e2a5f38cf..08f9ec18c345 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") +__failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_") int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) { struct map_value *v; @@ -316,7 +316,7 @@ 
int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("R2 type=untrusted_ptr_ expected=ptr_") +__failure __msg("R2 must be referenced") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 289ed202ec66..9a326a800e5c 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -243,7 +243,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "R1 must be referenced", + .errstr = "R1 must be", }, { "calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c index 6914904344c0..d775ccb01989 100644 --- a/tools/testing/selftests/bpf/verifier/map_kptr.c +++ b/tools/testing/selftests/bpf/verifier/map_kptr.c @@ -336,7 +336,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_kptr = { 1 }, .result = REJECT, - .errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_", + .errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_", }, { "map_kptr: ref: reject off != 0", -- cgit v1.2.3-58-ga151 From 6fcd486b3a0a628c41f12b3a7329a18a2c74b351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:46 -0800 Subject: bpf: Refactor RCU enforcement in the verifier. bpf_rcu_read_lock/unlock() are only available in clang compiled kernels. Lack of such a key mechanism makes it impossible for sleepable bpf programs to use RCU pointers. Allow bpf_rcu_read_lock/unlock() in GCC compiled kernels (though GCC doesn't support btf_type_tag yet) and allowlist certain field dereferences in important data structures like task_struct, cgroup, socket that are used by sleepable programs either as RCU pointer or full trusted pointer (which is valid outside of RCU CS). Use BTF_TYPE_SAFE_RCU and BTF_TYPE_SAFE_TRUSTED macros for such tagging. They will be removed once GCC supports btf_type_tag. With that, refactor check_ptr_to_btf_access(). Make it strict in enforcing PTR_TRUSTED and PTR_UNTRUSTED while deprecating old PTR_TO_BTF_ID without modifier flags. There is a chance that this strict enforcement might break existing programs (especially on GCC compiled kernels), but this cleanup has to start sooner rather than later. Note PTR_TO_CTX access still yields old deprecated PTR_TO_BTF_ID. Once it's converted to strict PTR_TRUSTED or PTR_UNTRUSTED the kfuncs and helpers will be able to default to KF_TRUSTED_ARGS. KF_RCU will remain as a weaker version of KF_TRUSTED_ARGS where obj refcnt could be 0. Adjust rcu_read_lock selftest to run on gcc and clang compiled kernels.
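As an illustrative sketch (not part of this patch; it mirrors the rcu_read_lock selftest updated below, and assumes map_a is a BPF_MAP_TYPE_TASK_STORAGE map, SYS_PREFIX is the usual selftest arch prefix, and the program name is made up), a sleepable program can now walk an RCU-allowlisted field and pass the result to a helper while the RCU critical section is held:

SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
int rcu_walk_real_parent(void *ctx)
{
	struct task_struct *task, *real_parent;

	task = bpf_get_current_task_btf();
	bpf_rcu_read_lock();
	/* real_parent is allowlisted via BTF_TYPE_SAFE_RCU, so it is MEM_RCU here */
	real_parent = task->real_parent;
	/* valid to pass to helpers and KF_RCU kfuncs inside the RCU CS */
	(void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
	bpf_rcu_read_unlock();
	return 0;
}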
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-7-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 1 - kernel/bpf/btf.c | 16 +- kernel/bpf/cpumask.c | 40 ++--- kernel/bpf/verifier.c | 178 ++++++++++++++------- .../selftests/bpf/prog_tests/cgrp_local_storage.c | 14 +- .../selftests/bpf/prog_tests/rcu_read_lock.c | 16 +- .../selftests/bpf/progs/cgrp_ls_sleepable.c | 4 +- .../testing/selftests/bpf/progs/cpumask_failure.c | 2 +- .../selftests/bpf/progs/nested_trust_failure.c | 2 +- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 12 files changed, 173 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23ec684e660d..d3456804f7aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2279,7 +2279,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off); + int off, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b26ff2a8f63b..18538bad2b8c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -537,7 +537,6 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; - bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c5e1d6955491..a8cb09e5973b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6163,6 +6163,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; + *flag = 0; again: tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { @@ -6329,6 +6330,15 @@ error: * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { + if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION && + btf_type_vlen(mtype) != 1) + /* + * walking unions yields untrusted pointers + * with exception of __bpf_md_ptr and other + * unions with a single member + */ + *flag |= PTR_UNTRUSTED; + /* our field must be inside that union or struct */ t = mtype; @@ -6373,7 +6383,7 @@ error: stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; - *flag = tmp_flag; + *flag |= tmp_flag; return WALK_PTR; } } @@ -8357,7 +8367,7 @@ out: bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off) + int off, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; @@ -8374,7 +8384,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); - ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); + ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); if (ret < 0) return false; diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 2b3fbbfebdc5..b6587ec40f1b 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -427,26 +427,26 @@ BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | 
KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a095055d7ef4..c2adf3c24c64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5073,29 +5073,76 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } -#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields) +#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) -BTF_TYPE_SAFE_NESTED(struct task_struct) { +/* + * Allow list few fields as RCU trusted or full trusted. + * This logic doesn't allow mix tagging and will be removed once GCC supports + * btf_type_tag. 
+ */ + +/* RCU trusted: these fields are trusted in RCU CS and never NULL */ +BTF_TYPE_SAFE_RCU(struct task_struct) { const cpumask_t *cpus_ptr; struct css_set __rcu *cgroups; + struct task_struct __rcu *real_parent; + struct task_struct *group_leader; }; -BTF_TYPE_SAFE_NESTED(struct css_set) { +BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; -static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - int off) +/* full trusted: these fields are trusted even outside of RCU CS and never NULL */ +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { + __bpf_md_ptr(struct seq_file *, seq); +}; + +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { + struct file *file; +}; + +BTF_TYPE_SAFE_TRUSTED(struct file) { + struct inode *f_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct dentry) { + /* no negative dentry-s in places where bpf can see it */ + struct inode *d_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct socket) { + struct sock *sk; +}; + +static bool type_is_rcu(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) { - /* If its parent is not trusted, it can't regain its trusted status. */ - if (!is_trusted_reg(reg)) - return false; + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct css_set)); + return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu"); +} - return btf_nested_type_is_trusted(&env->log, reg, off); +static bool type_is_trusted(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5181,49 +5228,58 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* If this is an untrusted pointer, all pointers formed by walking it - * also inherit the untrusted flag. - */ - if (type_flag(reg->type) & PTR_UNTRUSTED) - flag |= PTR_UNTRUSTED; + if (ret != PTR_TO_BTF_ID) { + /* just mark; */ - /* By default any pointer obtained from walking a trusted pointer is no - * longer trusted, unless the field being accessed has explicitly been - * marked as inheriting its parent's state of trust. - * - * An RCU-protected pointer can also be deemed trusted if we are in an - * RCU read region. This case is handled below. - */ - if (nested_ptr_is_trusted(env, reg, off)) { - flag |= PTR_TRUSTED; - /* - * task->cgroups is trusted. It provides a stronger guarantee - * than __rcu tag on 'cgroups' field in 'struct task_struct'. - * Clear MEM_RCU in such case. + } else if (type_flag(reg->type) & PTR_UNTRUSTED) { + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. 
+ */ + flag = PTR_UNTRUSTED; + + } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + /* By default any pointer obtained from walking a trusted pointer is no + * longer trusted, unless the field being accessed has explicitly been + * marked as inheriting its parent's state of trust (either full or RCU). + * For example: + * 'cgroups' pointer is untrusted if task->cgroups dereference + * happened in a sleepable program outside of bpf_rcu_read_lock() + * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU). + * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED. + * + * A regular RCU-protected pointer with __rcu tag can also be deemed + * trusted if we are in an RCU CS. Such pointer can be NULL. */ - flag &= ~MEM_RCU; + if (type_is_trusted(env, reg, off)) { + flag |= PTR_TRUSTED; + } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { + if (type_is_rcu(env, reg, off)) { + /* ignore __rcu tag and mark it MEM_RCU */ + flag |= MEM_RCU; + } else if (flag & MEM_RCU) { + /* __rcu tagged pointers can be NULL */ + flag |= PTR_MAYBE_NULL; + } else if (flag & (MEM_PERCPU | MEM_USER)) { + /* keep as-is */ + } else { + /* walking unknown pointers yields untrusted pointer */ + flag = PTR_UNTRUSTED; + } + } else { + /* + * If not in RCU CS or MEM_RCU pointer can be NULL then + * aggressively mark as untrusted otherwise such + * pointers will be plain PTR_TO_BTF_ID without flags + * and will be allowed to be passed into helpers for + * compat reasons. + */ + flag = PTR_UNTRUSTED; + } } else { + /* Old compat. Deprecated */ flag &= ~PTR_TRUSTED; } - if (flag & MEM_RCU) { - /* Mark value register as MEM_RCU only if it is protected by - * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU - * itself can already indicate trustedness inside the rcu - * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since - * it could be null in some cases. - */ - if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg))) - flag |= PTR_MAYBE_NULL; - else - flag &= ~MEM_RCU; - } else if (reg->type & MEM_RCU) { - /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged - * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. - */ - flag |= PTR_UNTRUSTED; - } - if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -10049,10 +10105,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); - if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { - verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); - return -EACCES; - } if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; @@ -14911,8 +14963,22 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. */ - verbose(env, "same insn cannot be used with different pointers\n"); - return -EINVAL; + if (base_type(src_reg_type) == PTR_TO_BTF_ID && + base_type(*prev_src_type) == PTR_TO_BTF_ID) { + /* + * Have to support a use case when one path through + * the program yields TRUSTED pointer while another + * is UNTRUSTED. Fallback to UNTRUSTED to generate + * BPF_PROBE_MEM. 
+ */ + *prev_src_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + } else { + verbose(env, + "The same insn cannot be used with different pointers: %s", + reg_type_str(env, src_reg_type)); + verbose(env, " != %s\n", reg_type_str(env, *prev_src_type)); + return -EINVAL; + } } } else if (class == BPF_STX) { @@ -17984,8 +18050,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); - env->rcu_tag_supported = btf_vmlinux && - btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 2cc759956e3b..63e776f4176e 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -193,7 +193,7 @@ out: cgrp_ls_sleepable__destroy(skel); } -static void test_no_rcu_lock(__u64 cgroup_id) +static void test_yes_rcu_lock(__u64 cgroup_id) { struct cgrp_ls_sleepable *skel; int err; @@ -204,7 +204,7 @@ static void test_no_rcu_lock(__u64 cgroup_id) skel->bss->target_pid = syscall(SYS_gettid); - bpf_program__set_autoload(skel->progs.no_rcu_lock, true); + bpf_program__set_autoload(skel->progs.yes_rcu_lock, true); err = cgrp_ls_sleepable__load(skel); if (!ASSERT_OK(err, "skel_load")) goto out; @@ -220,7 +220,7 @@ out: cgrp_ls_sleepable__destroy(skel); } -static void test_rcu_lock(void) +static void test_no_rcu_lock(void) { struct cgrp_ls_sleepable *skel; int err; @@ -229,7 +229,7 @@ static void test_rcu_lock(void) if (!ASSERT_OK_PTR(skel, "skel_open")) return; - bpf_program__set_autoload(skel->progs.yes_rcu_lock, true); + bpf_program__set_autoload(skel->progs.no_rcu_lock, true); err = cgrp_ls_sleepable__load(skel); ASSERT_ERR(err, "skel_load"); @@ -256,10 +256,10 @@ void test_cgrp_local_storage(void) test_negative(); if (test__start_subtest("cgroup_iter_sleepable")) test_cgroup_iter_sleepable(cgroup_fd, cgroup_id); + if (test__start_subtest("yes_rcu_lock")) + test_yes_rcu_lock(cgroup_id); if (test__start_subtest("no_rcu_lock")) - test_no_rcu_lock(cgroup_id); - if (test__start_subtest("rcu_lock")) - test_rcu_lock(); + test_no_rcu_lock(); close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index 447d8560ecb6..3f1f58d3a729 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -25,10 +25,10 @@ static void test_success(void) bpf_program__set_autoload(skel->progs.get_cgroup_id, true); bpf_program__set_autoload(skel->progs.task_succ, true); - bpf_program__set_autoload(skel->progs.no_lock, true); bpf_program__set_autoload(skel->progs.two_regions, true); bpf_program__set_autoload(skel->progs.non_sleepable_1, true); bpf_program__set_autoload(skel->progs.non_sleepable_2, true); + bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true); err = rcu_read_lock__load(skel); if (!ASSERT_OK(err, "skel_load")) goto out; @@ -69,6 +69,7 @@ out: static const char * const inproper_region_tests[] = { "miss_lock", + "no_lock", "miss_unlock", "non_sleepable_rcu_mismatch", "inproper_sleepable_helper", @@ -99,7 +100,6 @@ out: } static const char * const rcuptr_misuse_tests[] = { - "task_untrusted_non_rcuptr", 
"task_untrusted_rcuptr", "cross_rcu_region", }; @@ -128,17 +128,8 @@ out: void test_rcu_read_lock(void) { - struct btf *vmlinux_btf; int cgroup_fd; - vmlinux_btf = btf__load_vmlinux_btf(); - if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF")) - return; - if (btf__find_by_name_kind(vmlinux_btf, "rcu", BTF_KIND_TYPE_TAG) < 0) { - test__skip(); - goto out; - } - cgroup_fd = test__join_cgroup("/rcu_read_lock"); if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /rcu_read_lock")) goto out; @@ -153,6 +144,5 @@ void test_rcu_read_lock(void) if (test__start_subtest("negative_tests_rcuptr_misuse")) test_rcuptr_misuse(); close(cgroup_fd); -out: - btf__free(vmlinux_btf); +out:; } diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c index 2d11ed528b6f..7615dc23d301 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c @@ -49,7 +49,7 @@ int no_rcu_lock(void *ctx) if (task->pid != target_pid) return 0; - /* ptr_to_btf_id semantics. should work. */ + /* task->cgroups is untrusted in sleepable prog outside of RCU CS */ cgrp = task->cgroups->dfl_cgrp; ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -71,7 +71,7 @@ int yes_rcu_lock(void *ctx) bpf_rcu_read_lock(); cgrp = task->cgroups->dfl_cgrp; - /* cgrp is untrusted and cannot pass to bpf_cgrp_storage_get() helper. */ + /* cgrp is trusted under RCU CS */ ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (ptr) cgroup_id = cgrp->kn->id; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 33e8e86dd090..c16f7563b84e 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -44,7 +44,7 @@ int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flag } SEC("tp_btf/task_newtask") -__failure __msg("bpf_cpumask_acquire args#0 expected pointer to STRUCT bpf_cpumask") +__failure __msg("must be referenced") int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *cpumask; diff --git a/tools/testing/selftests/bpf/progs/nested_trust_failure.c b/tools/testing/selftests/bpf/progs/nested_trust_failure.c index 14aff7676436..0d1aa6bbace4 100644 --- a/tools/testing/selftests/bpf/progs/nested_trust_failure.c +++ b/tools/testing/selftests/bpf/progs/nested_trust_failure.c @@ -17,7 +17,7 @@ char _license[] SEC("license") = "GPL"; */ SEC("tp_btf/task_newtask") -__failure __msg("R2 must be referenced or trusted") +__failure __msg("R2 must be") int BPF_PROG(test_invalid_nested_user_cpus, struct task_struct *task, u64 clone_flags) { bpf_cpumask_test_cpu(0, task->user_cpus_ptr); diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 5cecbdbbb16e..7250bb76d18a 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -81,7 +81,7 @@ int no_lock(void *ctx) { struct task_struct *task, *real_parent; - /* no bpf_rcu_read_lock(), old code still works */ + /* old style ptr_to_btf_id is not allowed in sleepable */ task = bpf_get_current_task_btf(); real_parent = task->real_parent; (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); @@ -286,13 +286,13 @@ out: } SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -int task_untrusted_non_rcuptr(void *ctx) +int 
task_trusted_non_rcuptr(void *ctx) { struct task_struct *task, *group_leader; task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); - /* the pointer group_leader marked as untrusted */ + /* the pointer group_leader is explicitly marked as trusted */ group_leader = task->real_parent->group_leader; (void)bpf_task_storage_get(&map_a, group_leader, 0, 0); bpf_rcu_read_unlock(); diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 9a326a800e5c..5702fc9761ef 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -181,7 +181,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "negative offset ptr_ ptr R1 off=-4 disallowed", + .errstr = "ptr R1 off=-4 disallowed", }, { "calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset", -- cgit v1.2.3-58-ga151 From e768e3c5aab44ee63f58649d4c8cbbb3270e5c06 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 3 Mar 2023 15:15:42 +0100 Subject: bpf: Use separate RCU callbacks for freeing selem Martin suggested that instead of using a byte in the hole (which he has a use for in his future patch) in bpf_local_storage_elem, we can dispatch a different call_rcu callback based on whether we need to free special fields in bpf_local_storage_elem data. The free path, described in commit 9db44fdd8105 ("bpf: Support kptrs in local storage maps"), only waits for call_rcu callbacks when there are special (kptrs, etc.) fields in the map value, hence it is necessary that we only access smap in this case. Therefore, dispatch different RCU callbacks based on the BPF map has a valid btf_record, which dereference and use smap's btf_record only when it is valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230303141542.300068-1-memxor@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf_local_storage.h | 6 --- kernel/bpf/bpf_local_storage.c | 79 ++++++++++++++++++++++++--------------- 2 files changed, 49 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 0fe92986412b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,12 +74,6 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; - bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU - * callbacks? bpf_local_storage_map_free only - * executes rcu_barrier when there are special - * fields, this field remembers that to ensure we - * don't access already freed smap in sdata. - */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2bdd722fe293..3d320393a12c 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -109,30 +109,36 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } -static void bpf_selem_free_rcu(struct rcu_head *rcu) +static void bpf_selem_free_fields_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - /* The can_use_smap bool is set whenever we need to free additional - * fields in selem data before freeing selem. 
bpf_local_storage_map_free - * only executes rcu_barrier to wait for RCU callbacks when it has - * special fields, hence we can only conditionally dereference smap, as - * by this time the map might have already been freed without waiting - * for our call_rcu callback if it did not have any special fields. - */ - if (selem->can_use_smap) - bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data); + /* protected by the rcu_barrier*() */ + smap = rcu_dereference_protected(SDATA(selem)->smap, true); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); kfree(selem); } -static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu) +static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu) { /* Free directly if Tasks Trace RCU GP also implies RCU GP */ if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_rcu(rcu); + bpf_selem_free_fields_rcu(rcu); + else + call_rcu(rcu, bpf_selem_free_fields_rcu); +} + +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(selem); else - call_rcu(rcu, bpf_selem_free_rcu); + kfree_rcu(selem, rcu); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -145,6 +151,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; + struct btf_record *rec; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); @@ -185,10 +192,26 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - if (use_trace_rcu) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu); - else - call_rcu(&selem->rcu, bpf_selem_free_rcu); + /* A different RCU callback is chosen whenever we need to free + * additional fields in selem data before freeing selem. + * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU + * callbacks when it has special fields, hence we can only conditionally + * dereference smap, as by this time the map might have already been + * freed without waiting for our call_rcu callback if it did not have + * any special fields. + */ + rec = smap->map.record; + if (use_trace_rcu) { + if (!IS_ERR_OR_NULL(rec)) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu); + else + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); + } else { + if (!IS_ERR_OR_NULL(rec)) + call_rcu(&selem->rcu, bpf_selem_free_fields_rcu); + else + kfree_rcu(selem, rcu); + } return free_local_storage; } @@ -256,11 +279,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); - - /* If our data will have special fields, smap will wait for us to use - * its record in bpf_selem_free_* RCU callbacks before freeing itself. 
- */ - selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) @@ -748,19 +766,20 @@ void bpf_local_storage_map_free(struct bpf_map *map, kvfree(smap->buckets); /* When local storage has special fields, callbacks for - * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using - * the map BTF record, we need to execute an RCU barrier to wait for - * them as the record will be freed right after our map_free callback. + * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will + * keep using the map BTF record, we need to execute an RCU barrier to + * wait for them as the record will be freed right after our map_free + * callback. */ if (!IS_ERR_OR_NULL(smap->map.record)) { rcu_barrier_tasks_trace(); /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() * is true, because while call_rcu invocation is skipped in that - * case in bpf_selem_free_tasks_trace_rcu (and all local storage - * maps pass use_trace_rcu = true), there can be call_rcu - * callbacks based on use_trace_rcu = false in the earlier while - * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock - * called from owner's free path. + * case in bpf_selem_free_fields_trace_rcu (and all local + * storage maps pass use_trace_rcu = true), there can be + * call_rcu callbacks based on use_trace_rcu = false in the + * while ((selem = ...)) loop above or when owner's free path + * calls bpf_local_storage_unlink_nolock. */ rcu_barrier(); } -- cgit v1.2.3-58-ga151 From 0d80a619c113d0e216dbffa56b2d5ccc079ee520 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 4 Mar 2023 03:12:45 +0200 Subject: bpf: allow ctx writes using BPF_ST_MEM instruction Lift the verifier restriction on using BPF_ST_MEM instructions to write to context data structures. This requires the following changes: - verifier.c:do_check() for BPF_ST is updated to: - no longer forbid writes to registers of type PTR_TO_CTX; - track dst_reg type in the env->insn_aux_data[...].ptr_type field (same way it is done for BPF_STX and BPF_LDX instructions). - verifier.c:convert_ctx_access() and various callbacks invoked by it are updated to handle the BPF_ST instruction alongside BPF_STX.
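For illustration only (not taken from this patch), the difference at the instruction level looks roughly like the following sketch, assuming a TC program where __sk_buff.mark is a writable ctx field and using the usual raw insn macros:

/* before: the immediate had to go through a scratch register */
BPF_MOV64_IMM(BPF_REG_2, 42),
BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_2, offsetof(struct __sk_buff, mark)),

/* after this patch: BPF_ST_MEM may target PTR_TO_CTX directly */
BPF_ST_MEM(BPF_W, BPF_REG_1, offsetof(struct __sk_buff, mark), 42),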
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230304011247.566040-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 49 ++++++++----- kernel/bpf/verifier.c | 110 ++++++++++++++--------------- net/core/filter.c | 79 ++++++++++++--------- tools/testing/selftests/bpf/verifier/ctx.c | 11 --- 4 files changed, 126 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a4ae422b8f12..53edb8ad2471 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2223,10 +2223,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); - *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, + *insn++ = BPF_RAW_INSN( + BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), + treg, si->src_reg, bpf_ctx_narrow_access_offset( - 0, sizeof(u32), sizeof(loff_t))); + 0, sizeof(u32), sizeof(loff_t)), + si->imm); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -2376,10 +2378,17 @@ static bool cg_sockopt_is_valid_access(int off, int size, return true; } -#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ - T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sockopt_kern, F)) +#define CG_SOCKOPT_READ_FIELD(F) \ + BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +#define CG_SOCKOPT_WRITE_FIELD(F) \ + BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \ + BPF_MEM | BPF_CLASS(si->code)), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F), \ + si->imm) static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -2391,25 +2400,25 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct bpf_sockopt, sk): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + *insn++ = CG_SOCKOPT_READ_FIELD(sk); break; case offsetof(struct bpf_sockopt, level): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + *insn++ = CG_SOCKOPT_WRITE_FIELD(level); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + *insn++ = CG_SOCKOPT_READ_FIELD(level); break; case offsetof(struct bpf_sockopt, optname): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + *insn++ = CG_SOCKOPT_WRITE_FIELD(optname); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + *insn++ = CG_SOCKOPT_READ_FIELD(optname); break; case offsetof(struct bpf_sockopt, optlen): if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen); else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + *insn++ = CG_SOCKOPT_READ_FIELD(optlen); break; case offsetof(struct bpf_sockopt, retval): BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); @@ -2429,9 +2438,11 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), treg, treg, offsetof(struct task_struct, bpf_ctx)); - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), - treg, si->src_reg, - offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | + BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + 
offsetof(struct bpf_cg_run_ctx, retval), + si->imm); *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sockopt_kern, tmp_reg)); } else { @@ -2447,10 +2458,10 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, } break; case offsetof(struct bpf_sockopt, optval): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + *insn++ = CG_SOCKOPT_READ_FIELD(optval); break; case offsetof(struct bpf_sockopt, optval_end): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + *insn++ = CG_SOCKOPT_READ_FIELD(optval_end); break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2adf3c24c64..4c5d2b5e25c8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14813,6 +14813,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_missmatch) +{ + enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + + if (*prev_type == NOT_INIT) { + /* Saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * save type to validate intersecting paths + */ + *prev_type = type; + } else if (reg_type_mismatch(type, *prev_type)) { + /* Abuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + if (allow_trust_missmatch && + base_type(type) == PTR_TO_BTF_ID && + base_type(*prev_type) == PTR_TO_BTF_ID) { + /* + * Have to support a use case when one path through + * the program yields TRUSTED pointer while another + * is UNTRUSTED. Fallback to UNTRUSTED to generate + * BPF_PROBE_MEM. + */ + *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + } else { + verbose(env, "same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } + + return 0; +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -14922,7 +14960,7 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type *prev_src_type, src_reg_type; + enum bpf_reg_type src_reg_type; /* check for reserved fields is already done */ @@ -14946,43 +14984,11 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; - - if (*prev_src_type == NOT_INIT) { - /* saw a valid insn - * dst_reg = *(u32 *)(src_reg + off) - * save type to validate intersecting paths - */ - *prev_src_type = src_reg_type; - - } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { - /* ABuser program is trying to use the same insn - * dst_reg = *(u32*) (src_reg + off) - * with different pointer types: - * src_reg == ctx in one branch and - * src_reg == stack|map in some other branch. - * Reject it. - */ - if (base_type(src_reg_type) == PTR_TO_BTF_ID && - base_type(*prev_src_type) == PTR_TO_BTF_ID) { - /* - * Have to support a use case when one path through - * the program yields TRUSTED pointer while another - * is UNTRUSTED. Fallback to UNTRUSTED to generate - * BPF_PROBE_MEM. 
- */ - *prev_src_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - } else { - verbose(env, - "The same insn cannot be used with different pointers: %s", - reg_type_str(env, src_reg_type)); - verbose(env, " != %s\n", reg_type_str(env, *prev_src_type)); - return -EINVAL; - } - } - + err = save_aux_ptr_type(env, src_reg_type, true); + if (err) + return err; } else if (class == BPF_STX) { - enum bpf_reg_type *prev_dst_type, dst_reg_type; + enum bpf_reg_type dst_reg_type; if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); @@ -15015,16 +15021,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; - - if (*prev_dst_type == NOT_INIT) { - *prev_dst_type = dst_reg_type; - } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { - verbose(env, "same insn cannot be used with different pointers\n"); - return -EINVAL; - } - + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; } else if (class == BPF_ST) { + enum bpf_reg_type dst_reg_type; + if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose(env, "BPF_ST uses reserved fields\n"); @@ -15035,12 +15037,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d %s is not allowed\n", - insn->dst_reg, - reg_type_str(env, reg_state(env, insn->dst_reg)->type)); - return -EACCES; - } + dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, @@ -15049,6 +15046,9 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); @@ -16157,14 +16157,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; - bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - ctx_access = true; } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || insn->code == (BPF_STX | BPF_MEM | BPF_H) || insn->code == (BPF_STX | BPF_MEM | BPF_W) || @@ -16174,7 +16172,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - ctx_access = BPF_CLASS(insn->code) == BPF_STX; } else { continue; } @@ -16197,9 +16194,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } - if (!ctx_access) - continue; - switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) diff --git a/net/core/filter.c b/net/core/filter.c index a2dc44e70ea0..50f649f1b4a9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9279,11 +9279,15 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, #endif /* : skb->tstamp = tstamp */ - *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, - offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, + skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } +#define BPF_EMIT_STORE(size, si, off) \ + BPF_RAW_INSN(BPF_CLASS((si)->code) 
| (size) | BPF_MEM, \ + (si)->dst_reg, (si)->src_reg, (off), (si)->imm) + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -9313,9 +9317,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, priority, 4, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, @@ -9346,9 +9350,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, mark, 4, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, @@ -9367,11 +9371,16 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - queue_mapping, - 2, target_size)); + u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + + if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { + *insn++ = BPF_JMP_A(0); /* noop */ + break; + } + + if (BPF_CLASS(si->code) == BPF_STX) + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, @@ -9407,8 +9416,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); @@ -9423,8 +9431,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); @@ -9457,9 +9464,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, tc_index, 2, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_H, si, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, @@ -9660,8 +9667,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_bound_dev_if)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, 
si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); @@ -9671,8 +9678,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_mark)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); @@ -9682,8 +9689,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_priority)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); @@ -9948,10 +9955,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ + *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ + tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ - + OFF); \ + + OFF, \ + si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) @@ -10186,9 +10195,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ - reg, si->src_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ + BPF_MEM | BPF_CLASS(si->code), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD), \ + si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ @@ -10220,8 +10231,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - off); + *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); @@ -10578,8 +10588,7 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index c8eaf0536c24..2fd31612c0b8 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -1,14 +1,3 @@ -{ - "context stores via ST", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0), - BPF_EXIT_INSN(), - }, - .errstr = "BPF_ST stores into R1 ctx is not allowed", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, -}, { "context stores via BPF_ATOMIC", .insns = { -- cgit v1.2.3-58-ga151 From d54e0f6c1adffbf72f2cf4aebe6122899c3b851c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:49:59 -0800 Subject: 
bpf: improve stack slot state printing Improve stack slot state printing to provide more useful and relevant information, especially for dynptrs. While previously we'd see something like: 8: (85) call bpf_ringbuf_reserve_dynptr#198 ; R0_w=scalar() fp-8_w=dddddddd fp-16_w=dddddddd refs=2 Now we'll see way more useful: 8: (85) call bpf_ringbuf_reserve_dynptr#198 ; R0_w=scalar() fp-16_w=dynptr_ringbuf(ref_id=2) refs=2 I experimented with printing the range of slots taken by dynptr, something like: fp-16..8_w=dynptr_ringbuf(ref_id=2) But it felt very awkward and pretty useless. So we print the lowest address (most negative offset) only. The general structure of this code is now also set up for easier extension and will accommodate ITER slots naturally. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 75 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4c5d2b5e25c8..4f71b6b61ef4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -705,6 +705,25 @@ static const char *kernel_type_name(const struct btf* btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static const char *dynptr_type_str(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return "local"; + case BPF_DYNPTR_TYPE_RINGBUF: + return "ringbuf"; + case BPF_DYNPTR_TYPE_SKB: + return "skb"; + case BPF_DYNPTR_TYPE_XDP: + return "xdp"; + case BPF_DYNPTR_TYPE_INVALID: + return ""; + default: + WARN_ONCE(1, "unknown dynptr type %d\n", type); + return ""; + } +} + static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; @@ -1176,26 +1195,49 @@ static void print_verifier_state(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) { if (state->stack[i].slot_type[j] != STACK_INVALID) valid = true; - types_buf[j] = slot_type_char[ - state->stack[i].slot_type[j]]; + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; } types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; if (!print_all && !stack_slot_scratched(env, i)) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); - if (is_spilled_reg(&state->stack[i])) { + switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: reg = &state->stack[i].spilled_ptr; t = reg->type; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); verbose(env, "=%s", t == SCALAR_VALUE ? 
"" : reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) verbose(env, "%lld", reg->var_off.value + reg->off); - } else { + break; + case STACK_DYNPTR: + i += BPF_DYNPTR_NR_SLOTS - 1; + reg = &state->stack[i].spilled_ptr; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + if (reg->ref_obj_id) + verbose(env, "(ref_id=%d)", reg->ref_obj_id); + break; + case STACK_MISC: + case STACK_ZERO: + default: + reg = &state->stack[i].spilled_ptr; + + for (j = 0; j < BPF_REG_SIZE; j++) + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; + types_buf[BPF_REG_SIZE] = 0; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); verbose(env, "=%s", types_buf); + break; } } if (state->acquired_refs && state->refs[0].id) { @@ -6411,28 +6453,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - case DYNPTR_TYPE_SKB: - err_extra = "skb "; - break; - case DYNPTR_TYPE_XDP: - err_extra = "xdp "; - break; - default: - err_extra = ""; - break; - } verbose(env, "Expected a dynptr of type %s as arg #%d\n", - err_extra, regno); + dynptr_type_str(arg_to_dynptr_type(arg_type)), regno); return -EINVAL; } -- cgit v1.2.3-58-ga151 From 567da5d253cd6b41c6d015adac1af653725bef9d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:00 -0800 Subject: bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER} Teach regsafe() logic to handle PTR_TO_MEM, PTR_TO_BUF, and PTR_TO_TP_BUFFER similarly to PTR_TO_MAP_{KEY,VALUE}. That is, instead of exact match for var_off and range, use tnum_in() and range_within() checks, allowing more general verified state to subsume more specific current state. This allows to match wider range of valid and safe states, speeding up verification and detecting wider range of equivalent states for upcoming open-coded iteration looping logic. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f71b6b61ef4..b071b922848b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14209,13 +14209,17 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_BUF: + case PTR_TO_TP_BUFFER: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
*/ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && - check_ids(rold->id, rcur->id, idmap); + check_ids(rold->id, rcur->id, idmap) && + check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr -- cgit v1.2.3-58-ga151 From 98ddcf389d1bb7a407d49c23dfe6443680812f24 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:02 -0800 Subject: bpf: honor env->test_state_freq flag in is_state_visited() env->test_state_freq flag can be set by user by passing BPF_F_TEST_STATE_FREQ program flag. This is used in a bunch of selftests to have predictable state checkpoints at every jump and so on. Currently, bounded loop handling heuristic ignores this flag if number of processed jumps and/or number of processed instructions is below some thresholds, which throws off that reliable state checkpointing. Honor this flag in all circumstances by disabling heuristic if env->test_state_freq is set. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b071b922848b..fa93ba10762d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14651,7 +14651,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ - if (env->jmps_processed - env->prev_jmps_processed < 20 && + if (!env->test_state_freq && + env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; goto miss; -- cgit v1.2.3-58-ga151 From 653ae3a874aca6764a4c1f5a8bf1b072ade0d6f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:04 -0800 Subject: bpf: clean up visit_insn()'s instruction processing Instead of referencing processed instruction repeatedly as insns[t] throughout entire visit_insn() function, take a local insn pointer and work with it in a cleaner way. It makes enhancing this function further a bit easier as well. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa93ba10762d..6188d5604ed4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13484,44 +13484,43 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, */ static int visit_insn(int t, struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; int ret; - if (bpf_pseudo_func(insns + t)) + if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. 
*/ - if (BPF_CLASS(insns[t].code) != BPF_JMP && - BPF_CLASS(insns[t].code) != BPF_JMP32) + if (BPF_CLASS(insn->code) != BPF_JMP && + BPF_CLASS(insn->code) != BPF_JMP32) return push_insn(t, t + 1, FALLTHROUGH, env, false); - switch (BPF_OP(insns[t].code)) { + switch (BPF_OP(insn->code)) { case BPF_EXIT: return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insn->imm == BPF_FUNC_timer_set_callback) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new * async state will be pushed for further exploration. */ mark_prune_point(env, t); - return visit_func_call_insn(t, insns, env, - insns[t].src_reg == BPF_PSEUDO_CALL); + return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insns[t].code) != BPF_K) + if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, true); if (ret) return ret; - mark_prune_point(env, t + insns[t].off + 1); - mark_jmp_point(env, t + insns[t].off + 1); + mark_prune_point(env, t + insn->off + 1); + mark_jmp_point(env, t + insn->off + 1); return ret; @@ -13533,7 +13532,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (ret) return ret; - return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env, true); } } -- cgit v1.2.3-58-ga151 From c1ee85a9806a720aa054f68fe7f9c79418f36c2b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:05 -0800 Subject: bpf: fix visit_insn()'s detection of BPF_FUNC_timer_set_callback helper It's not correct to assume that any BPF_CALL instruction is a helper call. Fix visit_insn()'s detection of bpf_timer_set_callback() helper by also checking insn->code == 0. For kfuncs insn->code would be set to BPF_PSEUDO_KFUNC_CALL, and for subprog calls it will be BPF_PSEUDO_CALL. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6188d5604ed4..34fd808e9692 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13500,7 +13500,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insn->imm == BPF_FUNC_timer_set_callback) + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new -- cgit v1.2.3-58-ga151 From 553a64a85c5d1dac277325a0f51a31c056593048 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:06 -0800 Subject: bpf: ensure that r0 is marked scratched after any function call r0 is important (unless called function is void-returning, but that's taken care of by print_verifier_state() anyways) in verifier logs. Currently for helpers we seem to print it in verifier log, but for kfuncs we don't. Instead of figuring out where in the maze of code we accidentally set r0 as scratched for helpers and why we don't do that for kfuncs, just enforce that after any function call r0 is marked as scratched. 
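As an illustration (not part of the patch), a minimal userspace sketch of the scratched-register bookkeeping this change plugs into, assuming only that it is a per-environment bitmask as in the mark_reg_scratched() hunk quoted earlier; every other name in the sketch is made up.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MAX_BPF_REG 11	/* r0..r10 */

struct fake_env {
	uint32_t scratched_regs;	/* bit n set => rN was touched by this insn */
};

static void mark_reg_scratched(struct fake_env *env, uint32_t regno)
{
	env->scratched_regs |= 1U << regno;
}

static bool reg_scratched(const struct fake_env *env, uint32_t regno)
{
	return env->scratched_regs & (1U << regno);
}

int main(void)
{
	struct fake_env env = { 0 };
	uint32_t r;

	/* A helper or kfunc call clobbers r0; record that unconditionally. */
	mark_reg_scratched(&env, 0);

	for (r = 0; r < MAX_BPF_REG; r++)
		printf("r%u %s\n", r, reg_scratched(&env, r) ? "scratched" : "untouched");
	return 0;
}

In do_check() the actual change simply marks BPF_REG_0 this way right after the call has been checked, so r0 always shows up in the verifier log no matter which path set it.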
Also, perhaps, we should reconsider "scratched" terminology, as it's mightily confusing. "Touched" would seem more appropriate. But I left that for follow ups for now. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34fd808e9692..db0c37e9bc3a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15110,6 +15110,8 @@ static int do_check(struct bpf_verifier_env *env) err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; + + mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || -- cgit v1.2.3-58-ga151 From d0e1ac227945c6af616c003365c6feb986dc0839 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:07 -0800 Subject: bpf: move kfunc_call_arg_meta higher in the file Move struct bpf_kfunc_call_arg_meta higher in the file and put it next to struct bpf_call_arg_meta, so it can be used from more functions. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 70 +++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db0c37e9bc3a..ed9a96ad7b5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -270,6 +270,41 @@ struct bpf_call_arg_meta { struct btf_field *kptr_field; }; +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u32 ret_btf_id; + u64 r0_size; + u32 subprogno; + struct { + u64 value; + bool found; + } arg_constant; + struct { + struct btf *btf; + u32 btf_id; + } arg_obj_drop; + struct { + struct btf_field *field; + } arg_list_head; + struct { + struct btf_field *field; + } arg_rbtree_root; + struct { + enum bpf_dynptr_type type; + u32 id; + } initialized_dynptr; + u64 mem_size; +}; + struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); @@ -8811,41 +8846,6 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -struct bpf_kfunc_call_arg_meta { - /* In parameters */ - struct btf *btf; - u32 func_id; - u32 kfunc_flags; - const struct btf_type *func_proto; - const char *func_name; - /* Out parameters */ - u32 ref_obj_id; - u8 release_regno; - bool r0_rdonly; - u32 ret_btf_id; - u64 r0_size; - u32 subprogno; - struct { - u64 value; - bool found; - } arg_constant; - struct { - struct btf *btf; - u32 btf_id; - } arg_obj_drop; - struct { - struct btf_field *field; - } arg_list_head; - struct { - struct btf_field *field; - } arg_rbtree_root; - struct { - enum bpf_dynptr_type type; - u32 id; - } initialized_dynptr; - u64 mem_size; -}; - static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; -- cgit v1.2.3-58-ga151 From d5271c5b1950b887def1663b75e2d710cc16535f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:08 -0800 Subject: bpf: mark PTR_TO_MEM as non-null register type PTR_TO_MEM register without PTR_MAYBE_NULL is indeed non-null. 
This is important for BPF verifier to be able to prune guaranteed not to be taken branches. This is always the case with open-coded iterators. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ed9a96ad7b5b..d95975cbcc19 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -487,7 +487,8 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_MEM; } static bool type_is_ptr_alloc_obj(u32 type) -- cgit v1.2.3-58-ga151 From a461f5adf17756e99ee0903d1a40961b0342ebb3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:09 -0800 Subject: bpf: generalize dynptr_get_spi to be usable for iters Generalize the logic of fetching special stack slot object state using spi (stack slot index). This will be used by STACK_ITER logic next. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d95975cbcc19..c4151c9efe24 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,32 +710,38 @@ static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_sl return spi - nr_slots + 1 >= 0 && spi < allocated_slots; } -static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + const char *obj_kind, int nr_slots) { int off, spi; if (!tnum_is_const(reg->var_off)) { - verbose(env, "dynptr has to be at a constant offset\n"); + verbose(env, "%s has to be at a constant offset\n", obj_kind); return -EINVAL; } off = reg->off + reg->var_off.value; if (off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } spi = __get_spi(off); - if (spi < 1) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + if (spi + 1 < nr_slots) { + verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) + if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots)) return -ERANGE; return spi; } +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); +} + static const char *kernel_type_name(const struct btf* btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); -- cgit v1.2.3-58-ga151 From f4b4eee6169bb33c5157ebe07e53d7e4be7631c0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:10 -0800 Subject: bpf: add support for fixed-size memory pointer returns for kfuncs Support direct fixed-size (and for now, read-only) memory access when kfunc's return type is a pointer to non-struct type. Calculate type size and let BPF program access that many bytes directly. This is crucial for numbers iterator. 
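Stepping back to the stack_slot_obj_get_spi() helper generalized in the previous patch: as an illustration (not part of either patch), a rough userspace sketch of its slot-index arithmetic. The get_spi() formula below is an assumption modeled on the surrounding verifier code; the bounds condition mirrors the is_spi_bounds_valid() context visible in the hunk.

#include <stdio.h>

#define BPF_REG_SIZE 8
#define BPF_DYNPTR_NR_SLOTS 2

/* Assumed definition: stack slot index counted downward from the frame pointer. */
static int get_spi(int off)
{
	return (-off - 1) / BPF_REG_SIZE;
}

/* Simplified shape of stack_slot_obj_get_spi(). */
static int stack_slot_obj_get_spi(int off, const char *obj_kind, int nr_slots,
				  int allocated_slots)
{
	int spi;

	if (off % BPF_REG_SIZE) {
		printf("cannot pass in %s at an offset=%d\n", obj_kind, off);
		return -1;
	}
	spi = get_spi(off);
	if (spi + 1 < nr_slots) {
		printf("cannot pass in %s at an offset=%d\n", obj_kind, off);
		return -1;
	}
	/* is_spi_bounds_valid(): the object must fit inside the allocated stack. */
	if (!(spi - nr_slots + 1 >= 0 && spi < allocated_slots))
		return -2;
	return spi;
}

int main(void)
{
	/* fp-16 with 8 allocated slots: a valid two-slot dynptr, spi == 1. */
	printf("fp-16 -> spi %d\n", stack_slot_obj_get_spi(-16, "dynptr", BPF_DYNPTR_NR_SLOTS, 8));
	/* fp-8 is too shallow to hold a two-slot object. */
	printf("fp-8  -> spi %d\n", stack_slot_obj_get_spi(-8, "dynptr", BPF_DYNPTR_NR_SLOTS, 8));
	return 0;
}

Run as-is it reports spi 1 for a dynptr at fp-16 and rejects fp-8, which is the behavior the new "spi + 1 < nr_slots" test preserves from the old "spi < 1" check in the two-slot dynptr case.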
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-13-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4151c9efe24..b2116ca78d9a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10336,6 +10336,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } } else if (!__btf_type_is_struct(ptr_type)) { + if (!meta.r0_size) { + __u32 sz; + + if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) { + meta.r0_size = sz; + meta.r0_rdonly = true; + } + } if (!meta.r0_size) { ptr_type_name = btf_name_by_offset(desc_btf, ptr_type->name_off); -- cgit v1.2.3-58-ga151 From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:45:58 +0000 Subject: bpf: add new map ops ->map_mem_usage Add a new map ops ->map_mem_usage to print the memory usage of a bpf map. This is a preparation for the followup change. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3456804f7aa..9059520bbb5e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,6 +161,8 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); + u64 (*map_mem_usage)(const struct bpf_map *map); + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ede5f987484f..7c96e68859ec 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -771,16 +771,15 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS -/* Provides an approximation of the map's memory footprint. - * Used only to provide a backward compatibility and display - * a reasonable "memlock" info. - */ -static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +/* Show the memory usage of a bpf map */ +static u64 bpf_map_memory_usage(const struct bpf_map *map) { unsigned long size; - size = round_up(map->key_size + bpf_map_value_size(map), 8); + if (map->ops->map_mem_usage) + return map->ops->map_mem_usage(map); + size = round_up(map->key_size + bpf_map_value_size(map), 8); return round_up(map->max_entries * size, PAGE_SIZE); } @@ -803,7 +802,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" - "memlock:\t%lu\n" + "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -812,7 +811,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->max_entries, map->map_flags, (unsigned long long)map->map_extra, - bpf_map_memory_footprint(map), + bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { -- cgit v1.2.3-58-ga151 From 41d5941e7f9a27f3d28220f08b843e8d87df1be4 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:45:59 +0000 Subject: bpf: lpm_trie memory usage trie_mem_usage() is introduced to calculate the lpm_trie memory usage. Some small memory allocations are ignored. The inner node is also ignored. 
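As an illustration (not part of the patch), a hedged userspace sketch contrasting the old generic estimate (the fallback kept in bpf_map_memory_usage() above) with the shape of trie_mem_usage(). The node header size, the 4 bytes of prefix data assumed for an 8-byte key, and a fully populated trie are stand-in assumptions, so the second number will not reproduce the 2291536B shown below; the first one does match the old 1048576B figure.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define SZ_TRIE_NODE	40	/* stand-in for sizeof(struct lpm_trie_node) */

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

/* Old fallback estimate: round_up(key + value, 8) * max_entries, page aligned. */
static uint64_t legacy_footprint(uint32_t key, uint32_t value, uint64_t max_entries)
{
	return round_up_u64(round_up_u64(key + value, 8) * max_entries, PAGE_SIZE);
}

/* trie_mem_usage() shape: (node header + prefix data + value) * live entries. */
static uint64_t trie_usage(uint32_t data_size, uint32_t value, uint64_t n_entries)
{
	return (uint64_t)(SZ_TRIE_NODE + data_size + value) * n_entries;
}

int main(void)
{
	/* key 8B / value 8B / max_entries 65536, as in map 10 below. */
	printf("legacy estimate: %llu\n", (unsigned long long)legacy_footprint(8, 8, 65536));
	/* The live entry count is a runtime property; 65536 here just means "full". */
	printf("trie_mem_usage:  %llu\n", (unsigned long long)trie_usage(4, 8, 65536));
	return 0;
}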
The result as follows, - before 10: lpm_trie flags 0x1 key 8B value 8B max_entries 65536 memlock 1048576B - after 10: lpm_trie flags 0x1 key 8B value 8B max_entries 65536 memlock 2291536B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-3-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d833496e9e42..dc23f2ac9cde 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -720,6 +720,16 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } +static u64 trie_mem_usage(const struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + u64 elem_size; + + elem_size = sizeof(struct lpm_trie_node) + trie->data_size + + trie->map.value_size; + return elem_size * READ_ONCE(trie->n_entries); +} + BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie) const struct bpf_map_ops trie_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -733,5 +743,6 @@ const struct bpf_map_ops trie_map_ops = { .map_update_batch = generic_map_update_batch, .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, + .map_mem_usage = trie_mem_usage, .map_btf_id = &trie_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 304849a27b341cf22d5c15096144cc8c1b69e0c0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:00 +0000 Subject: bpf: hashtab memory usage htab_map_mem_usage() is introduced to calculate hashmap memory usage. In this helper, some small memory allocations are ignore, as their size is quite small compared with the total size. The inner_map_meta in hash_of_map is also ignored. The result for hashtab as follows, - before this change 1: hash name count_map flags 0x1 <<<< no prealloc, fully set key 16B value 24B max_entries 1048576 memlock 41943040B 2: hash name count_map flags 0x1 <<<< no prealloc, none set key 16B value 24B max_entries 1048576 memlock 41943040B 3: hash name count_map flags 0x0 <<<< prealloc key 16B value 24B max_entries 1048576 memlock 41943040B The memlock is always a fixed size whatever it is preallocated or not, and whatever the count of allocated elements is. - after this change 1: hash name count_map flags 0x1 <<<< non prealloc, fully set key 16B value 24B max_entries 1048576 memlock 117441536B 2: hash name count_map flags 0x1 <<<< non prealloc, non set key 16B value 24B max_entries 1048576 memlock 16778240B 3: hash name count_map flags 0x0 <<<< prealloc key 16B value 24B max_entries 1048576 memlock 109056000B The memlock now is hashtab actually allocated. 
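As an illustration (not part of the patch), a hedged userspace sketch of where a number like the ~109 MB preallocated figure above comes from. Only the plain preallocated (non-LRU, non-percpu) branch is shown, and every constant below (struct sizes, CPU count, lock count) is a stand-in for one imagined 64-bit build, so the result lands near, not exactly on, 109056000B.

#include <stdio.h>
#include <stdint.h>

#define SZ_BPF_HTAB		704	/* stand-in for sizeof(struct bpf_htab) */
#define SZ_BUCKET		16	/* stand-in: hlist head + raw spinlock */
#define SZ_HTAB_ELEM		48	/* stand-in: header in front of key/value */
#define NR_CPUS			32	/* stand-in for num_possible_cpus() */
#define HASHTAB_MAP_LOCK_COUNT	8	/* stand-in for the hashtab.c constant */

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

static uint64_t roundup_pow_of_two_u64(uint64_t x)
{
	uint64_t r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

/* Preallocated, non-LRU, non-percpu branch of htab_map_mem_usage(). */
static uint64_t htab_prealloc_usage(uint32_t key, uint32_t value, uint64_t max_entries)
{
	uint64_t n_buckets = roundup_pow_of_two_u64(max_entries);
	uint64_t elem_size = SZ_HTAB_ELEM + round_up_u64(key, 8) + round_up_u64(value, 8);
	uint64_t num_entries = max_entries + NR_CPUS;	/* extra per-cpu elements */
	uint64_t usage = SZ_BPF_HTAB;

	usage += SZ_BUCKET * n_buckets;
	usage += sizeof(int) * NR_CPUS * HASHTAB_MAP_LOCK_COUNT;
	usage += elem_size * num_entries;
	usage += sizeof(void *) * NR_CPUS;	/* extra_elems pointer array */
	return usage;
}

int main(void)
{
	/* key 16B / value 24B / max_entries 1048576, as in map 3 above. */
	printf("prealloc htab usage ~= %llu bytes\n",
	       (unsigned long long)htab_prealloc_usage(16, 24, 1048576));
	return 0;
}

The old 41943040B figure, by contrast, is just round_up(16 + 24, 8) * 1048576 from the generic fallback, independent of preallocation and of how many elements actually exist.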
The result for percpu hash map as follows, - before this change 4: percpu_hash name count_map flags 0x0 <<<< prealloc key 16B value 24B max_entries 1048576 memlock 822083584B 5: percpu_hash name count_map flags 0x1 <<<< no prealloc key 16B value 24B max_entries 1048576 memlock 822083584B - after this change 4: percpu_hash name count_map flags 0x0 key 16B value 24B max_entries 1048576 memlock 897582080B 5: percpu_hash name count_map flags 0x1 key 16B value 24B max_entries 1048576 memlock 922748736B At worst, the difference can be 10x, for example, - before this change 6: hash name count_map flags 0x0 key 4B value 4B max_entries 1048576 memlock 8388608B - after this change 6: hash name count_map flags 0x0 key 4B value 4B max_entries 1048576 memlock 83889408B Signed-off-by: Yafang Shao Acked-by: Hou Tao Link: https://lore.kernel.org/r/20230305124615.12358-4-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 653aeb481c79..0df4b0c10f59 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2190,6 +2190,44 @@ out: return num_elems; } +static u64 htab_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 value_size = round_up(htab->map.value_size, 8); + bool prealloc = htab_is_prealloc(htab); + bool percpu = htab_is_percpu(htab); + bool lru = htab_is_lru(htab); + u64 num_entries; + u64 usage = sizeof(struct bpf_htab); + + usage += sizeof(struct bucket) * htab->n_buckets; + usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; + if (prealloc) { + num_entries = map->max_entries; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + usage += htab->elem_size * num_entries; + + if (percpu) + usage += value_size * num_possible_cpus() * num_entries; + else if (!lru) + usage += sizeof(struct htab_elem *) * num_possible_cpus(); + } else { +#define LLIST_NODE_SZ sizeof(struct llist_node) + + num_entries = htab->use_percpu_counter ? 
+ percpu_counter_sum(&htab->pcount) : + atomic_read(&htab->count); + usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; + if (percpu) { + usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; + usage += value_size * num_possible_cpus() * num_entries; + } + } + return usage; +} + BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -2206,6 +2244,7 @@ const struct bpf_map_ops htab_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, @@ -2227,6 +2266,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, @@ -2378,6 +2418,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, @@ -2397,6 +2438,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, @@ -2534,6 +2576,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 1746d0555a8795c7ecfeab6a2c3e3c824cf57535 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:01 +0000 Subject: bpf: arraymap memory usage Introduce array_map_mem_usage() to calculate arraymap memory usage. In this helper, some small memory allocations are ignored, like the allocation of struct bpf_array_aux in prog_array. The inner_map_meta in array_of_map is also ignored. 
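As an illustration (not part of the patch), a hedged userspace sketch of the three branches array_map_mem_usage() distinguishes. sizeof(struct bpf_array) and the CPU count are stand-ins; with the particular values picked here (320 bytes, 32 CPUs) the arithmetic happens to reproduce the 524608B and 17301824B figures quoted just below, but the real numbers are dictated by the running kernel.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SZ_BPF_ARRAY	320	/* stand-in for sizeof(struct bpf_array) */
#define NR_CPUS		32	/* stand-in for num_possible_cpus() */
#define PAGE_SIZE	4096ULL

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

/* Shape of array_map_mem_usage(); elem_size is round_up(value_size, 8). */
static uint64_t array_usage(uint32_t value_size, uint64_t entries,
			    bool percpu, bool mmapable)
{
	uint64_t elem_size = round_up_u64(value_size, 8);
	uint64_t usage = SZ_BPF_ARRAY;

	if (percpu) {
		usage += entries * sizeof(void *);		/* per-entry pointers */
		usage += entries * elem_size * NR_CPUS;		/* per-cpu values */
	} else if (mmapable) {
		usage = round_up_u64(usage, PAGE_SIZE);
		usage += round_up_u64(entries * elem_size, PAGE_SIZE);
	} else {
		usage += entries * elem_size;
	}
	return usage;
}

int main(void)
{
	/* value 4B / max_entries 65536, as in maps 11 and 12 below. */
	printf("array:        %llu\n", (unsigned long long)array_usage(4, 65536, false, false));
	printf("percpu_array: %llu\n", (unsigned long long)array_usage(4, 65536, true, false));
	return 0;
}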
The result as follows, - before 11: array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B 12: percpu_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 8912896B 13: perf_event_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B 14: prog_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B 15: cgroup_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B - after 11: array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524608B 12: percpu_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 17301824B 13: perf_event_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524608B 14: prog_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524608B 15: cgroup_array name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524608B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-5-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 484706959556..1588c793a715 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -721,6 +721,28 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ return num_elems; } +static u64 array_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + u32 elem_size = array->elem_size; + u64 entries = map->max_entries; + u64 usage = sizeof(*array); + + if (percpu) { + usage += entries * sizeof(void *); + usage += entries * elem_size * num_possible_cpus(); + } else { + if (map->map_flags & BPF_F_MMAPABLE) { + usage = PAGE_ALIGN(usage); + usage += PAGE_ALIGN(entries * elem_size); + } else { + usage += entries * elem_size; + } + } + return usage; +} + BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, @@ -742,6 +764,7 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -762,6 +785,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -1156,6 +1180,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; @@ -1257,6 +1282,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; @@ -1291,6 +1317,7 @@ const struct bpf_map_ops cgroup_array_map_ops = 
{ .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #endif @@ -1379,5 +1406,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, + .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From cbb9b6068c68332eefb0ef4c0ebf6f96c35d9bde Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:02 +0000 Subject: bpf: stackmap memory usage A new helper is introduced to get stackmap memory usage. Some small memory allocations are ignored as their memory size is quite small compared to the totol usage. The result as follows, - before 16: stack_trace name count_map flags 0x0 key 4B value 8B max_entries 65536 memlock 1048576B - after 16: stack_trace name count_map flags 0x0 key 4B value 8B max_entries 65536 memlock 2097472B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-6-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/stackmap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aecea7451b61..0f1d8dced933 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -654,6 +654,19 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } +static u64 stack_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + u64 value_size = map->value_size; + u64 n_buckets = smap->n_buckets; + u64 enties = map->max_entries; + u64 usage = sizeof(*smap); + + usage += n_buckets * sizeof(struct stack_map_bucket *); + usage += enties * (sizeof(struct stack_map_bucket) + value_size); + return usage; +} + BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -664,5 +677,6 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_mem_usage = stack_map_mem_usage, .map_btf_id = &stack_trace_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 2e89caf055a64cb531cbd8ab7a01c3e9b0b3133d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:03 +0000 Subject: bpf: reuseport_array memory usage A new helper is introduced to calculate reuseport_array memory usage. 
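As an illustration (not part of the patch), a small userspace sketch of the struct_size() style accounting used here: a fixed header followed by max_entries trailing socket pointers. The 256-byte header is an assumption chosen so that the sketch matches the 524544B figure shown below; the real struct embeds struct bpf_map instead.

#include <stdio.h>
#include <stdint.h>

/* Stand-in with the same layout idea: a header plus a flexible array of pointers. */
struct fake_reuseport_array {
	unsigned char hdr[256];	/* assumed size of everything before the ptrs[] array */
	void *ptrs[];
};

/* struct_size(array, ptrs, n) boils down to this on overflow-free input. */
static uint64_t fake_struct_size(uint64_t n)
{
	return sizeof(struct fake_reuseport_array) + n * sizeof(void *);
}

int main(void)
{
	/* max_entries 65536, as in map 14 below. */
	printf("reuseport_sockarray: %llu\n", (unsigned long long)fake_struct_size(65536));
	return 0;
}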
The result as follows, - before 14: reuseport_sockarray name count_map flags 0x0 key 4B value 8B max_entries 65536 memlock 1048576B - after 14: reuseport_sockarray name count_map flags 0x0 key 4B value 8B max_entries 65536 memlock 524544B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-7-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/reuseport_array.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 82c61612f382..71cb72f5b733 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -335,6 +335,13 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } +static u64 reuseport_array_mem_usage(const struct bpf_map *map) +{ + struct reuseport_array *array; + + return struct_size(array, ptrs, map->max_entries); +} + BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -344,5 +351,6 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, + .map_mem_usage = reuseport_array_mem_usage, .map_btf_id = &reuseport_array_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 2f7e4ab2caa9a692ffc843f801cea90632cdc782 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:04 +0000 Subject: bpf: ringbuf memory usage A new helper ringbuf_map_mem_usage() is introduced to calculate ringbuf memory usage. The result as follows, - before 15: ringbuf name count_map flags 0x0 key 0B value 0B max_entries 65536 memlock 0B - after 15: ringbuf name count_map flags 0x0 key 0B value 0B max_entries 65536 memlock 78424B Signed-off-by: Yafang Shao Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230305124615.12358-8-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/ringbuf.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 8732e0aadf36..0d2a45ff83f1 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -19,6 +19,7 @@ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) /* consumer page and producer page */ #define RINGBUF_POS_PAGES 2 +#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES) #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) @@ -96,7 +97,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_ZERO; - int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_meta_pages = RINGBUF_NR_META_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; struct page **pages, *page; @@ -336,6 +337,21 @@ static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, return 0; } +static u64 ringbuf_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_ringbuf *rb; + int nr_data_pages; + int nr_meta_pages; + u64 usage = sizeof(struct bpf_ringbuf_map); + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + usage += (u64)rb->nr_pages << PAGE_SHIFT; + nr_meta_pages = RINGBUF_NR_META_PAGES; + nr_data_pages = map->max_entries >> PAGE_SHIFT; + usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); + return usage; +} + 
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -347,6 +363,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, + .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &ringbuf_map_btf_ids[0], }; @@ -361,6 +378,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, + .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &user_ringbuf_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 71a49abe73cb56b15bf0701b5e98b6103480f762 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:05 +0000 Subject: bpf: bloom_filter memory usage Introduce a new helper to calculate the bloom_filter memory usage. The result as follows, - before 16: bloom_filter flags 0x0 key 0B value 8B max_entries 65536 memlock 524288B - after 16: bloom_filter flags 0x0 key 0B value 8B max_entries 65536 memlock 65856B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-9-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bloom_filter.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 48ee750849f2..6350c5d35a9b 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -193,6 +193,17 @@ static int bloom_map_check_btf(const struct bpf_map *map, return btf_type_is_void(key_type) ? 0 : -EINVAL; } +static u64 bloom_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_bloom_filter *bloom; + u64 bitset_bytes; + + bloom = container_of(map, struct bpf_bloom_filter, map); + bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1); + bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); + return sizeof(*bloom) + bitset_bytes; +} + BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -206,5 +217,6 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, + .map_mem_usage = bloom_map_mem_usage, .map_btf_id = &bpf_bloom_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 835f1fca951303930b2c74c5b0abe3c03a3aeadf Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:06 +0000 Subject: bpf: cpumap memory usage A new helper is introduced to calculate cpumap memory usage. The size of cpu_entries can be dynamically changed when we update or delete a cpumap element, but this patch doesn't include the memory size of cpu_entry yet. We can dynamically calculate the memory usage when we alloc or free a cpu_entry, but it will take extra runtime overhead, so let just put it aside currently. Note that the size of different cpu_entry may be different as well. 
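As an illustration (not part of the patch), a sketch of the trade-off described above for cpumap: only the statically sized entry array is reported, because a fuller accounting would have to track live entries (and their varying sizes) on every update and delete. All sizes below are stand-ins; with them the static part matches the 832B figure shown below.

#include <stdio.h>
#include <stdint.h>

#define SZ_BPF_CPU_MAP		320	/* stand-in for sizeof(struct bpf_cpu_map) */
#define SZ_CPU_MAP_ENTRY	512	/* stand-in; real entries differ in size */

/* What cpu_map_mem_usage() reports: the map header plus the pointer array only. */
static uint64_t cpu_map_static_usage(uint64_t max_entries)
{
	return SZ_BPF_CPU_MAP + max_entries * sizeof(void *);
}

/* What a full accounting would add, at the cost of bookkeeping in the
 * update/delete paths that the patch deliberately avoids.
 */
static uint64_t cpu_map_full_usage(uint64_t max_entries, uint64_t live_entries)
{
	return cpu_map_static_usage(max_entries) + live_entries * SZ_CPU_MAP_ENTRY;
}

int main(void)
{
	/* max_entries 64, as in map 48 below. */
	printf("static only:    %llu\n", (unsigned long long)cpu_map_static_usage(64));
	printf("with 8 entries: %llu\n", (unsigned long long)cpu_map_full_usage(64, 8));
	return 0;
}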
The result as follows, - before 48: cpumap name count_map flags 0x4 key 4B value 4B max_entries 64 memlock 4096B - after 48: cpumap name count_map flags 0x4 key 4B value 4B max_entries 64 memlock 832B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-10-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index d2110c1f6fa6..871809e71b4e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -673,6 +673,15 @@ static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) __cpu_map_lookup_elem); } +static u64 cpu_map_mem_usage(const struct bpf_map *map) +{ + u64 usage = sizeof(struct bpf_cpu_map); + + /* Currently the dynamically allocated elements are not counted */ + usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *); + return usage; +} + BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -683,6 +692,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, + .map_mem_usage = cpu_map_mem_usage, .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; -- cgit v1.2.3-58-ga151 From fa5e83df173beb6c1334737a463fc2b4ef4efa8b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:07 +0000 Subject: bpf: devmap memory usage A new helper is introduced to calculate the memory usage of devmap and devmap_hash. The number of dynamically allocated elements are recored for devmap_hash already, but not for devmap. To track the memory size of dynamically allocated elements, this patch also count the numbers for devmap. The result as follows, - before 40: devmap name count_map flags 0x80 key 4B value 4B max_entries 65536 memlock 524288B 41: devmap_hash name count_map flags 0x80 key 4B value 4B max_entries 65536 memlock 524288B - after 40: devmap name count_map flags 0x80 <<<< no elements key 4B value 4B max_entries 65536 memlock 524608B 41: devmap_hash name count_map flags 0x80 <<<< no elements key 4B value 4B max_entries 65536 memlock 524608B Note that the number of buckets is same with max_entries for devmap_hash in this case. 
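As an illustration (not part of the patch), a userspace sketch of the two pieces the devmap change combines: an items counter bumped on element add and dropped on delete, and a usage formula that folds the live count in. Struct sizes are stand-ins; with no live elements the plain-devmap number matches the 524608B shown above.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SZ_BPF_DTAB	320	/* stand-in for sizeof(struct bpf_dtab) */
#define SZ_DTAB_NETDEV	128	/* stand-in for sizeof(struct bpf_dtab_netdev) */

struct fake_dtab {
	uint64_t max_entries;
	uint64_t n_buckets;	/* only used by the hash flavour */
	bool hash;
	uint64_t items;		/* the kernel uses an atomic counter for this */
};

static void dtab_update(struct fake_dtab *d) { d->items++; }
static void dtab_delete(struct fake_dtab *d) { if (d->items) d->items--; }

/* Shape of dev_map_mem_usage(): static part plus live elements. */
static uint64_t dtab_usage(const struct fake_dtab *d)
{
	uint64_t usage = SZ_BPF_DTAB;

	if (d->hash)
		usage += d->n_buckets * sizeof(void *);		/* hlist heads */
	else
		usage += d->max_entries * sizeof(void *);	/* netdev pointer array */
	return usage + d->items * SZ_DTAB_NETDEV;
}

int main(void)
{
	struct fake_dtab d = { .max_entries = 65536, .hash = false };

	printf("empty devmap:     %llu\n", (unsigned long long)dtab_usage(&d));
	dtab_update(&d);
	printf("after one update: %llu\n", (unsigned long long)dtab_usage(&d));
	dtab_delete(&d);
	printf("after the delete: %llu\n", (unsigned long long)dtab_usage(&d));
	return 0;
}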
Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-11-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2675fefc6cb6..19b036a228f7 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -819,8 +819,10 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); - if (old_dev) + if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); + atomic_dec((atomic_t *)&dtab->items); + } return 0; } @@ -931,6 +933,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + else + atomic_inc((atomic_t *)&dtab->items); return 0; } @@ -1016,6 +1020,20 @@ static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) __dev_map_hash_lookup_elem); } +static u64 dev_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u64 usage = sizeof(struct bpf_dtab); + + if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) + usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); + else + usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); + usage += atomic_read((atomic_t *)&dtab->items) * + (u64)sizeof(struct bpf_dtab_netdev); + return usage; +} + BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1026,6 +1044,7 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; @@ -1039,6 +1058,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; @@ -1109,9 +1129,11 @@ static int dev_map_notification(struct notifier_block *notifier, if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); - if (dev == odev) + if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); + atomic_dec((atomic_t *)&dtab->items); + } } } rcu_read_unlock(); -- cgit v1.2.3-58-ga151 From c6e66b42a348125fd41131de75f84fa67b6579ce Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:08 +0000 Subject: bpf: queue_stack_maps memory usage A new helper is introduced to calculate queue_stack_maps memory usage. 
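As an illustration (not part of the patch), a sketch of the queue/stack formula and of why it uses (max_entries + 1) elements: the map is backed by a circular buffer that keeps one slot unused so that head == tail can only mean "empty", never "full". sizeof(struct bpf_queue_stack) is a stand-in, and the printed value will not necessarily match the bpftool numbers below, which also reflect details not modeled here.

#include <stdio.h>
#include <stdint.h>

#define SZ_BPF_QUEUE_STACK	256	/* stand-in for sizeof(struct bpf_queue_stack) */

/* queue_stack_map_mem_usage() shape: header plus (max_entries + 1) values. */
static uint64_t queue_stack_usage(uint64_t max_entries, uint32_t value_size)
{
	return SZ_BPF_QUEUE_STACK + (max_entries + 1) * value_size;
}

int main(void)
{
	/* value 4B / max_entries 65536, as in maps 20 and 21 below. */
	printf("queue/stack: %llu\n", (unsigned long long)queue_stack_usage(65536, 4));
	return 0;
}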
The result as follows, - before 20: queue name count_map flags 0x0 key 0B value 4B max_entries 65536 memlock 266240B 21: stack name count_map flags 0x0 key 0B value 4B max_entries 65536 memlock 266240B - after 20: queue name count_map flags 0x0 key 0B value 4B max_entries 65536 memlock 524288B 21: stack name count_map flags 0x0 key 0B value 4B max_entries 65536 memlock 524288B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-12-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/queue_stack_maps.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8a5e060de63b..63ecbbcb349d 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -246,6 +246,14 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +static u64 queue_stack_map_mem_usage(const struct bpf_map *map) +{ + u64 usage = sizeof(struct bpf_queue_stack); + + usage += ((u64)map->max_entries + 1) * map->value_size; + return usage; +} + BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -259,6 +267,7 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], }; @@ -274,5 +283,6 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From f062226d8d59b521ddc946ad791048188a16722a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:09 +0000 Subject: bpf: bpf_struct_ops memory usage A new helper is introduced to calculate bpf_struct_ops memory usage. 
The result as follows, - before 1: struct_ops name count_map flags 0x0 key 4B value 256B max_entries 1 memlock 4096B btf_id 73 - after 1: struct_ops name count_map flags 0x0 key 4B value 256B max_entries 1 memlock 5016B btf_id 73 Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-13-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ece9870cab68..38903fb52f98 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -641,6 +641,21 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } +static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + const struct btf_type *vt = st_ops->value_type; + u64 usage; + + usage = sizeof(*st_map) + + vt->size - sizeof(struct bpf_struct_ops_value); + usage += vt->size; + usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); + usage += PAGE_SIZE; + return usage; +} + BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, @@ -651,6 +666,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, + .map_mem_usage = bpf_struct_ops_map_mem_usage, .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 2f536977d6f1481f0a149140c8311103ddebe36a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:10 +0000 Subject: bpf: local_storage memory usage A new helper is introduced to calculate local_storage map memory usage. Currently the dynamically allocated elements are not counted, since it will take runtime overhead in the element update or delete path. So let's put it aside currently, and implement it in the future if the user really needs it. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-14-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e90d9f63edc5..a993560f200a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -446,6 +446,12 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static u64 cgroup_storage_map_usage(const struct bpf_map *map) +{ + /* Currently the dynamically allocated elements are not counted. 
*/ + return sizeof(struct bpf_cgroup_storage_map); +} + BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { @@ -457,6 +463,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, + .map_mem_usage = cgroup_storage_map_usage, .map_btf_id = &cgroup_storage_map_btf_ids[0], }; -- cgit v1.2.3-58-ga151 From 7490b7f1c02ef825ef98f7230662049d4a464a21 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:11 +0000 Subject: bpf, net: bpf_local_storage memory usage A new helper is introduced into bpf_local_storage map to calculate the memory usage. This helper is also used by other maps like bpf_cgrp_storage, bpf_inode_storage, bpf_task_storage and etc. Note that currently the dynamically allocated storage elements are not counted in the usage, since it will take extra runtime overhead in the elements update or delete path. So let's put it aside now, and implement it in the future when someone really need it. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-15-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_cgrp_storage.c | 1 + kernel/bpf/bpf_inode_storage.c | 1 + kernel/bpf/bpf_local_storage.c | 10 ++++++++++ kernel/bpf/bpf_task_storage.c | 1 + net/core/bpf_sk_storage.c | 1 + 6 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..d934248b8e81 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -164,5 +164,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); void bpf_local_storage_free_rcu(struct rcu_head *rcu); +u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 6cdf6d9ed91d..9ae07aedaf23 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -221,6 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = { .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, + .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 05f4c66c9089..43e2619c8167 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -223,6 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, + .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 3d320393a12c..d3ba3f2db640 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -685,6 +685,16 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) return free_storage; } +u64 bpf_local_storage_map_mem_usage(const struct 
bpf_map *map) +{ + struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; + u64 usage = sizeof(*smap); + + /* The dynamically callocated selems are not counted currently. */ + usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); + return usage; +} + struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 1e486055a523..20f942229f3c 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -335,6 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, + .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index bb378c33f542..7a36353dbc22 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -324,6 +324,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, + .map_mem_usage = bpf_local_storage_map_mem_usage, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { -- cgit v1.2.3-58-ga151 From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:14 +0000 Subject: bpf: offload map memory usage A new helper is introduced to calculate offload map memory usage. But currently the memory dynamically allocated in netdev dev_ops, like nsim_map_update_elem, is not counted. Let's just put it aside now. 
Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/offload.c | 6 ++++++ kernel/bpf/syscall.c | 1 + 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9059520bbb5e..6792a7940e1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2624,6 +2624,7 @@ static inline bool bpf_map_is_offloaded(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -2695,6 +2696,11 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 0c85e06f7ea7..d9c9f45e3529 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map) bpf_map_area_free(offmap); } +u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) +{ + /* The memory dynamically allocated in netdev dev_ops is not counted */ + return sizeof(struct bpf_offloaded_map); +} + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_offloaded_map *offmap = map_to_offmap(map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7c96e68859ec..053409d951d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -105,6 +105,7 @@ const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, + .map_mem_usage = bpf_map_offload_map_mem_usage, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) -- cgit v1.2.3-58-ga151 From 6b4a6ea2c62d34272d64161d43a19c02355576e2 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:15 +0000 Subject: bpf: enforce all maps having memory usage callback We have implemented memory usage callback for all maps, and we enforce any newly added map having a callback as well. We check this callback at map creation time. If it doesn't have the callback, we will return EINVAL. 
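To make the new requirement concrete, here is a rough sketch of what a hypothetical future map type now has to provide. struct bpf_foo_map, foo_map_ops and the formula are made up for illustration; real callbacks follow the patterns added earlier in this series.

	/* sketch only: bpf_foo_map and foo_map_ops are hypothetical */
	struct bpf_foo_map {
		struct bpf_map map;
		atomic_t nr_elems;	/* optionally track dynamically allocated elements */
	};

	static u64 foo_map_mem_usage(const struct bpf_map *map)
	{
		struct bpf_foo_map *foo = container_of(map, struct bpf_foo_map, map);
		u64 usage = sizeof(*foo);

		/* fixed, preallocated part ... */
		usage += (u64)map->max_entries * map->value_size;
		/* ... plus dynamically allocated elements, if the map tracks them */
		usage += (u64)atomic_read(&foo->nr_elems) * map->value_size;
		return usage;
	}

	const struct bpf_map_ops foo_map_ops = {
		/* other callbacks omitted */
		.map_mem_usage	= foo_map_mem_usage,	/* now mandatory */
	};

Without the .map_mem_usage member, find_and_alloc_map() now fails with -EINVAL instead of falling back to the old key/value-size based estimate.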
Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-19-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 053409d951d2..f406dfa13792 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -129,6 +129,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) } if (attr->map_ifindex) ops = &bpf_map_offload_ops; + if (!ops->map_mem_usage) + return ERR_PTR(-EINVAL); map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -775,13 +777,7 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { - unsigned long size; - - if (map->ops->map_mem_usage) - return map->ops->map_mem_usage(map); - - size = round_up(map->key_size + bpf_map_value_size(map), 8); - return round_up(map->max_entries * size, PAGE_SIZE); + return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) -- cgit v1.2.3-58-ga151 From 07236eab7a3139da97aef9f5f21f403be82a82ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:14 -0800 Subject: bpf: factor out fetching basic kfunc metadata Factor out logic to fetch basic kfunc metadata based on struct bpf_insn. This is not exactly short or trivial code to just copy/paste and this information is sometimes necessary in other parts of the verifier logic. Subsequent patches will rely on this to determine if an instruction is a kfunc call to iterator next method. No functional changes intended, including that verbose() warning behavior when kfunc is not allowed for a particular program type. 
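For a sense of where this pays off, the open-coded iterator patch later in this series calls the new helper from visit_insn() to recognize iterator next calls without duplicating the BTF lookups; lightly reformatted from that patch:

	if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
		struct bpf_kfunc_call_arg_meta meta;

		/* errors are ignored here; check_kfunc_call() reports them later */
		ret = fetch_kfunc_meta(env, insn, &meta, NULL);
		if (ret == 0 && is_iter_next_kfunc(&meta))
			mark_prune_point(env, t);
	}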
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b2116ca78d9a..8d40fba6a1c0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10079,24 +10079,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx_p) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_kfunc_call_arg_meta *meta, + const char **kfunc_name) { - const struct btf_type *t, *func, *func_proto, *ptr_type; - u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; - struct bpf_reg_state *regs = cur_regs(env); - const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; - struct bpf_kfunc_call_arg_meta meta; - int err, insn_idx = *insn_idx_p; - const struct btf_param *args; - const struct btf_type *ret_t; + const struct btf_type *func, *func_proto; + u32 func_id, *kfunc_flags; + const char *func_name; struct btf *desc_btf; - u32 *kfunc_flags; - /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (kfunc_name) + *kfunc_name = NULL; + if (!insn->imm) - return 0; + return -EINVAL; desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) @@ -10105,22 +10102,51 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_id = insn->imm; func = btf_type_by_id(desc_btf, func_id); func_name = btf_name_by_offset(desc_btf, func->name_off); + if (kfunc_name) + *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); if (!kfunc_flags) { - verbose(env, "calling kernel function %s is not allowed\n", - func_name); return -EACCES; } - /* Prepare kfunc call metadata */ - memset(&meta, 0, sizeof(meta)); - meta.btf = desc_btf; - meta.func_id = func_id; - meta.kfunc_flags = *kfunc_flags; - meta.func_proto = func_proto; - meta.func_name = func_name; + memset(meta, 0, sizeof(*meta)); + meta->btf = desc_btf; + meta->func_id = func_id; + meta->kfunc_flags = *kfunc_flags; + meta->func_proto = func_proto; + meta->func_name = func_name; + + return 0; +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) +{ + const struct btf_type *t, *ptr_type; + u32 i, nargs, ptr_type_id, release_ref_obj_id; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; + struct bpf_kfunc_call_arg_meta meta; + struct bpf_insn_aux_data *insn_aux; + int err, insn_idx = *insn_idx_p; + const struct btf_param *args; + const struct btf_type *ret_t; + struct btf *desc_btf; + + /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (!insn->imm) + return 0; + + err = fetch_kfunc_meta(env, insn, &meta, &func_name); + if (err == -EACCES && func_name) + verbose(env, "calling kernel function %s is not allowed\n", func_name); + if (err) + return err; + desc_btf = meta.btf; + insn_aux = &env->insn_aux_data[insn_idx]; if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); @@ 
-10173,7 +10199,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10185,14 +10211,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, func_id); + func_name, meta.func_id); return err; } err = release_reference(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10202,7 +10228,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, set_rbtree_add_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10211,7 +10237,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ - t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { /* Only exception is bpf_obj_new_impl */ @@ -10260,11 +10286,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; - env->insn_aux_data[insn_idx].kptr_struct_meta = + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - env->insn_aux_data[insn_idx].kptr_struct_meta = + insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_obj_drop.btf, meta.arg_obj_drop.btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || @@ -10397,8 +10423,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ - nargs = btf_type_vlen(func_proto); - args = (const struct btf_param *)(func_proto + 1); + nargs = btf_type_vlen(meta.func_proto); + args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { u32 regno = i + 1; -- cgit v1.2.3-58-ga151 From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:15 -0800 Subject: bpf: add iterator kfuncs registration and validation logic Add ability to register kfuncs that implement BPF open-coded iterator contract and enforce naming and function proto convention. Enforcement happens at the time of kfunc registration and significantly simplifies the rest of iterators logic in the verifier. More details follow in subsequent patches, but we enforce the following conditions. All kfuncs (constructor, next, destructor) have to be named consistenly as bpf_iter__{new,next,destroy}(), respectively. represents iterator type, and iterator state should be represented as a matching `struct bpf_iter_` state type. 
Also, all iter kfuncs should have a pointer to this `struct bpf_iter_` as the very first argument. Additionally: - Constructor, i.e., bpf_iter__new(), can have arbitrary extra number of arguments. Return type is not enforced either. - Next method, i.e., bpf_iter__next(), has to return a pointer type and should have exactly one argument: `struct bpf_iter_ *` (const/volatile/restrict and typedefs are ignored). - Destructor, i.e., bpf_iter__destroy(), should return void and should have exactly one argument, similar to the next method. - struct bpf_iter_ size is enforced to be positive and a multiple of 8 bytes (to fit stack slots correctly). Such strictness and consistency allows to build generic helpers abstracting important, but boilerplate, details to be able to use open-coded iterators effectively and ergonomically (see bpf_for_each() in subsequent patches). It also simplifies the verifier logic in some places. At the same time, this doesn't hurt generality of possible iterator implementations. Win-win. Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course), while destructor kfunc is marked as KF_ITER_DESTROY. Additionally, we add a trivial kfunc name validation: it should be a valid non-NULL and non-empty string. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 + include/linux/btf.h | 4 ++ kernel/bpf/btf.c | 112 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 18538bad2b8c..e2dc7f064449 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,8 @@ struct bpf_active_lock { u32 id; }; +#define ITER_PREFIX "bpf_iter_" + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; diff --git a/include/linux/btf.h b/include/linux/btf.h index 556b3e2e7471..1bba0827e8c4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -71,6 +71,10 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ +/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */ +#define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ +#define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ +#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ /* * Tag marking a kernel function as a kfunc. 
This is meant to minimize the diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a8cb09e5973b..71758cd15b07 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE +static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, + const struct btf_type *func, u32 func_flags) +{ + u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); + const char *name, *sfx, *iter_name; + const struct btf_param *arg; + const struct btf_type *t; + char exp_name[128]; + u32 nr_args; + + /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ + if (!flags || (flags & (flags - 1))) + return -EINVAL; + + /* any BPF iter kfunc should have `struct bpf_iter_ *` first arg */ + nr_args = btf_type_vlen(func); + if (nr_args < 1) + return -EINVAL; + + arg = &btf_params(func)[0]; + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!t || !__btf_type_is_struct(t)) + return -EINVAL; + + name = btf_name_by_offset(btf, t->name_off); + if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) + return -EINVAL; + + /* sizeof(struct bpf_iter_) should be a multiple of 8 to + * fit nicely in stack slots + */ + if (t->size == 0 || (t->size % 8)) + return -EINVAL; + + /* validate bpf_iter__{new,next,destroy}(struct bpf_iter_ *) + * naming pattern + */ + iter_name = name + sizeof(ITER_PREFIX) - 1; + if (flags & KF_ITER_NEW) + sfx = "new"; + else if (flags & KF_ITER_NEXT) + sfx = "next"; + else /* (flags & KF_ITER_DESTROY) */ + sfx = "destroy"; + + snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); + if (strcmp(func_name, exp_name)) + return -EINVAL; + + /* only iter constructor should have extra arguments */ + if (!(flags & KF_ITER_NEW) && nr_args != 1) + return -EINVAL; + + if (flags & KF_ITER_NEXT) { + /* bpf_iter__next() should return pointer */ + t = btf_type_skip_modifiers(btf, func->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + + if (flags & KF_ITER_DESTROY) { + /* bpf_iter__destroy() should return void */ + t = btf_type_by_id(btf, func->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; + } + + return 0; +} + +static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) +{ + const struct btf_type *func; + const char *func_name; + int err; + + /* any kfunc should be FUNC -> FUNC_PROTO */ + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) + return -EINVAL; + + /* sanity check kfunc name */ + func_name = btf_name_by_offset(btf, func->name_off); + if (!func_name || !func_name[0]) + return -EINVAL; + + func = btf_type_by_id(btf, func->type); + if (!func || !btf_type_is_func_proto(func)) + return -EINVAL; + + if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { + err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); + if (err) + return err; + } + + return 0; +} + /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, @@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf *btf; - int ret; + int ret, i; btf = btf_get_module_btf(kset->owner); if (!btf) { @@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, if (IS_ERR(btf)) return PTR_ERR(btf); + for 
(i = 0; i < kset->set->cnt; i++) { + ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id, + kset->set->pairs[i].flags); + if (ret) + goto err_out; + } + ret = btf_populate_kfunc_set(btf, hook, kset->set); +err_out: btf_put(btf); return ret; } -- cgit v1.2.3-58-ga151 From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:16 -0800 Subject: bpf: add support for open-coded iterator loops Teach verifier about the concept of the open-coded (or inline) iterators. This patch adds generic iterator loop verification logic, new STACK_ITER stack slot type to contain iterator state, and necessary kfunc plumbing for iterator's constructor, destructor and next methods. Next patch implements first specific iterator (numbers iterator for implementing for() loop logic). Such split allows to have more focused commits for verifier logic and separate commit that we could point later to demonstrating what does it take to add a new kind of iterator. Each kind of iterator has its own associated struct bpf_iter_, where denotes a specific type of iterator. struct bpf_iter_ state is supposed to live on BPF program stack, so there will be no way to change its size later on without breaking backwards compatibility, so choose wisely! But given this struct is specific to a given of iterator, this allows a lot of flexibility: simple iterators could be fine with just one stack slot (8 bytes), like numbers iterator in the next patch, while some other more complicated iterators might need way more to keep their iterator state. Either way, such design allows to avoid runtime memory allocations, which otherwise would be necessary if we fixed on-the-stack size and it turned out to be too small for a given iterator implementation. The way BPF verifier logic is implemented, there are no artificial restrictions on a number of active iterators, it should work correctly using multiple active iterators at the same time. This also means you can have multiple nested iteration loops. struct bpf_iter_ reference can be safely passed to subprograms as well. General flow is easiest to demonstrate with a simple example using number iterator implemented in next patch. Here's the simplest possible loop: struct bpf_iter_num it; int *v; bpf_iter_num_new(&it, 2, 5); while ((v = bpf_iter_num_next(&it))) { bpf_printk("X = %d", *v); } bpf_iter_num_destroy(&it); Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is exclusive and is not returned. This matches similar APIs (e.g., slices in Go or Rust) that implement a range of elements, where end index is non-inclusive. In the above example, we see a trio of function: - constructor, bpf_iter_num_new(), which initializes iterator state (struct bpf_iter_num it) on the stack. If any of the input arguments are invalid, constructor should make sure to still initialize it such that subsequent bpf_iter_num_next() calls will return NULL. I.e., on error, return error and construct empty iterator. - next method, bpf_iter_num_next(), which accepts pointer to iterator state and produces an element. Next method should always return a pointer. The contract between BPF verifier is that next method will always eventually return NULL when elements are exhausted. Once NULL is returned, subsequent next calls should keep returning NULL. In the case of numbers iterator, bpf_iter_num_next() returns a pointer to an int (storage for this integer is inside the iterator state itself), which can be dereferenced after corresponding NULL check. 
- once done with the iterator, it's mandated that user cleans up its state with the call to destructor, bpf_iter_num_destroy() in this case. Destructor frees up any resources and marks stack space used by struct bpf_iter_num as usable for something else. Any other iterator implementation will have to implement at least these three methods. It is enforced that for any given type of iterator only applicable constructor/destructor/next are callable. I.e., verifier ensures you can't pass number iterator state into, say, cgroup iterator's next method. It is important to keep the naming pattern consistent to be able to create generic macros to help with BPF iter usability. E.g., one of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h in selftests, which allows to utilize iterator "trio" nicely without having to code the above somewhat tedious loop explicitly every time. This is enforced at kfunc registration point by one of the previous patches in this series. At the implementation level, iterator state tracking for verification purposes is very similar to dynptr. We add STACK_ITER stack slot type, reserve necessary number of slots, depending on sizeof(struct bpf_iter_), and keep track of necessary extra state in the "main" slot, which is marked with non-zero ref_obj_id. Other slots are also marked as STACK_ITER, but have zero ref_obj_id. This is simpler than having a separate "is_first_slot" flag. Another big distinction is that STACK_ITER is *always refcounted*, which simplifies implementation without sacrificing usability. So no need for extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new constructors, etc. Keeping it simple here. As far as the verification logic goes, there are two extensive comments: in process_iter_next_call() and iter_active_depths_differ() explaining some important and sometimes subtle aspects. Please refer to them for details. But from 10,000-foot point of view, next methods are the points of forking a verification state, which are conceptually similar to what verifier is doing when validating conditional jump. We branch out at a `call bpf_iter__next` instruction and simulate two outcomes: NULL (iteration is done) and non-NULL (new element is returned). NULL is simulated first and is supposed to reach exit without looping. After that non-NULL case is validated and it either reaches exit (for trivial examples with no real loop), or reaches another `call bpf_iter__next` instruction with the state equivalent to already (partially) validated one. State equivalency at that point means we technically are going to be looping forever without "breaking out" out of established "state envelope" (i.e., subsequent iterations don't add any new knowledge or constraints to the verifier state, so running 1, 2, 10, or a million of them doesn't matter). But taking into account the contract stating that iterator next method *has to* return NULL eventually, we can conclude that loop body is safe and will eventually terminate. Given we validated logic outside of the loop (NULL case), and concluded that loop body is safe (though potentially looping many times), verifier can claim safety of the overall program logic. The rest of the patch is necessary plumbing for state tracking, marking, validation, and necessary further kfunc plumbing to allow implementing iterator constructor, destructor, and next methods. 
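Putting the pieces together, the loop from the example above looks roughly like this as a complete BPF-side program. The __ksym extern declarations and the raw tracepoint attach point are illustrative assumptions (this is how selftests typically consume kfuncs), not something this patch itself adds.

	/* sketch of a full BPF program using the numbers iterator trio */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	/* kfuncs live in the kernel; declare them as ksyms */
	extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
	extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
	extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;

	SEC("raw_tp/sys_enter")
	int iter_demo(void *ctx)
	{
		struct bpf_iter_num it;
		int *v;

		bpf_iter_num_new(&it, 2, 5);		/* [2, 5): yields 2, 3, 4 */
		while ((v = bpf_iter_num_next(&it)))
			bpf_printk("X = %d", *v);
		bpf_iter_num_destroy(&it);		/* mandatory before exiting */
		return 0;
	}

	char _license[] SEC("license") = "GPL";

The verifier rejects the program if any path can exit with the iterator still constructed, or if next()/destroy() are called on uninitialized iterator state.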
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 23 ++ kernel/bpf/verifier.c | 595 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 610 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e2dc7f064449..0c052bc79940 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -61,6 +61,12 @@ struct bpf_active_lock { #define ITER_PREFIX "bpf_iter_" +enum bpf_iter_state { + BPF_ITER_STATE_INVALID, /* for non-first slot */ + BPF_ITER_STATE_ACTIVE, + BPF_ITER_STATE_DRAINED, +}; + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; @@ -105,6 +111,18 @@ struct bpf_reg_state { bool first_slot; } dynptr; + /* For bpf_iter stack slots */ + struct { + /* BTF container and BTF type ID describing + * struct bpf_iter_ of an iterator state + */ + struct btf *btf; + u32 btf_id; + /* packing following two fields to fit iter state into 16 bytes */ + enum bpf_iter_state state:2; + int depth:30; + } iter; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -143,6 +161,8 @@ struct bpf_reg_state { * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. + * Similarly to dynptrs, we use ID to track "belonging" of a reference + * to a specific instance of bpf_iter. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned @@ -213,9 +233,11 @@ enum bpf_stack_slot_type { * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, + STACK_ITER, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) @@ -450,6 +472,7 @@ struct bpf_insn_aux_data { bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool is_iter_next; /* bpf_iter__next() kfunc call */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d40fba6a1c0..45a082284464 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -302,6 +302,10 @@ struct bpf_kfunc_call_arg_meta { enum bpf_dynptr_type type; u32 id; } initialized_dynptr; + struct { + u8 spi; + u8 frameno; + } iter; u64 mem_size; }; @@ -668,6 +672,7 @@ static char slot_type_char[] = { [STACK_MISC] = 'm', [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', }; static void print_liveness(struct bpf_verifier_env *env, @@ -742,6 +747,11 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); } +static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) +{ + return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); +} + static const char *kernel_type_name(const struct btf* btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -766,6 +776,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type) } } +static const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if 
(!btf || btf_id == 0) + return ""; + + /* we already validated that type is valid and has conforming name */ + return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +static const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return ""; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return ""; + } +} + static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; @@ -1118,6 +1152,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg } } +static void __mark_reg_known_zero(struct bpf_reg_state *reg); + +static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int insn_idx, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j, id; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + __mark_reg_known_zero(st); + st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + st->live |= REG_LIVE_WRITTEN; + st->ref_obj_id = i == 0 ? id : 0; + st->iter.btf = btf; + st->iter.btf_id = btf_id; + st->iter.state = BPF_ITER_STATE_ACTIVE; + st->iter.depth = 0; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_ITER; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static int unmark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + if (i == 0) + WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + + __mark_reg_not_init(env, st); + + /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ + st->live |= REG_LIVE_WRITTEN; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_INVALID; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. 
+ */ + spi = iter_get_spi(env, reg, nr_slots); + if (spi == -ERANGE) + return true; + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] == STACK_ITER) + return false; + } + + return true; +} + +static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + /* only main (first) slot has ref_obj_id set */ + if (i == 0 && !st->ref_obj_id) + return false; + if (i != 0 && st->ref_obj_id) + return false; + if (st->iter.btf != btf || st->iter.btf_id != btf_id) + return false; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] != STACK_ITER) + return false; + } + + return true; +} + +/* Check if given stack slot is "special": + * - spilled register state (STACK_SPILL); + * - dynptr state (STACK_DYNPTR); + * - iter state (STACK_ITER). + */ +static bool is_stack_slot_special(const struct bpf_stack_state *stack) +{ + enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1]; + + switch (type) { + case STACK_SPILL: + case STACK_DYNPTR: + case STACK_ITER: + return true; + case STACK_INVALID: + case STACK_MISC: + case STACK_ZERO: + return false; + default: + WARN_ONCE(1, "unknown stack slot type %d\n", type); + return true; + } +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -1267,6 +1452,19 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->ref_obj_id) verbose(env, "(ref_id=%d)", reg->ref_obj_id); break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + reg = &state->stack[i].spilled_ptr; + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; case STACK_MISC: case STACK_ZERO: default: @@ -2710,6 +2908,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state * state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); } +static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int err, i; + + for (i = 0; i < nr_slots; i++) { + struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; + + err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + if (err) + return err; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3691,8 +3908,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; - /* Mark slots as STACK_MISC if they belonged to spilled ptr. 
*/ - if (is_spilled_reg(&state->stack[spi])) + /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ + if (is_stack_slot_special(&state->stack[spi])) for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); @@ -6506,6 +6723,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return err; } +static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +{ + struct bpf_func_state *state = func(env, reg); + + return state->stack[spi].spilled_ptr.ref_obj_id; +} + +static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); +} + +static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEW; +} + +static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEXT; +} + +static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_DESTROY; +} + +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ + /* btf_check_iter_kfuncs() guarantees that first argument of any iter + * kfunc is iter state pointer + */ + return arg == 0 && is_iter_kfunc(meta); +} + +static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const struct btf_type *t; + const struct btf_param *arg; + int spi, err, i, nr_slots; + u32 btf_id; + + /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ + arg = &btf_params(meta->func_proto)[0]; + t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */ + t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ + nr_slots = t->size / BPF_REG_SIZE; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0 && spi != -ERANGE) + return spi; + + meta->iter.spi = spi; + meta->iter.frameno = reg->frameno; + + if (is_iter_new_kfunc(meta)) { + /* bpf_iter__new() expects pointer to uninit iter state */ + if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { + verbose(env, "expected uninitialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { + err = check_mem_access(env, insn_idx, regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + if (err) + return err; + } else { + /* iter_next() or iter_destroy() expect initialized iter state*/ + if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + verbose(env, "expected an initialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + err = mark_iter_read(env, reg, spi, nr_slots); + if (err) + return err; + + meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + + if (is_iter_destroy_kfunc(meta)) { + err = unmark_stack_slots_iter(env, reg, nr_slots); + if (err) + return err; + } + } + + return 0; +} + +/* process_iter_next_call() is called when verifier gets to iterator's next + * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer + * to it as just "iter_next()" in comments below. 
+ * + * BPF verifier relies on a crucial contract for any iter_next() + * implementation: it should *eventually* return NULL, and once that happens + * it should keep returning NULL. That is, once iterator exhausts elements to + * iterate, it should never reset or spuriously return new elements. + * + * With the assumption of such contract, process_iter_next_call() simulates + * a fork in the verifier state to validate loop logic correctness and safety + * without having to simulate infinite amount of iterations. + * + * In current state, we first assume that iter_next() returned NULL and + * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such + * conditions we should not form an infinite loop and should eventually reach + * exit. + * + * Besides that, we also fork current state and enqueue it for later + * verification. In a forked state we keep iterator state as ACTIVE + * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We + * also bump iteration depth to prevent erroneous infinite loop detection + * later on (see iter_active_depths_differ() comment for details). In this + * state we assume that we'll eventually loop back to another iter_next() + * calls (it could be in exactly same location or in some other instruction, + * it doesn't matter, we don't make any unnecessary assumptions about this, + * everything revolves around iterator state in a stack slot, not which + * instruction is calling iter_next()). When that happens, we either will come + * to iter_next() with equivalent state and can conclude that next iteration + * will proceed in exactly the same way as we just verified, so it's safe to + * assume that loop converges. If not, we'll go on another iteration + * simulation with a different input state, until all possible starting states + * are validated or we reach maximum number of instructions limit. + * + * This way, we will either exhaustively discover all possible input states + * that iterator loop can start with and eventually will converge, or we'll + * effectively regress into bounded loop simulation logic and either reach + * maximum number of instructions if loop is not provably convergent, or there + * is some statically known limit on number of iterations (e.g., if there is + * an explicit `if n > 100 then break;` statement somewhere in the loop). + * + * One very subtle but very important aspect is that we *always* simulate NULL + * condition first (as the current state) before we simulate non-NULL case. + * This has to do with intricacies of scalar precision tracking. By simulating + * "exit condition" of iter_next() returning NULL first, we make sure all the + * relevant precision marks *that will be set **after** we exit iterator loop* + * are propagated backwards to common parent state of NULL and non-NULL + * branches. Thanks to that, state equivalence checks done later in forked + * state, when reaching iter_next() for ACTIVE iterator, can assume that + * precision marks are finalized and won't change. Because simulating another + * ACTIVE iterator iteration won't change them (because given same input + * states we'll end up with exactly same output states which we are currently + * comparing; and verification after the loop already propagated back what + * needs to be **additionally** tracked as precise). It's subtle, grok + * precision tracking for more intuitive understanding. 
+ */ +static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; + struct bpf_reg_state *cur_iter, *queued_iter; + int iter_frameno = meta->iter.frameno; + int iter_spi = meta->iter.spi; + + BTF_TYPE_EMIT(struct bpf_iter); + + cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + + if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && + cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { + verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + return -EFAULT; + } + + if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* branch out active iter state */ + queued_st = push_stack(env, insn_idx + 1, insn_idx, false); + if (!queued_st) + return -ENOMEM; + + queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; + queued_iter->iter.depth++; + + queued_fr = queued_st->frame[queued_st->curframe]; + mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); + } + + /* switch to DRAINED state, but keep the depth unchanged */ + /* mark current iter state as drained and assume returned NULL */ + cur_iter->iter.state = BPF_ITER_STATE_DRAINED; + __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -9099,6 +9513,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, KF_ARG_PTR_TO_LIST_NODE, KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ @@ -9220,6 +9635,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; + if (is_kfunc_arg_iter(meta, argno)) + return KF_ARG_PTR_TO_ITER; + if (is_kfunc_arg_list_head(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_HEAD; @@ -9848,6 +10266,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_KPTR: case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: case KF_ARG_PTR_TO_RB_ROOT: @@ -9944,6 +10363,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } + case KF_ARG_PTR_TO_ITER: + ret = process_iter_arg(env, regno, insn_idx, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { @@ -10148,6 +10572,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc_btf = meta.btf; insn_aux = &env->insn_aux_data[insn_idx]; + insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; @@ -10436,6 +10862,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_btf_func_reg_size(env, regno, t->size); } + if (is_iter_next_kfunc(&meta)) { + err = process_iter_next_call(env, insn_idx, &meta); + if (err) + return err; + } + return 0; } @@ -13548,6 +13980,13 @@ static int visit_insn(int t, 
struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + struct bpf_kfunc_call_arg_meta meta; + + ret = fetch_kfunc_meta(env, insn, &meta, NULL); + if (ret == 0 && is_iter_next_kfunc(&meta)) + mark_prune_point(env, t); + } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -14301,6 +14740,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * didn't use them */ for (i = 0; i < old->allocated_stack; i++) { + struct bpf_reg_state *old_reg, *cur_reg; + spi = i / BPF_REG_SIZE; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { @@ -14357,9 +14798,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; break; case STACK_DYNPTR: - { - const struct bpf_reg_state *old_reg, *cur_reg; - old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || @@ -14367,7 +14805,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; - } + case STACK_ITER: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + /* iter.depth is not compared between states as it + * doesn't matter for correctness and would otherwise + * prevent convergence; we maintain it only to prevent + * infinite loop check triggering, see + * iter_active_depths_differ() + */ + if (old_reg->iter.btf != cur_reg->iter.btf || + old_reg->iter.btf_id != cur_reg->iter.btf_id || + old_reg->iter.state != cur_reg->iter.state || + /* ignore {old_reg,cur_reg}->iter.depth, see above */ + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; case STACK_MISC: case STACK_ZERO: case STACK_INVALID: @@ -14626,6 +15079,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, return true; } +static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].is_iter_next; +} + +/* is_state_visited() handles iter_next() (see process_iter_next_call() for + * terminology) calls specially: as opposed to bounded BPF loops, it *expects* + * states to match, which otherwise would look like an infinite loop. So while + * iter_next() calls are taken care of, we still need to be careful and + * prevent erroneous and too eager declaration of "ininite loop", when + * iterators are involved. + * + * Here's a situation in pseudo-BPF assembly form: + * + * 0: again: ; set up iter_next() call args + * 1: r1 = &it ; + * 2: call bpf_iter_num_next ; this is iter_next() call + * 3: if r0 == 0 goto done + * 4: ... something useful here ... + * 5: goto again ; another iteration + * 6: done: + * 7: r1 = &it + * 8: call bpf_iter_num_destroy ; clean up iter state + * 9: exit + * + * This is a typical loop. Let's assume that we have a prune point at 1:, + * before we get to `call bpf_iter_num_next` (e.g., because of that `goto + * again`, assuming other heuristics don't get in a way). + * + * When we first time come to 1:, let's say we have some state X. We proceed + * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. + * Now we come back to validate that forked ACTIVE state. We proceed through + * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we + * are converging. 
But the problem is that we don't know that yet, as this + * convergence has to happen at iter_next() call site only. So if nothing is + * done, at 1: verifier will use bounded loop logic and declare infinite + * looping (and would be *technically* correct, if not for iterator's + * "eventual sticky NULL" contract, see process_iter_next_call()). But we + * don't want that. So what we do in process_iter_next_call() when we go on + * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's + * a different iteration. So when we suspect an infinite loop, we additionally + * check if any of the *ACTIVE* iterator states depths differ. If yes, we + * pretend we are not looping and wait for next iter_next() call. + * + * This only applies to ACTIVE state. In DRAINED state we don't expect to + * loop, because that would actually mean infinite loop, as DRAINED state is + * "sticky", and so we'll keep returning into the same instruction with the + * same state (at least in one of possible code paths). + * + * This approach allows to keep infinite loop heuristic even in the face of + * active iterator. E.g., C snippet below is and will be detected as + * infinitely looping: + * + * struct bpf_iter_num it; + * int *p, x; + * + * bpf_iter_num_new(&it, 0, 10); + * while ((p = bpf_iter_num_next(&it))) { + * x = *p; + * while (x--) {} // <<-- infinite loop here + * } + * + */ +static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) +{ + struct bpf_reg_state *slot, *cur_slot; + struct bpf_func_state *state; + int i, fr; + + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_ITER) + continue; + + slot = &state->stack[i].spilled_ptr; + if (slot->iter.state != BPF_ITER_STATE_ACTIVE) + continue; + + cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; + if (cur_slot->iter.depth != slot->iter.depth) + return true; + } + } + return false; +} static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { @@ -14673,8 +15212,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * Since the verifier still needs to catch infinite loops * inside async callbacks. */ - } else if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur)) { + goto skip_inf_loop_check; + } + /* BPF open-coded iterators loop detection is special. + * states_maybe_looping() logic is too simplistic in detecting + * states that *might* be equivalent, because it doesn't know + * about ID remapping, so don't even perform it. + * See process_iter_next_call() and iter_active_depths_differ() + * for overview of the logic. When current and one of parent + * states are detected as equivalent, it's a good thing: we prove + * convergence and can stop simulating further iterations. + * It's safe to assume that iterator loop will finish, taking into + * account iter_next() contract of eventually returning + * sticky NULL result.
+ */ + if (is_iter_next_insn(env, insn_idx)) { + if (states_equal(env, &sl->state, cur)) { + struct bpf_func_state *cur_frame; + struct bpf_reg_state *iter_state, *iter_reg; + int spi; + + cur_frame = cur->frame[cur->curframe]; + /* btf_check_iter_kfuncs() enforces that + * iter state pointer is always the first arg + */ + iter_reg = &cur_frame->regs[BPF_REG_1]; + /* current state is valid due to states_equal(), + * so we can assume valid iter and reg state, + * no need for extra (re-)validations + */ + spi = __get_spi(iter_reg->off + iter_reg->var_off.value); + iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + goto hit; + } + goto skip_inf_loop_check; + } + /* attempt to detect infinite loop to avoid unnecessary doomed work */ + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur) && + !iter_active_depths_differ(&sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); return -EINVAL; @@ -14691,6 +15268,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ +skip_inf_loop_check: if (!env->test_state_freq && env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) @@ -14698,6 +15276,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto miss; } if (states_equal(env, &sl->state, cur)) { +hit: sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. -- cgit v1.2.3-58-ga151 From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:17 -0800 Subject: bpf: implement numbers iterator Implement the first open-coded iterator type over a range of integers. It's public API consists of: - bpf_iter_num_new() constructor, which accepts [start, end) range (that is, start is inclusive, end is exclusive). - bpf_iter_num_next() which will keep returning read-only pointer to int until the range is exhausted, at which point NULL will be returned. If bpf_iter_num_next() is kept calling after this, NULL will be persistently returned. - bpf_iter_num_destroy() destructor, which needs to be called at some point to clean up iterator state. BPF verifier enforces that iterator destructor is called at some point before BPF program exits. Note that `start = end = X` is a valid combination to setup an empty iterator. bpf_iter_num_new() will return 0 (success) for any such combination. If bpf_iter_num_new() detects invalid combination of input arguments, it returns error, resets iterator state to, effectively, empty iterator, so any subsequent call to bpf_iter_num_next() will keep returning NULL. BPF verifier has no knowledge that returned integers are in the [start, end) value range, as both `start` and `end` are not statically known and enforced: they are runtime values. While the implementation is pretty trivial, some care needs to be taken to avoid overflows and underflows. Subsequent selftests will validate correctness of [start, end) semantics, especially around extremes (INT_MIN and INT_MAX). Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can be specified. 
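For readers who want to see the [start, end) contract in action, here is a minimal usage sketch from the BPF program side. It is not part of this patch; the section name, includes, __ksym extern declarations and program body are illustrative assumptions (real programs normally pull these from vmlinux.h and bpf_helpers.h):

  /* sketch: drive the numbers iterator by hand from a BPF program */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;

  char LICENSE[] SEC("license") = "GPL";

  SEC("raw_tp/sys_enter")
  int iter_num_demo(void *ctx)
  {
  	struct bpf_iter_num it;
  	int *v, sum = 0;

  	bpf_iter_num_new(&it, 0, 10);		/* [0, 10): yields 0..9 */
  	while ((v = bpf_iter_num_next(&it)))	/* read-only pointer to int */
  		sum += *v;			/* NULL terminates the loop */
  	bpf_iter_num_destroy(&it);		/* must run before program exit */

  	return sum;
  }

The bpf_for() macro mentioned below wraps exactly this new/next/destroy pattern.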
bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded BPF loops and bpf_loop() helper and is the basis for implementing ergonomic BPF loops with no statically known or verified bounds. Subsequent patches implement bpf_for() macro, demonstrating how this can be wrapped into something that works and feels like a normal for() loop in C language. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++-- include/uapi/linux/bpf.h | 8 +++++ kernel/bpf/bpf_iter.c | 70 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 3 ++ tools/include/uapi/linux/bpf.h | 8 +++++ 5 files changed, 95 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6792a7940e1e..e64ff1e89fb2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1617,8 +1617,12 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 -/* Maximum number of loops for bpf_loop */ -#define BPF_MAX_LOOPS BIT(23) +/* Maximum number of loops for bpf_loop and bpf_iter_num. + * It's enum to expose it (and thus make it discoverable) through BTF. + */ +enum { + BPF_MAX_LOOPS = 8 * 1024 * 1024, +}; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 976b194eb775..4abddb668a10 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7112,4 +7112,12 @@ enum { BPF_F_TIMER_ABS = (1ULL << 0), }; +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5dc307bdeaeb..96856f130cbf 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +struct bpf_iter_num_kern { + int cur; /* current value, inclusive */ + int end; /* final value, exclusive */ +} __aligned(8); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) +{ + struct bpf_iter_num_kern *s = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); + + BTF_TYPE_EMIT(struct btf_iter_num); + + /* start == end is legit, it's an empty range and we'll just get NULL + * on first (and any subsequent) bpf_iter_num_next() call + */ + if (start > end) { + s->cur = s->end = 0; + return -EINVAL; + } + + /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */ + if ((s64)end - (s64)start > BPF_MAX_LOOPS) { + s->cur = s->end = 0; + return -E2BIG; + } + + /* user will call bpf_iter_num_next() first, + * which will set s->cur to exactly start value; + * underflow shouldn't matter + */ + s->cur = start - 1; + s->end = end; + + return 0; +} + +__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it) +{ + struct bpf_iter_num_kern *s = (void *)it; + + /* check failed initialization or if we are done (same behavior); + * need to be careful 
about overflow, so convert to s64 for checks, + * e.g., if s->cur == s->end == INT_MAX, we can't just do + * s->cur + 1 >= s->end + */ + if ((s64)(s->cur + 1) >= s->end) { + s->cur = s->end = 0; + return NULL; + } + + s->cur++; + + return &s->cur; +} + +__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) +{ + struct bpf_iter_num_kern *s = (void *)it; + + s->cur = s->end = 0; +} + +__diag_pop(); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 637ac4e92e75..f9b7eeedce08 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 976b194eb775..4abddb668a10 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7112,4 +7112,12 @@ enum { BPF_F_TIMER_ABS = (1ULL << 0), }; +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3-58-ga151 From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Mar 2023 22:01:49 -0800 Subject: bpf: ensure state checkpointing at iter_next() call sites State equivalence check and checkpointing performed in is_state_visited() employs certain heuristics to try to save memory by avoiding state checkpoints if not enough jumps and instructions happened since last checkpoint. This leads to unpredictability of whether a particular instruction will be checkpointed and how regularly. While normally this is not causing much problems (except inconveniences for predictable verifier tests, which we overcome with BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded iterators. Checking and saving state checkpoints at iter_next() call is crucial for fast convergence of open-coded iterator loop logic, so we need to force it. If we don't do that, is_state_visited() might skip saving a checkpoint, causing unnecessarily long sequence of not checkpointed instructions and jumps, leading to exhaustion of jump history buffer, and potentially other undesired outcomes. It is expected that with correct open-coded iterators convergence will happen quickly, so we don't run a risk of exhausting memory. This patch adds, in addition to prune and jump instruction marks, also a "forced checkpoint" mark, and makes sure that any iter_next() call instruction is marked as such. 
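In rough pseudo-C, the resulting decision boils down to the sketch below (simplified, with the other add_new_state conditions elided; the diff that follows is authoritative and the helper name here is made up):

  /* simplified sketch, not the verifier's literal code */
  static bool want_new_checkpoint(struct bpf_verifier_env *env, int insn_idx)
  {
  	/* forced for iter_next() call sites and for BPF_F_TEST_STATE_FREQ */
  	if (env->test_state_freq || is_force_checkpoint(env, insn_idx))
  		return true;

  	/* otherwise checkpoint only if enough work happened since the last one */
  	return env->jmps_processed - env->prev_jmps_processed >= 20 ||
  	       env->insn_processed - env->prev_insn_processed >= 100;
  }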
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++++- kernel/bpf/verifier.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c052bc79940..81d525d057c7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -477,8 +477,12 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ - bool prune_point; bool jmp_point; + bool prune_point; + /* ensure we check state equivalence and save state checkpoint and + * this instruction, regardless of any heuristics + */ + bool force_checkpoint; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45a082284464..13fd4c893f3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13865,6 +13865,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].prune_point; } +static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].force_checkpoint = true; +} + +static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].force_checkpoint; +} + + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -13984,8 +13995,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env) struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); - if (ret == 0 && is_iter_next_kfunc(&meta)) + if (ret == 0 && is_iter_next_kfunc(&meta)) { mark_prune_point(env, t); + /* Checking and saving state checkpoints at iter_next() call + * is crucial for fast convergence of open-coded iterator loop + * logic, so we need to force it. If we don't do that, + * is_state_visited() might skip saving a checkpoint, causing + * unnecessarily long sequence of not checkpointed + * instructions and jumps, leading to exhaustion of jump + * history buffer, and potentially other undesired outcomes. + * It is expected that with correct open-coded iterators + * convergence will happen quickly, so we don't run a risk of + * exhausting memory. + */ + mark_force_checkpoint(env, t); + } } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -15172,7 +15196,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = env->test_state_freq ? true : false; + bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); + bool add_new_state = force_new_state; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -15269,7 +15294,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * at the end of the loop are likely to be useful in pruning. 
*/ skip_inf_loop_check: - if (!env->test_state_freq && + if (!force_new_state && env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; -- cgit v1.2.3-58-ga151 From 52c2b005a3c18c565fc70cfd0ca49375f301e952 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Mar 2023 14:41:31 -0800 Subject: bpf: take into account liveness when propagating precision When doing state comparison, if old state has register that is not marked as REG_LIVE_READ, then we just skip comparison, regardless what's the state of corresponing register in current state. This is because not REG_LIVE_READ register is irrelevant for further program execution and correctness. All good here. But when we get to precision propagation, after two states were declared equivalent, we don't take into account old register's liveness, and thus attempt to propagate precision for register in current state even if that register in old state was not REG_LIVE_READ anymore. This is bad, because register in current state could be anything at all and this could cause -EFAULT due to internal logic bugs. Fix by taking into account REG_LIVE_READ liveness mark to keep the logic in state comparison in sync with precision propagation. Fixes: a3ce685dd01a ("bpf: fix precision tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230309224131.57449-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 13fd4c893f3b..0aaf7703326b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15058,7 +15058,8 @@ static int propagate_precision(struct bpf_verifier_env *env, state_reg = state->regs; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame %d: propagating r%d\n", i, fr); @@ -15072,7 +15073,8 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame %d: propagating fp%d\n", -- cgit v1.2.3-58-ga151 From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:20 -0800 Subject: bpf: Move a few bpf_local_storage functions to static scope This patch moves the bpf_local_storage_free_rcu() and bpf_selem_unlink_map() to static because they are not used outside of bpf_local_storage.c. 
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 3 --- kernel/bpf/bpf_local_storage.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index d934248b8e81..502ad7093f13 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); - struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); @@ -163,7 +161,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); -void bpf_local_storage_free_rcu(struct rcu_head *rcu); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index d3ba3f2db640..1904a4245ebe 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -95,7 +95,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } -void bpf_local_storage_free_rcu(struct rcu_head *rcu) +static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -251,7 +251,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, hlist_add_head_rcu(&selem->snode, &local_storage->list); } -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; -- cgit v1.2.3-58-ga151 From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:21 -0800 Subject: bpf: Refactor codes into bpf_local_storage_destroy This patch first renames bpf_local_storage_unlink_nolock to bpf_local_storage_destroy(). It better reflects that it is only used when the storage's owner (sk/task/cgrp/inode) is being kfree(). All bpf_local_storage_destroy's caller is taking the spin lock and then free the storage. This patch also moves these two steps into the bpf_local_storage_destroy. This is a preparation work for a later patch that uses bpf_mem_cache_alloc/free in the bpf_local_storage. 
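Seen from a caller, the refactor collapses the owner-side free path roughly as in this sketch (the function name is a made-up stand-in for the real sk/task/cgrp/inode paths changed below):

  static void owner_storage_free_sketch(struct bpf_local_storage *local_storage)
  {
  	/* Before this patch every owner open-coded:
  	 *
  	 *	raw_spin_lock_irqsave(&local_storage->lock, flags);
  	 *	free_storage = bpf_local_storage_unlink_nolock(local_storage);
  	 *	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
  	 *	if (free_storage)
  	 *		kfree_rcu(local_storage, rcu);
  	 *
  	 * Now the locking, the unlink of all selems and the conditional
  	 * free of the storage happen inside the renamed helper:
  	 */
  	bpf_local_storage_destroy(local_storage);
  }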
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 9 +-------- kernel/bpf/bpf_inode_storage.c | 8 +------- kernel/bpf/bpf_local_storage.c | 8 ++++++-- kernel/bpf/bpf_task_storage.c | 9 +-------- net/core/bpf_sk_storage.c | 8 +------- 6 files changed, 11 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 502ad7093f13..5908a954ddc2 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 9ae07aedaf23..492594d69a86 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - bool free_cgroup_storage = false; - unsigned long flags; rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); @@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) } bpf_cgrp_storage_lock(); - raw_spin_lock_irqsave(&local_storage->lock, flags); - free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); rcu_read_unlock(); - - if (free_cgroup_storage) - kfree_rcu(local_storage, rcu); } static struct bpf_local_storage_data * diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 43e2619c8167..2d25bcfa371b 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { struct bpf_local_storage *local_storage; - bool free_inode_storage = false; struct bpf_storage_blob *bsb; bsb = bpf_inode(inode); @@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode) return; } - raw_spin_lock_bh(&local_storage->lock); - free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_bh(&local_storage->lock); + bpf_local_storage_destroy(local_storage); rcu_read_unlock(); - - if (free_inode_storage) - kfree_rcu(local_storage, rcu); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1904a4245ebe..e19f9f50a60d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -652,11 +652,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; bool free_storage = false; struct hlist_node *n; + unsigned long flags; /* Neither the bpf_prog nor the bpf_map's 
syscall * could be modifying the local_storage->list now. @@ -667,6 +668,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -681,8 +683,10 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false, false); } + raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return free_storage; + if (free_storage) + kfree_rcu(local_storage, rcu); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 20f942229f3c..4dcef28744d1 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - bool free_task_storage = false; - unsigned long flags; rcu_read_lock(); @@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task) } bpf_task_storage_lock(); - raw_spin_lock_irqsave(&local_storage->lock, flags); - free_task_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); rcu_read_unlock(); - - if (free_task_storage) - kfree_rcu(local_storage, rcu); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 7a36353dbc22..8f56438c104b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -49,7 +49,6 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; - bool free_sk_storage = false; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -58,13 +57,8 @@ void bpf_sk_storage_free(struct sock *sk) return; } - raw_spin_lock_bh(&sk_storage->lock); - free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage); - raw_spin_unlock_bh(&sk_storage->lock); + bpf_local_storage_destroy(sk_storage); rcu_read_unlock(); - - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); } static void bpf_sk_storage_map_free(struct bpf_map *map) -- cgit v1.2.3-58-ga151 From 62827d612ae525695799b3635a087cb49c55e977 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:22 -0800 Subject: bpf: Remove __bpf_local_storage_map_alloc bpf_local_storage_map_alloc() is the only caller of __bpf_local_storage_map_alloc(). The remaining logic in bpf_local_storage_map_alloc() is only a one liner setting the smap->cache_idx. Remove __bpf_local_storage_map_alloc() to simplify code. 
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 63 +++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e19f9f50a60d..f7234a8d4959 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -601,40 +601,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) -{ - struct bpf_local_storage_map *smap; - unsigned int i; - u32 nbuckets; - - smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); - if (!smap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&smap->map, attr); - - nbuckets = roundup_pow_of_two(num_possible_cpus()); - /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - bpf_map_area_free(smap); - return ERR_PTR(-ENOMEM); - } - - for (i = 0; i < nbuckets; i++) { - INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); - } - - smap->elem_size = offsetof(struct bpf_local_storage_elem, - sdata.data[attr->value_size]); - - return smap; -} - int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, @@ -704,10 +670,33 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map *smap; + unsigned int i; + u32 nbuckets; + + smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + nbuckets = roundup_pow_of_two(num_possible_cpus()); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); - smap = __bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); + smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), + nbuckets, GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_area_free(smap); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = offsetof(struct bpf_local_storage_elem, + sdata.data[attr->value_size]); smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; -- cgit v1.2.3-58-ga151 From 121f31f3e00dfc1acbca43f6f35779e050b56cfc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:23 -0800 Subject: bpf: Remove the preceding __ from __bpf_selem_unlink_storage __bpf_selem_unlink_storage is taking the spin lock and there is no name collision also. Having the preceding '__' is confusing when reviewing the later patch. 
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index f7234a8d4959..70df8dcb2066 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -216,8 +216,8 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool use_trace_rcu) +static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool use_trace_rcu) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -288,7 +288,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem, use_trace_rcu); + bpf_selem_unlink_storage(selem, use_trace_rcu); } /* If cacheit_lockit is false, this lookup function is lockless */ -- cgit v1.2.3-58-ga151 From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:24 -0800 Subject: bpf: Remember smap in bpf_local_storage This patch remembers which smap triggers the allocation of a 'struct bpf_local_storage' object. The local_storage is allocated during the very first selem added to the owner. The smap pointer is needed when using the bpf_mem_cache_free in a later patch because it needs to free to the correct smap's bpf_mem_alloc object. When a selem is being removed, it needs to check if it is the selem that triggers the creation of the local_storage. If it is, the local_storage->smap pointer will be reset to NULL. This NULL reset is done under the local_storage->lock in bpf_selem_unlink_storage_nolock() when a selem is being removed. Also note that the local_storage may not go away even local_storage->smap is NULL because there may be other selem still stored in the local_storage. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_local_storage.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 5908a954ddc2..613b1805ed9f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -83,6 +83,7 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 70df8dcb2066..5585dbfd9c66 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -213,6 +213,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor kfree_rcu(selem, rcu); } + if (rcu_access_pointer(local_storage->smap) == smap) + RCU_INIT_POINTER(local_storage->smap, NULL); + return free_local_storage; } @@ -368,6 +371,7 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } + RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; -- cgit v1.2.3-58-ga151 From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:25 -0800 Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage This patch re-purpose the use_trace_rcu to mean if the freed memory can be reused immediately or not. The use_trace_rcu is renamed to reuse_now. Other than the boolean test is reversed, it should be a no-op. The following explains the reason for the rename and how it will be used in a later patch. In a later patch, bpf_mem_cache_alloc/free will be used in the bpf_local_storage. The bpf mem allocator will reuse the freed memory immediately. Some of the free paths in bpf_local_storage does not support memory to be reused immediately. These paths are the "delete" elem cases from the bpf_*_storage_delete() helper and the map_delete_elem() syscall. Note that "delete" elem before the owner's (sk/task/cgrp/inode) lifetime ended is not the common usage for the local storage. The common free path, bpf_local_storage_destroy(), can reuse the memory immediately. This common path means the storage stays with its owner until the owner is destroyed. The above mentioned "delete" elem paths that cannot reuse immediately always has the 'use_trace_rcu == true'. The cases that is safe for immediate reuse always have 'use_trace_rcu == false'. Instead of adding another arg in a later patch, this patch re-purpose this arg to reuse_now and have the test logic reversed. In a later patch, 'reuse_now == true' will free to the bpf_mem_cache_free() where the memory can be reused immediately. 'reuse_now == false' will go through the call_rcu_tasks_trace(). 
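Schematically, the rule the new flag encodes looks like the sketch below (hand-written illustration, not code from this patch; the function name is invented):

  /* sketch: how a selem free path interprets reuse_now */
  static void free_selem_sketch(struct bpf_local_storage_elem *selem, bool reuse_now)
  {
  	if (!reuse_now) {
  		/* "delete" paths: the owner is still alive and sleepable BPF
  		 * programs may still be reading the element, so wait for an
  		 * RCU Tasks Trace grace period before freeing.
  		 */
  		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
  	} else {
  		/* teardown paths where immediate reuse is acceptable; still
  		 * kfree_rcu() for now, a later patch switches this side to
  		 * bpf_mem_cache_free().
  		 */
  		kfree_rcu(selem, rcu);
  	}
  }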
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 24 ++++++++++++------------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 613b1805ed9f..18a31add2255 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 492594d69a86..c975cacdd16b 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -121,7 +121,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 2d25bcfa371b..ad2ab0187e45 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -122,7 +122,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 5585dbfd9c66..70c34a948c3c 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -147,7 +147,7 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) + bool uncharge_mem, bool reuse_now) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -201,7 +201,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor * any special fields. 
*/ rec = smap->map.record; - if (use_trace_rcu) { + if (!reuse_now) { if (!IS_ERR_OR_NULL(rec)) call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu); else @@ -220,7 +220,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor } static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool use_trace_rcu) + bool reuse_now) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -235,11 +235,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, use_trace_rcu); + local_storage, selem, true, reuse_now); raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) { - if (use_trace_rcu) + if (!reuse_now) call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_rcu); else @@ -284,14 +284,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - bpf_selem_unlink_storage(selem, use_trace_rcu); + bpf_selem_unlink_storage(selem, reuse_now); } /* If cacheit_lockit is false, this lookup function is lockless */ @@ -538,7 +538,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, true); + false, false); } unlock: @@ -651,7 +651,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); + local_storage, selem, false, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); @@ -745,7 +745,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, migrate_disable(); this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem, false); + bpf_selem_unlink(selem, true); if (busy_counter) { this_cpu_dec(*busy_counter); migrate_enable(); @@ -783,8 +783,8 @@ void bpf_local_storage_map_free(struct bpf_map *map, /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() * is true, because while call_rcu invocation is skipped in that * case in bpf_selem_free_fields_trace_rcu (and all local - * storage maps pass use_trace_rcu = true), there can be - * call_rcu callbacks based on use_trace_rcu = false in the + * storage maps pass reuse_now = false), there can be + * call_rcu callbacks based on reuse_now = true in the * while ((selem = ...)) loop above or when owner's free path * calls bpf_local_storage_unlink_nolock. 
*/ diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4dcef28744d1..c88cc04c17c1 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -168,7 +168,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 8f56438c104b..a5f185b8e50a 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } -- cgit v1.2.3-58-ga151 From c609981342dca634e5dea8c6ca175b6533581261 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:26 -0800 Subject: bpf: Remove bpf_selem_free_fields*_rcu This patch removes the bpf_selem_free_fields*_rcu. The bpf_obj_free_fields() can be done before the call_rcu_trasks_trace() and kfree_rcu(). It is needed when a later patch uses bpf_mem_cache_alloc/free. In bpf hashtab, bpf_obj_free_fields() is also called before calling bpf_mem_cache_free. The discussion can be found in https://lore.kernel.org/bpf/f67021ee-21d9-bfae-6134-4ca542fab843@linux.dev/ Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 67 ++++-------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 70c34a948c3c..715deaaefe13 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -109,27 +109,6 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } -static void bpf_selem_free_fields_rcu(struct rcu_head *rcu) -{ - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; - - selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - /* protected by the rcu_barrier*() */ - smap = rcu_dereference_protected(SDATA(selem)->smap, true); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - kfree(selem); -} - -static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu) -{ - /* Free directly if Tasks Trace RCU GP also implies RCU GP */ - if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_fields_rcu(rcu); - else - call_rcu(rcu, bpf_selem_free_fields_rcu); -} - static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -151,7 +130,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - struct btf_record *rec; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); @@ -192,26 +170,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - /* A different RCU callback is chosen whenever we need to free - * additional fields in selem data before freeing selem. 
- * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU - * callbacks when it has special fields, hence we can only conditionally - * dereference smap, as by this time the map might have already been - * freed without waiting for our call_rcu callback if it did not have - * any special fields. - */ - rec = smap->map.record; - if (!reuse_now) { - if (!IS_ERR_OR_NULL(rec)) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu); - else - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { - if (!IS_ERR_OR_NULL(rec)) - call_rcu(&selem->rcu, bpf_selem_free_fields_rcu); - else - kfree_rcu(selem, rcu); - } + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (!reuse_now) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); + else + kfree_rcu(selem, rcu); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -769,26 +732,6 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - /* Only delay freeing of smap, buckets are not needed anymore */ kvfree(smap->buckets); - - /* When local storage has special fields, callbacks for - * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will - * keep using the map BTF record, we need to execute an RCU barrier to - * wait for them as the record will be freed right after our map_free - * callback. - */ - if (!IS_ERR_OR_NULL(smap->map.record)) { - rcu_barrier_tasks_trace(); - /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() - * is true, because while call_rcu invocation is skipped in that - * case in bpf_selem_free_fields_trace_rcu (and all local - * storage maps pass reuse_now = false), there can be - * call_rcu callbacks based on reuse_now = true in the - * while ((selem = ...)) loop above or when owner's free path - * calls bpf_local_storage_unlink_nolock. - */ - rcu_barrier(); - } bpf_map_area_free(smap); } -- cgit v1.2.3-58-ga151 From f8ccf30c179ec1ac16654f6e6ceb40cce1530b91 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:27 -0800 Subject: bpf: Add bpf_selem_free_rcu callback Add bpf_selem_free_rcu() callback to do the kfree() instead of using kfree_rcu. It is a preparation work for using bpf_mem_cache_alloc/free in a later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 715deaaefe13..146e9caeda96 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -109,15 +109,20 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } -static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + kfree(selem); +} + +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ if (rcu_trace_implies_rcu_gp()) - kfree(selem); + bpf_selem_free_rcu(rcu); else - kfree_rcu(selem, rcu); + call_rcu(rcu, bpf_selem_free_rcu); } /* local_storage->lock must be held and selem->local_storage == local_storage. 
@@ -174,7 +179,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor if (!reuse_now) call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); else - kfree_rcu(selem, rcu); + call_rcu(&selem->rcu, bpf_selem_free_rcu); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); -- cgit v1.2.3-58-ga151 From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:28 -0800 Subject: bpf: Add bpf_selem_free() This patch refactors the selem freeing logic into bpf_selem_free(). It is a preparation work for a later patch using bpf_mem_cache_alloc/free. The other kfree(selem) cases are also changed to bpf_selem_free(..., reuse_now = true). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++++ kernel/bpf/bpf_local_storage.c | 21 ++++++++++++++------- net/core/bpf_sk_storage.c | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 18a31add2255..a34f61467a2f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -152,6 +152,10 @@ struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now); + int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 146e9caeda96..512943aac435 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -125,6 +125,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_selem_free_rcu); } +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now) +{ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (!reuse_now) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); + else + call_rcu(&selem->rcu, bpf_selem_free_rcu); +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. 
@@ -175,11 +186,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!reuse_now) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - else - call_rcu(&selem->rcu, bpf_selem_free_rcu); + bpf_selem_free(selem, smap, reuse_now); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -423,7 +430,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - kfree(selem); + bpf_selem_free(selem, smap, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -517,7 +524,7 @@ unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (selem) { mem_uncharge(smap, owner, smap->elem_size); - kfree(selem); + bpf_selem_free(selem, smap, true); } return ERR_PTR(err); } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a5f185b8e50a..24c3dc0d62e5 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -197,7 +197,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { - kfree(copy_selem); + bpf_selem_free(copy_selem, smap, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); -- cgit v1.2.3-58-ga151 From 1288aaa2786b1e58c9e88e53f7654d520ebe0f3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:29 -0800 Subject: bpf: Add bpf_local_storage_rcu callback The existing bpf_local_storage_free_rcu is renamed to bpf_local_storage_free_trace_rcu. A new bpf_local_storage_rcu callback is added to do the kfree instead of using kfree_rcu. It is a preparation work for a later patch using bpf_mem_cache_alloc/free. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-11-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 512943aac435..0fbc477c8b27 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -99,14 +99,19 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + kfree(local_storage); +} + +static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) +{ /* If RCU Tasks Trace grace period implies RCU grace period, do * kfree(), else do kfree_rcu(). 
*/ - local_storage = container_of(rcu, struct bpf_local_storage, rcu); if (rcu_trace_implies_rcu_gp()) - kfree(local_storage); + bpf_local_storage_free_rcu(rcu); else - kfree_rcu(local_storage, rcu); + call_rcu(rcu, bpf_local_storage_free_rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) @@ -216,9 +221,9 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, if (free_local_storage) { if (!reuse_now) call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_rcu); + bpf_local_storage_free_trace_rcu); else - kfree_rcu(local_storage, rcu); + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } } @@ -631,7 +636,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_storage) - kfree_rcu(local_storage, rcu); + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) -- cgit v1.2.3-58-ga151 From 7e30a8477b0bdd13dfd0b24e4f32b26d22b96e6c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:30 -0800 Subject: bpf: Add bpf_local_storage_free() This patch refactors local_storage freeing logic into bpf_local_storage_free(). It is a preparation work for a later patch that uses bpf_mem_cache_alloc/free. The other kfree(local_storage) cases are also changed to bpf_local_storage_free(..., reuse_now = true). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-12-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0fbc477c8b27..351d991694cb 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -114,6 +114,16 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_local_storage_free_rcu); } +static void bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool reuse_now) +{ + if (!reuse_now) + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); + else + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); +} + static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -218,13 +228,8 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage, selem, true, reuse_now); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) { - if (!reuse_now) - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_trace_rcu); - else - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); - } + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -391,7 +396,7 @@ int bpf_local_storage_alloc(void *owner, return 0; uncharge: - kfree(storage); + bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -636,7 +641,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_storage) - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + bpf_local_storage_free(local_storage, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) -- cgit v1.2.3-58-ga151 From 
b32a5dae44cc7346835839c2e92356ff4609c823 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:06 -0800 Subject: bpf: verifier: Rename kernel_type_name helper to btf_type_name kernel_type_name was introduced in commit 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") with type signature: const char *kernel_type_name(u32 id) At that time the function used global btf_vmlinux BTF for all id lookups. Later, in commit 22dc4a0f5ed1 ("bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier"), the type signature was changed to: static const char *kernel_type_name(const struct btf* btf, u32 id) With the btf parameter used for lookups instead of global btf_vmlinux. The helper will function as expected for type name lookup using non-kernel BTFs, and will be used for such in further patches in the series. Let's rename it to avoid incorrect assumptions that might arise when seeing the current name. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230309180111.1618459-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0aaf7703326b..073c56dbec63 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -752,7 +752,7 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } -static const char *kernel_type_name(const struct btf* btf, u32 id) +static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } @@ -782,7 +782,7 @@ static const char *iter_type_str(const struct btf *btf, u32 btf_id) return ""; /* we already validated that type is valid and has conforming name */ - return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; } static const char *iter_state_str(enum bpf_iter_state state) @@ -1349,7 +1349,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "%s", reg_type_str(env, t)); if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); /* * _a stands for append, was shortened to avoid multiline statements below. @@ -4518,7 +4518,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); + const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; @@ -4534,7 +4534,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, return -EINVAL; } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ - reg_name = kernel_type_name(reg->btf, reg->btf_id); + reg_name = btf_type_name(reg->btf, reg->btf_id); /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. 
For the @@ -7177,8 +7177,8 @@ found: btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); + regno, btf_type_name(reg->btf, reg->btf_id), + btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } } @@ -7248,7 +7248,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, verbose(env, "R%d must have zero offset when passed to release func\n", regno); verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, - kernel_type_name(reg->btf, reg->btf_id), reg->off); + btf_type_name(reg->btf, reg->btf_id), reg->off); return -EINVAL; } -- cgit v1.2.3-58-ga151 From a4aa38897b6a6dad4318bed036edc7ed0c8a4578 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:07 -0800 Subject: bpf: btf: Remove unused btf_field_info_type enum This enum was added and used in commit aa3496accc41 ("bpf: Refactor kptr_off_tab into btf_record"). Later refactoring in commit db559117828d ("bpf: Consolidate spin_lock, timer management into btf_record") resulted in the enum values no longer being used anywhere. Let's remove them. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230309180111.1618459-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 71758cd15b07..37779ceefd09 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3231,12 +3231,6 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_info_type { - BTF_FIELD_SPIN_LOCK, - BTF_FIELD_TIMER, - BTF_FIELD_KPTR, -}; - enum { BTF_FIELD_IGNORE = 0, BTF_FIELD_FOUND = 1, -- cgit v1.2.3-58-ga151 From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:08 -0800 Subject: bpf: Change btf_record_find enum parameter to field_mask btf_record_find's 3rd parameter can be multiple enum btf_field_type's masked together. The function is called with BPF_KPTR in two places in verifier.c, so it works with masked values already. 
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/syscall.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e64ff1e89fb2..3a38db315f7f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, - u32 offset, enum btf_field_type type); + u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f406dfa13792..cc4b7684910c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b) } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, - enum btf_field_type type) + u32 field_mask) { struct btf_field *field; - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); - if (!field || !(field->type & type)) + if (!field || !(field->type & field_mask)) return NULL; return field; } -- cgit v1.2.3-58-ga151 From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 10 Mar 2023 15:07:41 -0800 Subject: bpf: Support __kptr to local kptrs If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module BTF - it must have been allocated by bpf_obj_new and therefore must be free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new. This patch adds support for treating __kptr-tagged pointers to "local kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr acquire / release semantics. Consider the following example: struct node_data { long key; long data; struct bpf_rb_node node; }; struct map_value { struct node_data __kptr *node; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, int); __type(value, struct map_value); __uint(max_entries, 1); } some_nodes SEC(".maps"); If struct node_data had a matching definition in kernel BTF, the verifier would expect a destructor for the type to be registered. Since struct node_data does not match any type in kernel BTF, the verifier knows that there is no kfunc that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can only come from bpf_obj_new. So instead of searching for a registered dtor, a bpf_obj_drop dtor can be assumed. This allows the runtime to properly destruct such kptrs in bpf_obj_free_fields, which enables maps to clean up map_vals w/ such kptrs when going away. Implementation notes: * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr. Before this patch, the variable would only ever point to vmlinux or module BTFs, but now it can point to some program BTF for local kptr type. It's later used to populate the (btf, btf_id) pair in kptr btf field. * It's necessary to btf_get the program BTF when populating btf_field for local kptr. btf_record_free later does a btf_put. 
* Behavior for non-local referenced kptrs is not modified, as bpf_find_btf_id helper only searches vmlinux and module BTFs for matching BTF type. If such a type is found, btf_field_kptr's btf will pass btf_is_kernel check, and the associated release function is some one-argument dtor. If btf_is_kernel check fails, associated release function is two-arg bpf_obj_drop_impl. Before this patch only btf_field_kptr's w/ kernel or module BTFs were created. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++++++++- include/linux/btf.h | 2 -- kernel/bpf/btf.c | 37 ++++++++++++++++++++++++++++--------- kernel/bpf/helpers.c | 11 ++++++++--- kernel/bpf/syscall.c | 14 +++++++++++++- 5 files changed, 59 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3a38db315f7f..756b85f0d0d3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -189,10 +189,19 @@ enum btf_field_type { BPF_RB_NODE | BPF_RB_ROOT, }; +typedef void (*btf_dtor_kfunc_t)(void *); +typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *); + struct btf_field_kptr { struct btf *btf; struct module *module; - btf_dtor_kfunc_t dtor; + union { + /* dtor used if btf_is_kernel(btf), otherwise the type + * is program-allocated and obj_drop is used + */ + btf_dtor_kfunc_t dtor; + btf_dtor_obj_drop obj_drop; + }; u32 btf_id; }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 1bba0827e8c4..d53b10cc55f2 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -121,8 +121,6 @@ struct btf_struct_metas { struct btf_struct_meta types[]; }; -typedef void (*btf_dtor_kfunc_t)(void *); - extern const struct file_operations btf_fops; void btf_get(struct btf *btf); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 37779ceefd09..66fad7a16b6c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3551,12 +3551,17 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, return -EINVAL; } +extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); + static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { struct module *mod = NULL; const struct btf_type *t; - struct btf *kernel_btf; + /* If a matching btf type is found in kernel or module BTFs, kptr_ref + * is that BTF, otherwise it's program BTF + */ + struct btf *kptr_btf; int ret; s32 id; @@ -3565,7 +3570,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, */ t = btf_type_by_id(btf, info->kptr.type_id); id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); + &kptr_btf); + if (id == -ENOENT) { + /* btf_parse_kptr should only be called w/ btf = program BTF */ + WARN_ON_ONCE(btf_is_kernel(btf)); + + /* Type exists only in program BTF. Assume that it's a MEM_ALLOC + * kptr allocated via bpf_obj_new + */ + field->kptr.dtor = (void *)&__bpf_obj_drop_impl; + id = info->kptr.type_id; + kptr_btf = (struct btf *)btf; + btf_get(kptr_btf); + goto found_dtor; + } if (id < 0) return id; @@ -3582,20 +3600,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, * can be used as a referenced pointer and be stored in a map at * the same time. 
*/ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id); if (dtor_btf_id < 0) { ret = dtor_btf_id; goto end_btf; } - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id); if (!dtor_func) { ret = -ENOENT; goto end_btf; } - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); + if (btf_is_module(kptr_btf)) { + mod = btf_try_get_module(kptr_btf); if (!mod) { ret = -ENXIO; goto end_btf; @@ -3605,7 +3623,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, /* We already verified dtor_func to be btf_type_is_func * in register_btf_id_dtor_kfuncs. */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off); addr = kallsyms_lookup_name(dtor_func_name); if (!addr) { ret = -EINVAL; @@ -3614,14 +3632,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, field->kptr.dtor = (void *)addr; } +found_dtor: field->kptr.btf_id = id; - field->kptr.btf = kernel_btf; + field->kptr.btf = kptr_btf; field->kptr.module = mod; return 0; end_mod: module_put(mod); end_btf: - btf_put(kernel_btf); + btf_put(kptr_btf); return ret; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f9b7eeedce08..77d64b6951b9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1896,14 +1896,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +{ + if (rec) + bpf_obj_free_fields(rec, p); + bpf_mem_free(&bpf_global_ma, p); +} + __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - if (meta) - bpf_obj_free_fields(meta->record, p); - bpf_mem_free(&bpf_global_ma, p); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL); } static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cc4b7684910c..0684febc447a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; + void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: @@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: - field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); + if (!btf_is_kernel(field->kptr.btf)) { + pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, + field->kptr.btf_id); + WARN_ON_ONCE(!pointee_struct_meta); + field->kptr.obj_drop(xchgd_field, pointee_struct_meta ? 
+ pointee_struct_meta->record : + NULL); + } else { + field->kptr.dtor(xchgd_field); + } break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) -- cgit v1.2.3-58-ga151 From 738c96d5e2e3700b370f02ac84a9dc159ca81f25 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 10 Mar 2023 15:07:42 -0800 Subject: bpf: Allow local kptrs to be exchanged via bpf_kptr_xchg The previous patch added necessary plumbing for verifier and runtime to know what to do with non-kernel PTR_TO_BTF_IDs in map values, but didn't provide any way to get such local kptrs into a map value. This patch modifies verifier handling of bpf_kptr_xchg to allow MEM_ALLOC kptr types. check_reg_type is modified accept MEM_ALLOC-flagged input to bpf_kptr_xchg despite such types not being in btf_ptr_types. This could have been done with a MAYBE_MEM_ALLOC equivalent to MAYBE_NULL, but bpf_kptr_xchg is the only helper that I can forsee using MAYBE_MEM_ALLOC, so keep it special-cased for now. The verifier tags bpf_kptr_xchg retval MEM_ALLOC if and only if the BTF associated with the retval is not kernel BTF. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230310230743.2320707-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 073c56dbec63..519d465407ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7123,6 +7123,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + type &= ~MEM_ALLOC; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -7185,7 +7188,8 @@ found: break; } case PTR_TO_BTF_ID | MEM_ALLOC: - if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { + if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && + meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } @@ -9151,6 +9155,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; + if (!btf_is_kernel(ret_btf)) + regs[BPF_REG_0].type |= MEM_ALLOC; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); -- cgit v1.2.3-58-ga151 From 34f0677e7afd3a292bc1aadda7ce8e35faedb204 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 13 Mar 2023 11:40:17 -0700 Subject: bpf: fix precision propagation verbose logging Fix wrong order of frame index vs register/slot index in precision propagation verbose (level 2) output. It's wrong and very confusing as is. 
Fixes: 529409ea92d5 ("bpf: propagate precision across all frames, not just the last one") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230313184017.4083374-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 519d465407ef..883d4ff2e288 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15068,7 +15068,7 @@ static int propagate_precision(struct bpf_verifier_env *env, !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating r%d\n", i, fr); + verbose(env, "frame %d: propagating r%d\n", fr, i); err = mark_chain_precision_frame(env, fr, i); if (err < 0) return err; @@ -15084,7 +15084,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame %d: propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE, fr); + fr, (-i - 1) * BPF_REG_SIZE); err = mark_chain_precision_stack_frame(env, fr, i); if (err < 0) return err; -- cgit v1.2.3-58-ga151 From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Mar 2023 14:46:41 -0700 Subject: bpf: Disable migration when freeing stashed local kptr using obj drop When a local kptr is stashed in a map and freed when the map goes away, currently an error like the below appears: [ 39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875 [ 39.196549] caller is bpf_mem_free+0x56/0xc0 [ 39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G O 6.2.0-13016-g22df776a9a86 #4477 [ 39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 39.198949] Workqueue: events_unbound bpf_map_free_deferred [ 39.199470] Call Trace: [ 39.199703] [ 39.199911] dump_stack_lvl+0x60/0x70 [ 39.200267] check_preemption_disabled+0xbf/0xe0 [ 39.200704] bpf_mem_free+0x56/0xc0 [ 39.201032] ? bpf_obj_new_impl+0xa0/0xa0 [ 39.201430] bpf_obj_free_fields+0x1cd/0x200 [ 39.201838] array_map_free+0xad/0x220 [ 39.202193] ? finish_task_switch+0xe5/0x3c0 [ 39.202614] bpf_map_free_deferred+0xea/0x210 [ 39.203006] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.203460] process_one_work+0x64f/0xbe0 [ 39.203822] ? pwq_dec_nr_in_flight+0x110/0x110 [ 39.204264] ? do_raw_spin_lock+0x107/0x1c0 [ 39.204662] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.205107] worker_thread+0x74/0x7a0 [ 39.205451] ? process_one_work+0xbe0/0xbe0 [ 39.205818] kthread+0x171/0x1a0 [ 39.206111] ? kthread_complete_and_exit+0x20/0x20 [ 39.206552] ret_from_fork+0x1f/0x30 [ 39.206886] This happens because the call to __bpf_obj_drop_impl I added in the patch adding support for stashing local kptrs doesn't disable migration. Prior to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF progarm, whereas now it can be called from map free path, so it's necessary to explicitly disable migration. Also, refactor a bit to just call __bpf_obj_drop_impl directly instead of bothering w/ dtor union and setting pointer-to-obj_drop. 
Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs") Reported-by: Alexei Starovoitov Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++-------- kernel/bpf/btf.c | 4 +--- kernel/bpf/syscall.c | 10 +++++++--- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 756b85f0d0d3..71cc92a4ba48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,18 +190,14 @@ enum btf_field_type { }; typedef void (*btf_dtor_kfunc_t)(void *); -typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *); struct btf_field_kptr { struct btf *btf; struct module *module; - union { - /* dtor used if btf_is_kernel(btf), otherwise the type - * is program-allocated and obj_drop is used - */ - btf_dtor_kfunc_t dtor; - btf_dtor_obj_drop obj_drop; - }; + /* dtor used if btf_is_kernel(btf), otherwise the type is + * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used + */ + btf_dtor_kfunc_t dtor; u32 btf_id; }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 66fad7a16b6c..b7e5a5510b91 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3551,8 +3551,6 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, return -EINVAL; } -extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, struct btf_field_info *info) { @@ -3578,7 +3576,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, /* Type exists only in program BTF. Assume that it's a MEM_ALLOC * kptr allocated via bpf_obj_new */ - field->kptr.dtor = (void *)&__bpf_obj_drop_impl; + field->kptr.dtor = NULL; id = info->kptr.type_id; kptr_btf = (struct btf *)btf; btf_get(kptr_btf); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0684febc447a..5b88301a2ae0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -679,9 +681,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); WARN_ON_ONCE(!pointee_struct_meta); - field->kptr.obj_drop(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + migrate_disable(); + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? + pointee_struct_meta->record : + NULL); + migrate_enable(); } else { field->kptr.dtor(xchgd_field); } -- cgit v1.2.3-58-ga151 From c9267aa8b794c2188d49c7d7bd2990e98b2d6b84 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 13 Mar 2023 16:58:43 -0700 Subject: bpf: Fix bpf_strncmp proto. bpf_strncmp() doesn't write into its first argument. Make sure that the verifier knows about it. 
Signed-off-by: Alexei Starovoitov Acked-by: David Vernet Link: https://lore.kernel.org/r/20230313235845.61029-2-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 77d64b6951b9..f753676ef652 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -571,7 +571,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; -- cgit v1.2.3-58-ga151 From 3e30be4288b31702d4898487a74e80ba14150a9f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 13 Mar 2023 16:58:44 -0700 Subject: bpf: Allow helpers access trusted PTR_TO_BTF_ID. The verifier rejects the code: bpf_strncmp(task->comm, 16, "my_task"); with the message: 16: (85) call bpf_strncmp#182 R1 type=trusted_ptr_ expected=fp, pkt, pkt_meta, map_key, map_value, mem, ringbuf_mem, buf Teach the verifier that such access pattern is safe. Do not allow untrusted and legacy ptr_to_btf_id to be passed into helpers. Reported-by: David Vernet Signed-off-by: Alexei Starovoitov Acked-by: David Vernet Link: https://lore.kernel.org/r/20230313235845.61029-3-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 883d4ff2e288..2bbd89279070 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6303,6 +6303,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_BTF_ID: + return check_ptr_to_btf_access(env, regs, regno, reg->off, + access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, * (because we are in a program of type SYSCALL for example), we @@ -7014,6 +7017,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MEM, PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, + PTR_TO_BTF_ID | PTR_TRUSTED, }, }; @@ -7145,6 +7149,17 @@ found: if (base_type(reg->type) != PTR_TO_BTF_ID) return 0; + if (compatible == &mem_types) { + if (!(arg_type & MEM_RDONLY)) { + verbose(env, + "%s() may write into memory pointed by R%d type=%s\n", + func_id_name(meta->func_id), + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + return 0; + } + switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: -- cgit v1.2.3-58-ga151 From b8a2e3f93d412114a1539ea97b59b3e6ed6e1f9a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Mar 2023 11:59:49 -1000 Subject: cgroup: Make current_cgns_cgroup_dfl() safe to call after exit_task_namespace() The commit 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc") added bpf_cgroup_from_id() which calls current_cgns_cgroup_dfl() through cgroup_get_from_id(). However, BPF programs may be attached to a point where current->nsproxy has already been cleared to NULL by exit_task_namespace() and calling bpf_cgroup_from_id() would cause an oops. Just return the system-wide root if nsproxy has been cleared. This allows all cgroups to be looked up after the task passed through exit_task_namespace(), which semantically makes sense. 
Given that the only way to get this behavior is through BPF programs, it seems safe but let's see what others think. Fixes: 332ea1f697be ("bpf: Add bpf_cgroup_from_id() kfunc") Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/ZBDuVWiFj2jiz3i8@slm.duckdns.org Signed-off-by: Alexei Starovoitov --- kernel/cgroup/cgroup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 935e8121b21e..8a5294f4ce72 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1465,8 +1465,18 @@ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; - cset = current->nsproxy->cgroup_ns->root_cset; - return __cset_cgroup_from_root(cset, &cgrp_dfl_root); + if (current->nsproxy) { + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); + } else { + /* + * NOTE: This function may be called from bpf_cgroup_from_id() + * on a task which has already passed exit_task_namespaces() and + * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all + * cgroups visible for lookups. + */ + return &cgrp_dfl_root.cgrp; + } } /* look up cgroup associated with given css_set on the specified hierarchy */ -- cgit v1.2.3-58-ga151 From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 10 Mar 2023 08:40:59 +0100 Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm to functions located in modules: 1. The verifier tries to find the address to attach to in kallsyms. This is always done by searching the entire kallsyms, not respecting the module in which the function is located. Such approach causes an incorrect attachment address to be computed if the function to attach to is shadowed by a function of the same name located earlier in kallsyms. 2. If the address to attach to is located in a module, the module reference is only acquired in register_fentry. If the module is unloaded between the place where the address is found (bpf_check_attach_target in the verifier) and register_fentry, it is possible that another module is loaded to the same address which may lead to potential errors. Since the attachment must contain the BTF of the program to attach to, we extract the module from it and search for the function address in the correct module (resolving problem no. 1). Then, the module reference is taken directly in bpf_check_attach_target and stored in the bpf program (in bpf_prog_aux). The reference is only released when the program is unloaded (resolving problem no. 2). 
Signed-off-by: Viktor Malik Acked-by: Jiri Olsa Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 6 ++++++ kernel/bpf/trampoline.c | 28 ---------------------------- kernel/bpf/verifier.c | 18 +++++++++++++++++- kernel/module/internal.h | 5 +++++ 5 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 71cc92a4ba48..3ef98fb92987 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1103,6 +1103,7 @@ struct bpf_trampoline { struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; + struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; @@ -1406,6 +1407,7 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5b88301a2ae0..099e9068bcdd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2067,6 +2067,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); + module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); @@ -3113,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; + if (tgt_info.tgt_mod) { + module_put(prog->aux->mod); + prog->aux->mod = tgt_info.tgt_mod; + } + tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d0ed7d6f5eec..f61d5138b12b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -172,26 +171,6 @@ out: return tr; } -static int bpf_trampoline_module_get(struct bpf_trampoline *tr) -{ - struct module *mod; - int err = 0; - - preempt_disable(); - mod = __module_text_address((unsigned long) tr->func.addr); - if (mod && !try_module_get(mod)) - err = -ENOENT; - preempt_enable(); - tr->mod = mod; - return err; -} - -static void bpf_trampoline_module_put(struct bpf_trampoline *tr) -{ - module_put(tr->mod); - tr->mod = NULL; -} - static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; @@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); - if (!ret) - bpf_trampoline_module_put(tr); return ret; } @@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) tr->func.ftrace_managed = true; } - if (bpf_trampoline_module_get(tr)) - return -ENOENT; - if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); @@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } - if (ret) - bpf_trampoline_module_put(tr); return ret; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2bbd89279070..60793f793ca6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24,6 +24,7 @@ #include #include #include +#include "../module/internal.h" #include "disasm.h" @@ -18307,6 +18308,7 
@@ int bpf_check_attach_target(struct bpf_verifier_log *log, const char *tname; struct btf *btf; long addr = 0; + struct module *mod = NULL; if (!btf_id) { bpf_log(log, "Tracing programs must provide btf_id\n"); @@ -18480,8 +18482,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, else addr = (long) tgt_prog->aux->func[subprog]->bpf_func; } else { - addr = kallsyms_lookup_name(tname); + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (mod) + addr = find_kallsyms_symbol_value(mod, tname); + else + addr = 0; + } else { + addr = kallsyms_lookup_name(tname); + } if (!addr) { + module_put(mod); bpf_log(log, "The address of function %s cannot be found\n", tname); @@ -18521,11 +18532,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, break; } if (ret) { + module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); return ret; } } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { + module_put(mod); bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } @@ -18534,6 +18547,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { + module_put(mod); bpf_log(log, "%s() is not modifiable\n", tname); return ret; } @@ -18544,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, tgt_info->tgt_addr = addr; tgt_info->tgt_name = tname; tgt_info->tgt_type = t; + tgt_info->tgt_mod = mod; return 0; } @@ -18623,6 +18638,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* store info about the attachment target that will be used later */ prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + prog->aux->mod = tgt_info.tgt_mod; if (tgt_prog) { prog->aux->saved_dst_prog_type = tgt_prog->type; diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 2e2bf236f558..5c9170f9135c 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect) static inline void init_build_id(struct module *mod, const struct load_info *info) { } static inline void layout_symtab(struct module *mod, struct load_info *info) { } static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} #endif /* CONFIG_KALLSYMS */ #ifdef CONFIG_SYSFS -- cgit v1.2.3-58-ga151 From 77473d1a962f3d4f7ba48324502b6d27b8ef2591 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 16 Mar 2023 00:40:24 -0500 Subject: bpf: Free struct bpf_cpumask in call_rcu handler The struct bpf_cpumask type uses the bpf_mem_cache_{alloc,free}() APIs to allocate and free its cpumasks. The bpf_mem allocator may currently immediately reuse some memory when its freed, without waiting for an RCU read cycle to elapse. We want to be able to treat struct bpf_cpumask objects as completely RCU safe. This is necessary for two reasons: 1. bpf_cpumask_kptr_get() currently does an RCU-protected refcnt_inc_not_zero(). This of course assumes that the underlying memory is not reused, and is therefore unsafe in its current form. 2. We want to be able to get rid of bpf_cpumask_kptr_get() entirely, and intead use the superior kptr RCU semantics now afforded by the verifier. This patch fixes (1), and enables (2), by making struct bpf_cpumask RCU safe. 
A subsequent patch will update the verifier to allow struct bpf_cpumask * pointers to be passed to KF_RCU kfuncs, and then a latter patch will remove bpf_cpumask_kptr_get(). Fixes: 516f4d3397c9 ("bpf: Enable cpumasks to be queried and used as kptrs") Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230316054028.88924-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index b6587ec40f1b..98eea62b6b7b 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -9,6 +9,7 @@ /** * struct bpf_cpumask - refcounted BPF cpumask wrapper structure * @cpumask: The actual cpumask embedded in the struct. + * @rcu: The RCU head used to free the cpumask with RCU safety. * @usage: Object reference counter. When the refcount goes to 0, the * memory is released back to the BPF allocator, which provides * RCU safety. @@ -24,6 +25,7 @@ */ struct bpf_cpumask { cpumask_t cpumask; + struct rcu_head rcu; refcount_t usage; }; @@ -108,6 +110,16 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas return cpumask; } +static void cpumask_free_cb(struct rcu_head *head) +{ + struct bpf_cpumask *cpumask; + + cpumask = container_of(head, struct bpf_cpumask, rcu); + migrate_disable(); + bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); + migrate_enable(); +} + /** * bpf_cpumask_release() - Release a previously acquired BPF cpumask. * @cpumask: The cpumask being released. @@ -121,11 +133,8 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) if (!cpumask) return; - if (refcount_dec_and_test(&cpumask->usage)) { - migrate_disable(); - bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); - migrate_enable(); - } + if (refcount_dec_and_test(&cpumask->usage)) + call_rcu(&cpumask->rcu, cpumask_free_cb); } /** -- cgit v1.2.3-58-ga151 From 63d2d83d21a6e2c6f019da5b2d5cdabe6d1cb951 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 16 Mar 2023 00:40:25 -0500 Subject: bpf: Mark struct bpf_cpumask as rcu protected struct bpf_cpumask is a BPF-wrapper around the struct cpumask type which can be instantiated by a BPF program, and then queried as a cpumask in similar fashion to normal kernel code. The previous patch in this series makes the type fully RCU safe, so the type can be included in the rcu_protected_type BTF ID list. A subsequent patch will remove bpf_cpumask_kptr_get(), as it's no longer useful now that we can just treat the type as RCU safe by default and do our own if check. 
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230316054028.88924-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 60793f793ca6..15b5c5c729f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4599,6 +4599,7 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(struct, cgroup) +BTF_ID(struct, bpf_cpumask) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) -- cgit v1.2.3-58-ga151 From 1b403ce77dfbf234723a91bc411dfb03a0499d6e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 16 Mar 2023 00:40:27 -0500 Subject: bpf: Remove bpf_cpumask_kptr_get() kfunc Now that struct bpf_cpumask is RCU safe, there's no need for this kfunc. Rather than doing the following: private(MASK) static struct bpf_cpumask __kptr *global; int BPF_PROG(prog, s32 cpu, ...) { struct bpf_cpumask *cpumask; bpf_rcu_read_lock(); cpumask = bpf_cpumask_kptr_get(&global); if (!cpumask) { bpf_rcu_read_unlock(); return -1; } bpf_cpumask_setall(cpumask); ... bpf_cpumask_release(cpumask); bpf_rcu_read_unlock(); } Programs can instead simply do (assume same global cpumask): int BPF_PROG(prog, ...) { struct bpf_cpumask *cpumask; bpf_rcu_read_lock(); cpumask = global; if (!cpumask) { bpf_rcu_read_unlock(); return -1; } bpf_cpumask_setall(cpumask); ... bpf_rcu_read_unlock(); } In other words, no extra atomic acquire / release, and less boilerplate code. This patch removes both the kfunc, as well as its selftests and documentation. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230316054028.88924-5-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 29 --------------------- tools/testing/selftests/bpf/prog_tests/cpumask.c | 1 - tools/testing/selftests/bpf/progs/cpumask_common.h | 1 - .../testing/selftests/bpf/progs/cpumask_failure.c | 24 ----------------- .../testing/selftests/bpf/progs/cpumask_success.c | 30 ---------------------- 5 files changed, 85 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 98eea62b6b7b..db9da2194c1a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -82,34 +82,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) return cpumask; } -/** - * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask - * stored in a map. - * @cpumaskp: A pointer to a BPF cpumask map value. - * - * Attempts to acquire a reference to a BPF cpumask stored in a map value. The - * cpumask returned by this function must either be embedded in a map as a - * kptr, or freed with bpf_cpumask_release(). This function may return NULL if - * no BPF cpumask was found in the specified map value. - */ -__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) -{ - struct bpf_cpumask *cpumask; - - /* The BPF memory allocator frees memory backing its caches in an RCU - * callback. Thus, we can safely use RCU to ensure that the cpumask is - * safe to read. 
- */ - rcu_read_lock(); - - cpumask = READ_ONCE(*cpumaskp); - if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) - cpumask = NULL; - - rcu_read_unlock(); - return cpumask; -} - static void cpumask_free_cb(struct rcu_head *head) { struct bpf_cpumask *cpumask; @@ -435,7 +407,6 @@ BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c index 6c0fe23498c7..cdf4acc18e4c 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpumask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c @@ -16,7 +16,6 @@ static const char * const cpumask_success_testcases[] = { "test_copy_any_anyand", "test_insert_leave", "test_insert_remove_release", - "test_insert_kptr_get_release", "test_global_mask_rcu", }; diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index 7623782fbd62..0c5b785a93e4 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -26,7 +26,6 @@ struct array_map { struct bpf_cpumask *bpf_cpumask_create(void) __ksym; void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumask) __ksym; u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 9f726d55f747..db4f94e72b61 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -94,30 +94,6 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_ return 0; } -SEC("tp_btf/task_newtask") -__failure __msg("Unreleased reference") -int BPF_PROG(test_kptr_get_no_release, struct task_struct *task, u64 clone_flags) -{ - struct bpf_cpumask *cpumask; - struct __cpumask_map_value *v; - - cpumask = create_cpumask(); - if (!cpumask) - return 0; - - if (cpumask_map_insert(cpumask)) - return 0; - - v = cpumask_map_value_lookup(); - if (!v) - return 0; - - cpumask = bpf_cpumask_kptr_get(&v->cpumask); - - /* cpumask is never released. 
*/ - return 0; -} - SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index fe928ff72a06..2fcdd7f68ac7 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -394,36 +394,6 @@ int BPF_PROG(test_insert_remove_release, struct task_struct *task, u64 clone_fla return 0; } -SEC("tp_btf/task_newtask") -int BPF_PROG(test_insert_kptr_get_release, struct task_struct *task, u64 clone_flags) -{ - struct bpf_cpumask *cpumask; - struct __cpumask_map_value *v; - - cpumask = create_cpumask(); - if (!cpumask) - return 0; - - if (cpumask_map_insert(cpumask)) { - err = 3; - return 0; - } - - v = cpumask_map_value_lookup(); - if (!v) { - err = 4; - return 0; - } - - cpumask = bpf_cpumask_kptr_get(&v->cpumask); - if (cpumask) - bpf_cpumask_release(cpumask); - else - err = 5; - - return 0; -} - SEC("tp_btf/task_newtask") int BPF_PROG(test_global_mask_rcu, struct task_struct *task, u64 clone_flags) { -- cgit v1.2.3-58-ga151 From 082cdc69a4651dd2a77539d69416a359ed1214f5 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Wed, 15 Mar 2023 17:54:00 +0100 Subject: bpf: Remove misleading spec_v1 check on var-offset stack read For every BPF_ADD/SUB involving a pointer, adjust_ptr_min_max_vals() ensures that the resulting pointer has a constant offset if bypass_spec_v1 is false. This is ensured by calling sanitize_check_bounds() which in turn calls check_stack_access_for_ptr_arithmetic(). There, -EACCESS is returned if the register's offset is not constant, thereby rejecting the program. In summary, an unprivileged user must never be able to create stack pointers with a variable offset. That is also the case, because a respective check in check_stack_write() is missing. If they were able to create a variable-offset pointer, users could still use it in a stack-write operation to trigger unsafe speculative behavior [1]. Because unprivileged users must already be prevented from creating variable-offset stack pointers, viable options are to either remove this check (replacing it with a clarifying comment), or to turn it into a "verifier BUG"-message, also adding a similar check in check_stack_write() (for consistency, as a second-level defense). This patch implements the first option to reduce verifier bloat. This check was introduced by commit 01f810ace9ed ("bpf: Allow variable-offset stack access") which correctly notes that "variable-offset reads and writes are disallowed (they were already disallowed for the indirect access case) because the speculative execution checking code doesn't support them". However, it does not further discuss why the check in check_stack_read() is necessary. The code which made this check obsolete was also introduced in this commit. I have compiled ~650 programs from the Linux selftests, Linux samples, Cilium, and libbpf/examples projects and confirmed that none of these trigger the check in check_stack_read() [2]. Instead, all of these programs are, as expected, already rejected when constructing the variable-offset pointers. Note that the check in check_stack_access_for_ptr_arithmetic() also prints "off=%d" while the code removed by this patch does not (the error removed does not appear in the "verification_error" values). 
For reproducibility, the repository linked includes the raw data and scripts used to create the plot. [1] https://arxiv.org/pdf/1807.03757.pdf [2] https://gitlab.cs.fau.de/un65esoq/bpf-spectre/-/raw/53dc19fcf459c186613b1156a81504b39c8d49db/data/plots/23-02-26_23-56_bpftool/bpftool/0004-errors.pdf?inline=false Fixes: 01f810ace9ed ("bpf: Allow variable-offset stack access") Signed-off-by: Luis Gerhorst Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230315165358.23701-1-gerhorst@cs.fau.de --- kernel/bpf/verifier.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15b5c5c729f9..d62b7127ff2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4303,17 +4303,13 @@ static int check_stack_read(struct bpf_verifier_env *env, } /* Variable offset is prohibited for unprivileged mode for simplicity * since it requires corresponding support in Spectre masking for stack - * ALU. See also retrieve_ptr_limit(). + * ALU. See also retrieve_ptr_limit(). The check in + * check_stack_access_for_ptr_arithmetic() called by + * adjust_ptr_min_max_vals() prevents users from creating stack pointers + * with variable offsets, therefore no check is required here. Further, + * just checking it here would be insufficient as speculative stack + * writes could still lead to unsafe speculative behaviour. */ - if (!env->bypass_spec_v1 && var_off) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - ptr_regno, tn_buf); - return -EACCES; - } - if (!var_off) { off += reg->var_off.value; err = check_stack_read_fixed_off(env, state, off, size, -- cgit v1.2.3-58-ga151 From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 17 Mar 2023 10:56:01 +0100 Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header Moving find_kallsyms_symbol_value from kernel/module/internal.h to include/linux/module.h. The reason is that internal.h is not prepared to be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is used by kernel/bpf/verifier.c and including internal.h from it (without modules) leads into a compilation error: In file included from ../include/linux/container_of.h:5, from ../include/linux/list.h:5, from ../include/linux/timer.h:5, from ../include/linux/workqueue.h:9, from ../include/linux/bpf.h:10, from ../include/linux/bpf-cgroup.h:5, from ../kernel/bpf/verifier.c:7: ../kernel/bpf/../module/internal.h: In function 'mod_find': ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module' 20 | static_assert(__same_type(*(ptr), ((type *)0)->member) || \ | ^~ [...] This patch fixes the above error. 
Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules") Reported-by: kernel test robot Signed-off-by: Viktor Malik Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/ Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com --- include/linux/module.h | 8 ++++++++ kernel/bpf/verifier.c | 2 +- kernel/module/internal.h | 6 ------ 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 4435ad9439ab..41cfd3be57e5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) @@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name) return 0; } +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} + static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d62b7127ff2a..99394a2f7ee4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24,7 +24,7 @@ #include #include #include -#include "../module/internal.h" +#include #include "disasm.h" diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 5c9170f9135c..1c877561a7d2 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod, void init_build_id(struct module *mod, const struct load_info *info); void layout_symtab(struct module *mod, struct load_info *info); void add_kallsyms(struct module *mod, const struct load_info *info); -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); static inline bool sect_empty(const Elf_Shdr *sect) { @@ -256,11 +255,6 @@ static inline bool sect_empty(const Elf_Shdr *sect) static inline void init_build_id(struct module *mod, const struct load_info *info) { } static inline void layout_symtab(struct module *mod, struct load_info *info) { } static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } -static inline unsigned long find_kallsyms_symbol_value(struct module *mod, - const char *name) -{ - return 0; -} #endif /* CONFIG_KALLSYMS */ #ifdef CONFIG_SYSFS -- cgit v1.2.3-58-ga151 From 58aa2afbb1e61fcf35bfcc819952a3c13d9f9203 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 17 Mar 2023 13:19:17 -0700 Subject: bpf: Allow ld_imm64 instruction to point to kfunc. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow ld_imm64 insn with BPF_PSEUDO_BTF_ID to hold the address of kfunc. The ld_imm64 pointing to a valid kfunc will be seen as non-null PTR_TO_MEM by is_branch_taken() logic of the verifier, while libbpf will resolve address to unknown kfunc as ld_imm64 reg, 0 which will also be recognized by is_branch_taken() and the verifier will proceed dead code elimination. 
BPF programs can use this logic to detect at load time whether kfunc is present in the kernel with bpf_ksym_exists() macro that is introduced in the next patches. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Martin KaFai Lau Reviewed-by: Toke Høiland-Jørgensen Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230317201920.62030-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99394a2f7ee4..8bc44f5dc5b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15952,8 +15952,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, goto err_put; } - if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + if (!btf_type_is_var(t) && !btf_type_is_func(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id); err = -EINVAL; goto err_put; } @@ -15966,6 +15966,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -ENOENT; goto err_put; } + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + if (btf_type_is_func(t)) { + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; + aux->btf_var.mem_size = 0; + goto check_btf; + } datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { @@ -15978,9 +15986,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, } } - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; - type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { @@ -16008,7 +16013,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } - +check_btf: /* check whether we recorded this BTF (and maybe module) already */ for (i = 0; i < env->used_btf_cnt; i++) { if (env->used_btfs[i].btf == btf) { -- cgit v1.2.3-58-ga151 From 1057d299459657b85e593a4b6294a000f920672a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 Mar 2023 13:38:52 -0700 Subject: bpf: Teach the verifier to recognize rdonly_mem as not null. Teach the verifier to recognize PTR_TO_MEM | MEM_RDONLY as not NULL otherwise if (!bpf_ksym_exists(known_kfunc)) doesn't go through dead code elimination. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230321203854.3035-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8bc44f5dc5b6..5693e4a92752 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -486,8 +486,17 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool type_may_be_null(u32 type) +{ + return type & PTR_MAYBE_NULL; +} + static bool reg_type_not_null(enum bpf_reg_type type) { + if (type_may_be_null(type)) + return false; + + type = base_type(type); return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || @@ -531,11 +540,6 @@ static bool type_is_rdonly_mem(u32 type) return type & MEM_RDONLY; } -static bool type_may_be_null(u32 type) -{ - return type & PTR_MAYBE_NULL; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { -- cgit v1.2.3-58-ga151 From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 22 Mar 2023 12:47:54 -0700 Subject: bpf: return long from bpf_map_ops funcs This patch changes the return types of bpf_map_ops functions to long, where previously int was returned. Using long allows bpf programs to maintain the sign bit in the absence of sign extension during situations where inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative error is returned. The definitions of the helper funcs are generated from comments in the bpf uapi header at `include/uapi/linux/bpf.h`. The return type of these helpers was previously changed from int to long in commit bdb7b79b4ce8. For any case where one of the map helpers calls a bpf_map_ops func that still returns a 32-bit int, a compiler might not include sign extension instructions to properly convert the 32-bit negative value to a 64-bit negative value. For example: bpf assembly excerpt of an inlined helper calling a kernel function and checking for a specific error: ; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST); ... 46: call 0xffffffffe103291c ; htab_map_update_elem ; if (err && err != -EEXIST) { 4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax kernel function assembly excerpt of return value from `htab_map_update_elem` returning 32-bit int: movl $0xffffffef, %r9d ...
movl %r9d, %eax ...results in the comparison: cmp $0xffffffffffffffef, $0x00000000ffffffef Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long") Tested-by: Eduard Zingerman Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++------- include/linux/filter.h | 6 +++--- kernel/bpf/arraymap.c | 12 ++++++------ kernel/bpf/bloom_filter.c | 12 ++++++------ kernel/bpf/bpf_cgrp_storage.c | 6 +++--- kernel/bpf/bpf_inode_storage.c | 6 +++--- kernel/bpf/bpf_struct_ops.c | 6 +++--- kernel/bpf/bpf_task_storage.c | 6 +++--- kernel/bpf/cpumap.c | 8 ++++---- kernel/bpf/devmap.c | 24 ++++++++++++------------ kernel/bpf/hashtab.c | 36 ++++++++++++++++++------------------ kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 6 +++--- kernel/bpf/queue_stack_maps.c | 22 +++++++++++----------- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/ringbuf.c | 6 +++--- kernel/bpf/stackmap.c | 6 +++--- kernel/bpf/verifier.c | 14 +++++++------- net/core/bpf_sk_storage.c | 6 +++--- net/core/sock_map.c | 8 ++++---- net/xdp/xskmap.c | 8 ++++---- 21 files changed, 110 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ef98fb92987..ec0df059f562 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,11 +96,11 @@ struct bpf_map_ops { /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); - int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); - int (*map_delete_elem)(struct bpf_map *map, void *key); - int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); - int (*map_pop_elem)(struct bpf_map *map, void *value); - int (*map_peek_elem)(struct bpf_map *map, void *value); + long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); + long (*map_delete_elem)(struct bpf_map *map, void *key); + long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + long (*map_pop_elem)(struct bpf_map *map, void *value); + long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ @@ -139,7 +139,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); + long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. 
It is a runtime check to ensure @@ -157,7 +157,7 @@ struct bpf_map_ops { int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); - int (*map_for_each_callback)(struct bpf_map *map, + long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index efa5d4a1677e..23c08c31bea9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, - u64 flags, const u64 flag_mask, - void *lookup_elem(struct bpf_map *map, u32 key)) +static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, + u64 flags, const u64 flag_mask, + void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1588c793a715..2058e89b5ddd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key } /* Called from syscall or from eBPF program */ -static int array_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, } /* Called from syscall or from eBPF program */ -static int array_map_delete_elem(struct bpf_map *map, void *key) +static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } @@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; -static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 6350c5d35a9b..db19784601a7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -41,7 +41,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value, return h & bloom->bitset_mask; } -static int bloom_map_peek_elem(struct bpf_map *map, void *value) +static long bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -56,7 +56,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value) return 0; } -static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) +static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct 
bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -73,12 +73,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) return 0; } -static int bloom_map_pop_elem(struct bpf_map *map, void *value) +static long bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } -static int bloom_map_delete_elem(struct bpf_map *map, void *value) +static long bloom_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } @@ -177,8 +177,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-EINVAL); } -static int bloom_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bloom_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index c975cacdd16b..f5b016a5484d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -93,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? sdata->data : NULL; } -static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; @@ -125,7 +125,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) return 0; } -static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index ad2ab0187e45..9a5f05151898 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -91,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? 
sdata->data : NULL; } -static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct file *f; @@ -127,7 +127,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) return 0; } -static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { struct file *f; int fd, err; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 38903fb52f98..ba7a94276e3b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -349,8 +349,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, model, flags, tlinks, NULL); } -static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops *st_ops = st_map->st_ops; @@ -524,7 +524,7 @@ unlock: return err; } -static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) { enum bpf_struct_ops_state prev_state; struct bpf_struct_ops_map *st_map; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index c88cc04c17c1..ab5bd1ef58c4 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -120,8 +120,8 @@ out: return ERR_PTR(err); } -static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct task_struct *task; @@ -173,7 +173,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, return 0; } -static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) { struct task_struct *task; unsigned int f_flags; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 871809e71b4e..8ec18faa74ac 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -static int cpu_map_delete_elem(struct bpf_map *map, void *key) +static long cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpumap_val cpumap_value = {}; @@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); diff --git a/kernel/bpf/devmap.c 
b/kernel/bpf/devmap.c index 19b036a228f7..802692fa3905 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) kfree(dev); } -static int dev_map_delete_elem(struct bpf_map *map, void *key) +static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -897,8 +897,8 @@ err_out: return ERR_PTR(-EINVAL); } -static int __dev_map_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -999,21 +999,21 @@ out_err: return err; } -static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0df4b0c10f59..96b645bba3a4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1073,8 +1073,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, } /* Called from syscall or from eBPF program */ -static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1175,8 +1175,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) bpf_lru_push_free(&htab->lru, &elem->lru_node); } -static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void 
*value, - u64 map_flags) +static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; @@ -1242,9 +1242,9 @@ err: return ret; } -static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1297,9 +1297,9 @@ err: return ret; } -static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1364,21 +1364,21 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } -static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, false); } /* Called from syscall or from eBPF program */ -static int htab_map_delete_elem(struct bpf_map *map, void *key) +static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -1414,7 +1414,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } -static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -2134,8 +2134,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; -static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a993560f200a..4c7bbec4a9e4 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long cgroup_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; @@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) bpf_map_area_free(map); } -static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +static long cgroup_storage_delete_elem(struct bpf_map *map, void 
*key) { return -EINVAL; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index dc23f2ac9cde..e0d3ddf2037a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, } /* Called from syscall or from eBPF program */ -static int trie_update_elem(struct bpf_map *map, - void *_key, void *value, u64 flags) +static long trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; @@ -431,7 +431,7 @@ out: } /* Called from syscall or from eBPF program */ -static int trie_delete_elem(struct bpf_map *map, void *_key) +static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 63ecbbcb349d..601609164ef3 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map) bpf_map_area_free(qs); } -static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +static long __queue_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -124,7 +124,7 @@ out: } -static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +static long __stack_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -156,32 +156,32 @@ out: } /* Called from syscall or from eBPF program */ -static int queue_map_peek_elem(struct bpf_map *map, void *value) +static long queue_map_peek_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int stack_map_peek_elem(struct bpf_map *map, void *value) +static long stack_map_peek_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int queue_map_pop_elem(struct bpf_map *map, void *value) +static long queue_map_pop_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int stack_map_pop_elem(struct bpf_map *map, void *value) +static long stack_map_pop_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int queue_stack_map_push_elem(struct bpf_map *map, void *value, - u64 flags) +static long queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long irq_flags; @@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall or from eBPF program */ -static int queue_stack_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +static long queue_stack_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } diff --git a/kernel/bpf/reuseport_array.c 
b/kernel/bpf/reuseport_array.c index 71cb72f5b733..cbf2d8d784b8 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall only */ -static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0d2a45ff83f1..875ac9b698d9 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-ENOTSUPP); } -static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 flags) +static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) { return -ENOTSUPP; } -static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 0f1d8dced933..b25fce425b2c 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key, return 0; } -static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int stack_map_delete_elem(struct bpf_map *map, void *key) +static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5693e4a92752..50c995697f0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17692,21 +17692,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (int (*)(struct bpf_map *map, void *key))NULL)); + (long (*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (int (*)(struct bpf_map *map, void *key, void *value, + (long (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (int (*)(struct bpf_map *map, void *value, + (long (*)(struct bpf_map *map, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); + (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (int (*)(struct bpf_map *map, + (long (*)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 24c3dc0d62e5..cb0f5a105b89 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ 
-94,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(err); } -static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct socket *sock; @@ -114,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, return err; } -static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) { struct socket *sock; int fd, err; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9b854e236d23..7c189c2e2fbf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, __sock_map_delete(stab, sk, link_raw); } -static int sock_map_delete_elem(struct bpf_map *map, void *key) +static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; @@ -587,8 +587,8 @@ out: return ret; } -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; @@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, raw_spin_unlock_bh(&bucket->lock); } -static int sock_hash_delete_elem(struct bpf_map *map, void *key) +static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c38d7175922..2c1427074a3b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) return ERR_PTR(-EOPNOTSUPP); } -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; @@ -223,7 +223,7 @@ out: return err; } -static int xsk_map_delete_elem(struct bpf_map *map, void *key) +static long xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; @@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); -- cgit v1.2.3-58-ga151 From 7be14c1c9030f73cc18b4ff23b78a0a081f16188 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 22 Mar 2023 22:30:55 +0100 Subject: bpf: Fix __reg_bound_offset 64->32 var_off subreg propagation Xu reports that after commit 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking"), the following BPF program is rejected by the verifier: 0: (61) r2 = *(u32 *)(r1 +0) ; R2_w=pkt(off=0,r=0,imm=0) 1: (61) r3 = *(u32 *)(r1 +4) ; R3_w=pkt_end(off=0,imm=0) 2: (bf) r1 = r2 3: (07) r1 += 1 4: (2d) if r1 > r3 goto pc+8 5: (71) r1 = *(u8 *)(r2 +0) ; 
R1_w=scalar(umax=255,var_off=(0x0; 0xff)) 6: (18) r0 = 0x7fffffffffffff10 8: (0f) r1 += r0 ; R1_w=scalar(umin=0x7fffffffffffff10,umax=0x800000000000000f) 9: (18) r0 = 0x8000000000000000 11: (07) r0 += 1 12: (ad) if r0 < r1 goto pc-2 13: (b7) r0 = 0 14: (95) exit And the verifier log says: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (61) r2 = *(u32 *)(r1 +0) ; R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=0,imm=0) 1: (61) r3 = *(u32 *)(r1 +4) ; R1=ctx(off=0,imm=0) R3_w=pkt_end(off=0,imm=0) 2: (bf) r1 = r2 ; R1_w=pkt(off=0,r=0,imm=0) R2_w=pkt(off=0,r=0,imm=0) 3: (07) r1 += 1 ; R1_w=pkt(off=1,r=0,imm=0) 4: (2d) if r1 > r3 goto pc+8 ; R1_w=pkt(off=1,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) 5: (71) r1 = *(u8 *)(r2 +0) ; R1_w=scalar(umax=255,var_off=(0x0; 0xff)) R2_w=pkt(off=0,r=1,imm=0) 6: (18) r0 = 0x7fffffffffffff10 ; R0_w=9223372036854775568 8: (0f) r1 += r0 ; R0_w=9223372036854775568 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775823,s32_min=-240,s32_max=15) 9: (18) r0 = 0x8000000000000000 ; R0_w=-9223372036854775808 11: (07) r0 += 1 ; R0_w=-9223372036854775807 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775809) 13: (b7) r0 = 0 ; R0_w=0 14: (95) exit from 12 to 11: R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775806 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775806 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775810,var_off=(0x8000000000000000; 0xffffffff)) 13: safe [...] from 12 to 11: R0_w=-9223372036854775795 R1=scalar(umin=9223372036854775822,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775794 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775794 R1=scalar(umin=9223372036854775822,umax=9223372036854775822,var_off=(0x8000000000000000; 0xffffffff)) 13: safe from 12 to 11: R0_w=-9223372036854775794 R1=scalar(umin=9223372036854775823,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775793 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775793 R1=scalar(umin=9223372036854775823,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) 13: safe from 12 to 11: R0_w=-9223372036854775793 R1=scalar(umin=9223372036854775824,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775792 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775792 R1=scalar(umin=9223372036854775824,umax=9223372036854775823,var_off=(0x8000000000000000; 0xffffffff)) 13: safe [...] The 64bit umin=9223372036854775810 bound continuously bumps by +1 while umax=9223372036854775823 stays as-is until the verifier complexity limit is reached and the program gets finally rejected. During this simulation, the umin also eventually surpasses umax. 
Looking at the first 'from 12 to 11' output line from the loop, R1 has the following state: R1_w=scalar(umin=0x8000000000000002 (9223372036854775810), umax=0x800000000000000f (9223372036854775823), var_off=(0x8000000000000000; 0xffffffff)) The var_off has technically not an inconsistent state but it's very imprecise and far off surpassing 64bit umax bounds whereas the expected output with refined known bits in var_off should have been like: R1_w=scalar(umin=0x8000000000000002 (9223372036854775810), umax=0x800000000000000f (9223372036854775823), var_off=(0x8000000000000000; 0xf)) In the above log, var_off stays as var_off=(0x8000000000000000; 0xffffffff) and does not converge into a narrower mask where more bits become known, eventually transforming R1 into a constant upon umin=9223372036854775823, umax=9223372036854775823 case where the verifier would have terminated and let the program pass. The __reg_combine_64_into_32() marks the subregister unknown and propagates 64bit {s,u}min/{s,u}max bounds to their 32bit equivalents iff they are within the 32bit universe. The question came up whether __reg_combine_64_into_32() should special case the situation that when 64bit {s,u}min bounds have the same value as 64bit {s,u}max bounds to then assign the latter as well to the 32bit reg->{s,u}32_{min,max}_value. As can be seen from the above example however, that is just /one/ special case and not a /generic/ solution given above example would still not be addressed this way and remain at an imprecise var_off=(0x8000000000000000; 0xffffffff). The improvement is needed in __reg_bound_offset() to refine var32_off with the updated var64_off instead of the prior reg->var_off. The reg_bounds_sync() code first refines information about the register's min/max bounds via __update_reg_bounds() from the current var_off, then in __reg_deduce_bounds() from sign bit and with the potentially learned bits from bounds it'll update the var_off tnum in __reg_bound_offset(). For example, intersecting with the old var_off might have improved bounds slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), then new var_off will then result in (0; 0x7f...fc). The intersected var64_off holds then the universe which is a superset of var32_off. The point for the latter is not to broaden, but to further refine known bits based on the intersection of var_off with 32 bit bounds, so that we later construct the final var_off from upper and lower 32 bits. The final __update_reg_bounds() can then potentially still slightly refine bounds if more bits became known from the new var_off. 
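To make the tnum arithmetic above concrete, the following stand-alone sketch re-implements the relevant tnum helpers according to their kernel semantics (treat it as an approximation for illustration, not the kernel code itself) and replays the R1 state quoted from the log, assuming the 32-bit bounds were still unknown at that point:

  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t u64;

  /* tnum: bits set in .mask are unknown, .value carries the known bits */
  struct tnum { u64 value; u64 mask; };
  #define TNUM(v, m) ((struct tnum){ .value = (v), .mask = (m) })

  static struct tnum tnum_range(u64 min, u64 max)
  {
          u64 chi = min ^ max, delta;
          int bits = chi ? 64 - __builtin_clzll(chi) : 0;

          if (bits > 63)                      /* bit 63 differs: fully unknown */
                  return TNUM(0, ~0ULL);
          delta = (1ULL << bits) - 1;
          return TNUM(min & ~delta, delta);
  }

  static struct tnum tnum_intersect(struct tnum a, struct tnum b)
  {
          u64 v = a.value | b.value, mu = a.mask & b.mask;

          return TNUM(v & ~mu, mu);
  }

  static struct tnum tnum_or(struct tnum a, struct tnum b)
  {
          u64 v = a.value | b.value, mu = a.mask | b.mask;

          return TNUM(v, mu & ~v);
  }

  static struct tnum tnum_subreg(struct tnum a)
  {
          return TNUM((uint32_t)a.value, (uint32_t)a.mask);
  }

  static struct tnum tnum_clear_subreg(struct tnum a)
  {
          return TNUM(a.value & ~0xffffffffULL, a.mask & ~0xffffffffULL);
  }

  int main(void)
  {
          /* R1 from the first 'from 12 to 11' state quoted above */
          struct tnum var_off = TNUM(0x8000000000000000ULL, 0xffffffffULL);
          u64 umin = 0x8000000000000002ULL, umax = 0x800000000000000fULL;
          u64 u32_min = 0, u32_max = 0xffffffffULL;  /* assumed: subreg still unknown */

          struct tnum var64_off = tnum_intersect(var_off, tnum_range(umin, umax));
          /* before the fix: subreg taken from the old reg->var_off */
          struct tnum old32 = tnum_intersect(tnum_subreg(var_off), tnum_range(u32_min, u32_max));
          /* after the fix: subreg taken from the refined var64_off */
          struct tnum new32 = tnum_intersect(tnum_subreg(var64_off), tnum_range(u32_min, u32_max));

          struct tnum old_off = tnum_or(tnum_clear_subreg(var64_off), old32);
          struct tnum new_off = tnum_or(tnum_clear_subreg(var64_off), new32);

          printf("old var_off=(%#llx; %#llx)\n",
                 (unsigned long long)old_off.value, (unsigned long long)old_off.mask);
          printf("new var_off=(%#llx; %#llx)\n",
                 (unsigned long long)new_off.value, (unsigned long long)new_off.mask);
          return 0;
  }

The old variant stays at (0x8000000000000000; 0xffffffff) while the new one narrows to (0x8000000000000000; 0xf), i.e. exactly the refined var_off the verifier needs in order to eventually collapse R1 into a constant.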
After the improvement, we can see R1 converging successively: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (61) r2 = *(u32 *)(r1 +0) ; R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=0,imm=0) 1: (61) r3 = *(u32 *)(r1 +4) ; R1=ctx(off=0,imm=0) R3_w=pkt_end(off=0,imm=0) 2: (bf) r1 = r2 ; R1_w=pkt(off=0,r=0,imm=0) R2_w=pkt(off=0,r=0,imm=0) 3: (07) r1 += 1 ; R1_w=pkt(off=1,r=0,imm=0) 4: (2d) if r1 > r3 goto pc+8 ; R1_w=pkt(off=1,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) 5: (71) r1 = *(u8 *)(r2 +0) ; R1_w=scalar(umax=255,var_off=(0x0; 0xff)) R2_w=pkt(off=0,r=1,imm=0) 6: (18) r0 = 0x7fffffffffffff10 ; R0_w=9223372036854775568 8: (0f) r1 += r0 ; R0_w=9223372036854775568 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775823,s32_min=-240,s32_max=15) 9: (18) r0 = 0x8000000000000000 ; R0_w=-9223372036854775808 11: (07) r0 += 1 ; R0_w=-9223372036854775807 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775568,umax=9223372036854775809) 13: (b7) r0 = 0 ; R0_w=0 14: (95) exit from 12 to 11: R0_w=-9223372036854775807 R1_w=scalar(umin=9223372036854775810,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775806 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775806 R1_w=-9223372036854775806 13: safe from 12 to 11: R0_w=-9223372036854775806 R1_w=scalar(umin=9223372036854775811,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775805 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775805 R1_w=-9223372036854775805 13: safe [...] from 12 to 11: R0_w=-9223372036854775798 R1=scalar(umin=9223372036854775819,umax=9223372036854775823,var_off=(0x8000000000000008; 0x7),s32_min=8,s32_max=15,u32_min=8,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775797 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775797 R1=-9223372036854775797 13: safe from 12 to 11: R0_w=-9223372036854775797 R1=scalar(umin=9223372036854775820,umax=9223372036854775823,var_off=(0x800000000000000c; 0x3),s32_min=12,s32_max=15,u32_min=12,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775796 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775796 R1=-9223372036854775796 13: safe from 12 to 11: R0_w=-9223372036854775796 R1=scalar(umin=9223372036854775821,umax=9223372036854775823,var_off=(0x800000000000000c; 0x3),s32_min=12,s32_max=15,u32_min=12,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775795 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775795 R1=-9223372036854775795 13: safe from 12 to 11: R0_w=-9223372036854775795 R1=scalar(umin=9223372036854775822,umax=9223372036854775823,var_off=(0x800000000000000e; 0x1),s32_min=14,s32_max=15,u32_min=14,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775794 12: (ad) if r0 < r1 goto pc-2 ; R0_w=-9223372036854775794 R1=-9223372036854775794 13: safe from 12 to 11: R0_w=-9223372036854775794 R1=-9223372036854775793 R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 11: (07) r0 += 1 ; R0_w=-9223372036854775793 12: (ad) if r0 < r1 goto pc-2 last_idx 12 first_idx 12 parent didn't have regs=1 stack=0 marks: R0_rw=P-9223372036854775801 
R1_r=scalar(umin=9223372036854775815,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 last_idx 11 first_idx 11 regs=1 stack=0 before 11: (07) r0 += 1 parent didn't have regs=1 stack=0 marks: R0_rw=P-9223372036854775805 R1_rw=scalar(umin=9223372036854775812,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 last_idx 12 first_idx 0 regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=1 stack=0 before 11: (07) r0 += 1 regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=1 stack=0 before 11: (07) r0 += 1 regs=1 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=1 stack=0 before 11: (07) r0 += 1 regs=1 stack=0 before 9: (18) r0 = 0x8000000000000000 last_idx 12 first_idx 12 parent didn't have regs=2 stack=0 marks: R0_rw=P-9223372036854775801 R1_r=Pscalar(umin=9223372036854775815,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2=pkt(off=0,r=1,imm=0) R3=pkt_end(off=0,imm=0) R10=fp0 last_idx 11 first_idx 11 regs=2 stack=0 before 11: (07) r0 += 1 parent didn't have regs=2 stack=0 marks: R0_rw=P-9223372036854775805 R1_rw=Pscalar(umin=9223372036854775812,umax=9223372036854775823,var_off=(0x8000000000000000; 0xf),s32_min=0,s32_max=15,u32_max=15) R2_w=pkt(off=0,r=1,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 last_idx 12 first_idx 0 regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=2 stack=0 before 11: (07) r0 += 1 regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=2 stack=0 before 11: (07) r0 += 1 regs=2 stack=0 before 12: (ad) if r0 < r1 goto pc-2 regs=2 stack=0 before 11: (07) r0 += 1 regs=2 stack=0 before 9: (18) r0 = 0x8000000000000000 regs=2 stack=0 before 8: (0f) r1 += r0 regs=3 stack=0 before 6: (18) r0 = 0x7fffffffffffff10 regs=2 stack=0 before 5: (71) r1 = *(u8 *)(r2 +0) 13: safe from 4 to 13: safe verification time 322 usec stack depth 0 processed 56 insns (limit 1000000) max_states_per_insn 1 total_states 3 peak_states 3 mark_read 1 This also fixes up a test case along with this improvement where we match on the verifier log. The updated log now has a refined var_off, too. 
Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Xu Kuohai Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20230314203424.4015351-2-xukuohai@huaweicloud.com Link: https://lore.kernel.org/bpf/20230322213056.2470-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/align.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 50c995697f0e..fd2f216de920 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2149,9 +2149,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) struct tnum var64_off = tnum_intersect(reg->var_off, tnum_range(reg->umin_value, reg->umax_value)); - struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index c94fa8d6c4f6..b92770592563 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -575,14 +575,14 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n), >= 76 */ {14, "R7_w=scalar(umin=76,umax=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=3,off=0,r=0,umin=2,umax=1082,var_off=(0x2; 0xfffffffc)"}, + {16, "R5_w=pkt(id=3,off=0,r=0,umin=2,umax=1082,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=3,off=0,r=4,umin=2,umax=1082,var_off=(0x2; 0xfffffffc)"}, + {20, "R5=pkt(id=3,off=0,r=4,umin=2,umax=1082,var_off=(0x2; 0x7fc)"}, }, }, }; -- cgit v1.2.3-58-ga151 From b63cbc490e18d893632929b8faa55bb28da3fcd4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 22 Mar 2023 16:25:02 -0700 Subject: bpf: remember meta->iter info only for initialized iters For iter_new() functions iterator state's slot might not be yet initialized, in which case iter_get_spi() will return -ERANGE. This is expected and is handled properly. But for iter_next() and iter_destroy() cases iter slot is supposed to be initialized and correct, so -ERANGE is not possible. Move meta->iter.{spi,frameno} initialization into iter_next/iter_destroy handling branch to make it more explicit that valid information will be remembered in meta->iter block for subsequent use in process_iter_next_call(), avoiding confusingly looking -ERANGE assignment for meta->iter.spi. 
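For readers unfamiliar with the iter_new/iter_next/iter_destroy kfuncs referenced here, a minimal usage sketch based on the numbers iterator from the companion open-coded iterator series looks roughly like this; the bpf_iter_num_* declarations and struct bpf_iter_num are assumed to come from that series (via vmlinux.h or the uapi header):

  #include "vmlinux.h"                 /* assumed to provide struct bpf_iter_num */
  #include <bpf/bpf_helpers.h>

  extern int  bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;

  SEC("raw_tp/sys_enter")
  int sum_first_five(const void *ctx)
  {
          struct bpf_iter_num it;  /* stack slots: uninit for _new, initialized for _next/_destroy */
          int sum = 0, *v;

          bpf_iter_num_new(&it, 0, 5);          /* is_iter_new_kfunc() path */
          while ((v = bpf_iter_num_next(&it)))  /* initialized slot, spi must be valid */
                  sum += *v;
          bpf_iter_num_destroy(&it);            /* releases the iterator state */

          bpf_printk("sum=%d", sum);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";

The _new call is the only one allowed to see an uninitialized slot, which is why -ERANGE from iter_get_spi() is expected there and nowhere else.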
Reported-by: Dan Carpenter Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230322232502.836171-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fd2f216de920..64f06f6e16bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6778,13 +6778,6 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ nr_slots = t->size / BPF_REG_SIZE; - spi = iter_get_spi(env, reg, nr_slots); - if (spi < 0 && spi != -ERANGE) - return spi; - - meta->iter.spi = spi; - meta->iter.frameno = reg->frameno; - if (is_iter_new_kfunc(meta)) { /* bpf_iter__new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { @@ -6811,10 +6804,17 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return -EINVAL; } + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + err = mark_iter_read(env, reg, spi, nr_slots); if (err) return err; + /* remember meta->iter info for process_iter_next_call() */ + meta->iter.spi = spi; + meta->iter.frameno = reg->frameno; meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { -- cgit v1.2.3-58-ga151 From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:23:58 -0700 Subject: bpf: Retire the struct_ops map kvalue->refcnt. We have replaced kvalue->refcnt with synchronize_rcu() to wait for an RCU grace period. Maintenance of kvalue->refcnt was a complicated task, as we had to keep two reference counts in sync: kvalue->refcnt itself and the reference count of the bpf_map. When kvalue->refcnt reaches zero, we also have to reduce the reference count on the bpf_map - yet these steps are not performed in an atomic manner and require us to be vigilant when managing them. By eliminating kvalue->refcnt, we can make the maintenance more straightforward, as the refcount of the bpf_map is now the only one that needs to be managed. To prevent the trampoline image of a struct_ops from being released while it is still in use, we wait for an RCU grace period. The setsockopt(TCP_CONGESTION, "...") command allows you to change your socket's congestion control algorithm and can result in releasing the old struct_ops implementation. That is fine on its own; however, this functionality is also exposed through bpf_setsockopt(), so it may be triggered from BPF programs as well. To ensure that the trampoline image belonging to a struct_ops can be safely called while its method is in use, the trampoline safeguards the BPF program with rcu_read_lock(). Doing so prevents any destruction of the associated images before returning from a trampoline and requires us to wait for an RCU grace period.
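As a rough sketch of the user-space side of the scenario described above (names are placeholders; "bpf_cubic_x" stands in for whatever name the struct_ops congestion control registered), switching a socket away from a BPF congestion control is what drops the old struct_ops reference while its trampoline may still be executing:

  #include <string.h>
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <netinet/tcp.h>

  int main(void)
  {
          const char *bpf_cc = "bpf_cubic_x";   /* placeholder struct_ops CC name */
          const char *builtin_cc = "cubic";
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          if (fd < 0)
                  return 1;

          /* Attach the BPF congestion control to this socket... */
          if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, bpf_cc, strlen(bpf_cc)))
                  perror("switch to bpf cc");

          /* ...and switch away again: this drops a reference on the struct_ops
           * (bpf_struct_ops_put()), the transition the RCU grace period protects.
           */
          if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, builtin_cc, strlen(builtin_cc)))
                  perror("switch back");

          close(fd);
          return 0;
  }

The same transition can be made from a BPF program via bpf_setsockopt(), which is why the patch relies on RCU rather than a user-space-only reference count.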
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 77 ++++++++++++++++++++++++++------------------- kernel/bpf/syscall.c | 6 ++-- 3 files changed, 49 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ec0df059f562..f04098468d7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ba7a94276e3b..2f3c4a0e03ee 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -11,6 +11,7 @@ #include #include #include +#include enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, @@ -249,6 +250,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; struct bpf_struct_ops_value *uvalue, *kvalue; enum bpf_struct_ops_state state; + s64 refcnt; if (unlikely(*(u32 *)key != 0)) return -ENOENT; @@ -267,7 +269,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + /* This value offers the user space a general estimate of how + * many sockets are still utilizing this struct_ops for TCP + * congestion control. The number might not be exact, but it + * should sufficiently meet our present goals. + */ + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); + refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0)); return 0; } @@ -491,7 +500,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *(unsigned long *)(udata + moff) = prog->aux->id; } - refcount_set(&kvalue->refcnt, 1); bpf_map_inc(map); set_memory_rox((long)st_map->image, 1); @@ -536,8 +544,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); - if (refcount_dec_and_test(&st_map->kvalue.refcnt)) - bpf_map_put(map); + bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: return -EINPROGRESS; @@ -570,7 +577,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, kfree(value); } -static void bpf_struct_ops_map_free(struct bpf_map *map) +static void __bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; @@ -582,6 +589,28 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) bpf_map_area_free(st_map); } +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + /* The struct_ops's function may switch to another struct_ops. + * + * For example, bpf_tcp_cc_x->init() may switch to + * another tcp_cc_y by calling + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). 
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called + * and its refcount may reach 0 which then free its + * trampoline image while tcp_cc_x is still running. + * + * A vanilla rcu gp is to wait for all bpf-tcp-cc prog + * to finish. bpf-tcp-cc prog is non sleepable. + * A rcu_tasks gp is to wait for the last few insn + * in the tramopline image to finish before releasing + * the trampoline image. + */ + synchronize_rcu_mult(call_rcu, call_rcu_tasks); + + __bpf_struct_ops_map_free(map); +} + static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || @@ -630,7 +659,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); if (!st_map->uvalue || !st_map->links || !st_map->image) { - bpf_struct_ops_map_free(map); + __bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -676,41 +705,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { bool bpf_struct_ops_get(const void *kdata) { struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); - return refcount_inc_not_zero(&kvalue->refcnt); -} - -static void bpf_struct_ops_put_rcu(struct rcu_head *head) -{ - struct bpf_struct_ops_map *st_map; - - st_map = container_of(head, struct bpf_struct_ops_map, rcu); - bpf_map_put(&st_map->map); + map = __bpf_map_inc_not_zero(&st_map->map, false); + return !IS_ERR(map); } void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); - if (refcount_dec_and_test(&kvalue->refcnt)) { - struct bpf_struct_ops_map *st_map; - - st_map = container_of(kvalue, struct bpf_struct_ops_map, - kvalue); - /* The struct_ops's function may switch to another struct_ops. - * - * For example, bpf_tcp_cc_x->init() may switch to - * another tcp_cc_y by calling - * setsockopt(TCP_CONGESTION, "tcp_cc_y"). - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called - * and its map->refcnt may reach 0 which then free its - * trampoline image while tcp_cc_x is still running. - * - * Thus, a rcu grace period is needed here. - */ - call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); - } + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + bpf_map_put(&st_map->map); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 099e9068bcdd..cff0348a2871 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +/* map_idr_lock should have been held or the map should have been + * protected by rcu read lock. + */ +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; -- cgit v1.2.3-58-ga151 From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:00 -0700 Subject: bpf: Create links for BPF struct_ops maps. Make bpf_link support struct_ops. Previously, struct_ops were always used alone without any associated links. Upon updating its value, a struct_ops would be activated automatically. 
Yet other BPF program types required to make a bpf_link with their instances before they could become active. Now, however, you can create an inactive struct_ops, and create a link to activate it later. With bpf_links, struct_ops has a behavior similar to other BPF program types. You can pin/unpin them from their links and the struct_ops will be deactivated when its link is removed while previously need someone to delete the value for it to be deactivated. bpf_links are responsible for registering their associated struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag set to create a bpf_link, while a structs without this flag behaves in the same manner as before and is registered upon updating its value. The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used to craft the links for BPF struct_ops programs, but also to create links for BPF struct_ops them-self. Since the links of BPF struct_ops programs are only used to create trampolines internally, they are never seen in other contexts. Thus, they can be reused for struct_ops themself. To maintain a reference to the map supporting this link, we add bpf_struct_ops_link as an additional type. The pointer of the map is RCU and won't be necessary until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 7 ++ include/uapi/linux/bpf.h | 12 +++- kernel/bpf/bpf_struct_ops.c | 143 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 23 ++++--- net/ipv4/bpf_tcp_ca.c | 8 ++- tools/include/uapi/linux/bpf.h | 12 +++- 6 files changed, 190 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f04098468d7a..8552279efe46 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1518,6 +1518,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; @@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) else module_put(owner); } +int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 13129df937cd..42f40ee083bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1033,6 +1033,7 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, BPF_LSM_CGROUP, + BPF_STRUCT_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -1266,6 +1267,9 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), }; /* Flags for BPF_PROG_QUERY. 
*/ @@ -1507,7 +1511,10 @@ union bpf_attr { } task_fd_query; struct { /* struct used by BPF_LINK_CREATE command */ - __u32 prog_fd; /* eBPF program to attach */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; union { __u32 target_fd; /* object to attach to */ __u32 target_ifindex; /* target ifindex */ @@ -6379,6 +6386,9 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 2f3c4a0e03ee..3d6b5240c25a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -17,6 +17,7 @@ enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE, + BPF_STRUCT_OPS_STATE_READY, }; #define BPF_STRUCT_OPS_COMMON_VALUE \ @@ -59,6 +60,11 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; +}; + #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) @@ -500,11 +506,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *(unsigned long *)(udata + moff) = prog->aux->id; } - bpf_map_inc(map); + if (st_map->map.map_flags & BPF_F_LINK) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + set_memory_rox((long)st_map->image, 1); + /* Let bpf_link handle registration & unregistration. + * + * Pair with smp_load_acquire() during lookup_elem(). + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); + goto unlock; + } set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { + /* This refcnt increment on the map here after + * 'st_ops->reg()' is secure since the state of the + * map must be set to INIT at this moment, and thus + * bpf_struct_ops_map_delete_elem() can't unregister + * or transition it to TOBEFREE concurrently. + */ + bpf_map_inc(map); /* Pair with smp_load_acquire() during lookup_elem(). * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. 
@@ -520,7 +544,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, */ set_memory_nx((long)st_map->image, 1); set_memory_rw((long)st_map->image, 1); - bpf_map_put(map); reset_unlock: bpf_struct_ops_map_put_progs(st_map); @@ -538,6 +561,9 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) struct bpf_struct_ops_map *st_map; st_map = (struct bpf_struct_ops_map *)map; + if (st_map->map.map_flags & BPF_F_LINK) + return -EOPNOTSUPP; + prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); @@ -614,7 +640,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || - attr->map_flags || !attr->btf_vmlinux_value_type_id) + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } @@ -638,6 +664,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) if (attr->value_size != vt->size) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_LINK && !st_ops->validate) + return ERR_PTR(-EOPNOTSUPP); + t = st_ops->type; st_map_size = sizeof(*st_map) + @@ -725,3 +754,111 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } + +static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && + map->map_flags & BPF_F_LINK && + /* Pair with smp_store_release() during map_update */ + smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY; +} + +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_struct_ops_map *st_map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + st_map = (struct bpf_struct_ops_map *) + rcu_dereference_protected(st_link->map, true); + if (st_map) { + /* st_link->map can be NULL if + * bpf_struct_ops_link_create() fails to register. 
+ */ + st_map->st_ops->unreg(&st_map->kvalue.data); + bpf_map_put(&st_map->map); + } + kfree(st_link); +} + +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_map *map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + rcu_read_lock(); + map = rcu_dereference(st_link->map); + seq_printf(seq, "map_id:\t%d\n", map->id); + rcu_read_unlock(); +} + +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_map *map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + rcu_read_lock(); + map = rcu_dereference(st_link->map); + info->struct_ops.map_id = map->id; + rcu_read_unlock(); + return 0; +} + +static const struct bpf_link_ops bpf_struct_ops_map_lops = { + .dealloc = bpf_struct_ops_map_link_dealloc, + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, +}; + +int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + struct bpf_struct_ops_link *link = NULL; + struct bpf_link_primer link_primer; + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; + int err; + + map = bpf_map_get(attr->link_create.map_fd); + if (!map) + return -EINVAL; + + st_map = (struct bpf_struct_ops_map *)map; + + if (!bpf_struct_ops_valid_to_reg(map)) { + err = -EINVAL; + goto err_out; + } + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto err_out; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto err_out; + + err = st_map->st_ops->reg(st_map->kvalue.data); + if (err) { + bpf_link_cleanup(&link_primer); + link = NULL; + goto err_out; + } + RCU_INIT_POINTER(link->map, map); + + return bpf_link_settle(&link_primer); + +err_out: + bpf_map_put(map); + kfree(link); + return err; +} + diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cff0348a2871..21f76698875c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2825,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" - "link_id:\t%u\n" - "prog_tag:\t%s\n" - "prog_id:\t%u\n", + "link_id:\t%u\n", bpf_link_type_strs[link->type], - link->id, - prog_tag, - prog->aux->id); + link->id); + if (prog) { + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + prog_tag, + prog->aux->id); + } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } @@ -4314,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file, info.type = link->type; info.id = link->id; - info.prog_id = link->prog->aux->id; + if (link->prog) + info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); @@ -4577,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; + if (attr->link_create.attach_type == BPF_STRUCT_OPS) + return bpf_struct_ops_link_create(attr); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 13fc0c185cd9..bbbd5eb94db2 100644 
--- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)) <= 0) return -EINVAL; - if (tcp_ca_find(utcp_ca->name)) - return -EEXIST; return 1; } @@ -266,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } +static int bpf_tcp_ca_validate(void *kdata) +{ + return tcp_validate_congestion_control(kdata); +} + struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, @@ -273,6 +276,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .check_member = bpf_tcp_ca_check_member, .init_member = bpf_tcp_ca_init_member, .init = bpf_tcp_ca_init, + .validate = bpf_tcp_ca_validate, .name = "tcp_congestion_ops", }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 13129df937cd..9cf1deaf21f2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1033,6 +1033,7 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, BPF_LSM_CGROUP, + BPF_STRUCT_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -1266,6 +1267,9 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), }; /* Flags for BPF_PROG_QUERY. */ @@ -1507,7 +1511,10 @@ union bpf_attr { } task_fd_query; struct { /* struct used by BPF_LINK_CREATE command */ - __u32 prog_fd; /* eBPF program to attach */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* eBPF struct_ops to attach */ + }; union { __u32 target_fd; /* object to attach to */ __u32 target_ifindex; /* target ifindex */ @@ -6379,6 +6386,9 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops; }; } __attribute__((aligned(8))); -- cgit v1.2.3-58-ga151 From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. Extend the BPF_LINK_UPDATE command of bpf() so that a single bpf_link can conveniently be switched between different struct_ops, enabling a smoother transition from one struct_ops to another. The struct_ops maps passed along with BPF_LINK_UPDATE must have the BPF_F_LINK flag.
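For illustration, a minimal user-space sketch of the intended flow (not taken from this patch; the wrapper names and fd variables are placeholders, and the updated uapi header above is assumed). It first attaches a BPF_F_LINK struct_ops map with BPF_LINK_CREATE, then swaps in another map of the same struct_ops type with BPF_LINK_UPDATE:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    /* Attach a BPF_F_LINK struct_ops map; returns a link fd, or -1 with errno set. */
    static int struct_ops_link_create(int map_fd)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.link_create.map_fd = map_fd;
            attr.link_create.attach_type = BPF_STRUCT_OPS;

            return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
    }

    /* Switch the link over to new_map_fd, but only if old_map_fd is still attached. */
    static int struct_ops_link_update(int link_fd, int new_map_fd, int old_map_fd)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.link_update.link_fd = link_fd;
            attr.link_update.new_map_fd = new_map_fd;
            attr.link_update.flags = BPF_F_REPLACE;
            attr.link_update.old_map_fd = old_map_fd;

            return syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));
    }

Both maps must be struct_ops maps of the same type created with BPF_F_LINK; it is the link, not the map value update, that registers the struct_ops, and detaching or closing the link unregisters it.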
Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 21 +++++++++++++----- kernel/bpf/bpf_struct_ops.c | 48 +++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++ net/ipv4/bpf_tcp_ca.c | 6 ++++++ tools/include/uapi/linux/bpf.h | 21 +++++++++++++----- 6 files changed, 122 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8552279efe46..2d8f3f639e68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1476,6 +1476,8 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *old_map); }; struct bpf_tramp_link { @@ -1518,6 +1520,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42f40ee083bf..e3d3b5160d26 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1555,12 +1555,23 @@ union bpf_attr { struct { /* struct used by BPF_LINK_UPDATE command */ __u32 link_fd; /* link fd */ - /* new program fd to update link with */ - __u32 new_prog_fd; + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; __u32 flags; /* extra flags */ - /* expected link's program fd; is specified only if - * BPF_F_REPLACE flag is set in flags */ - __u32 old_prog_fd; + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. 
+ */ + __u32 old_map_fd; + }; } link_update; struct { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 3d6b5240c25a..6401deca3b56 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -65,6 +65,8 @@ struct bpf_struct_ops_link { struct bpf_map __rcu *map; }; +static DEFINE_MUTEX(update_mutex); + #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) @@ -664,7 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) if (attr->value_size != vt->size) return ERR_PTR(-EINVAL); - if (attr->map_flags & BPF_F_LINK && !st_ops->validate) + if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) return ERR_PTR(-EOPNOTSUPP); t = st_ops->type; @@ -810,10 +812,54 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, return 0; } +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *expected_old_map) +{ + struct bpf_struct_ops_map *st_map, *old_st_map; + struct bpf_map *old_map; + struct bpf_struct_ops_link *st_link; + int err = 0; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + st_map = container_of(new_map, struct bpf_struct_ops_map, map); + + if (!bpf_struct_ops_valid_to_reg(new_map)) + return -EINVAL; + + mutex_lock(&update_mutex); + + old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (expected_old_map && old_map != expected_old_map) { + err = -EPERM; + goto err_out; + } + + old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); + /* The new and old struct_ops must be the same type. */ + if (st_map->st_ops != old_st_map->st_ops) { + err = -EINVAL; + goto err_out; + } + + err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); + if (err) + goto err_out; + + bpf_map_inc(new_map); + rcu_assign_pointer(st_link->map, new_map); + bpf_map_put(old_map); + +err_out: + mutex_unlock(&update_mutex); + + return err; +} + static const struct bpf_link_ops bpf_struct_ops_map_lops = { .dealloc = bpf_struct_ops_map_link_dealloc, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, + .update_map = bpf_struct_ops_map_link_update, }; int bpf_struct_ops_link_create(union bpf_attr *attr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 21f76698875c..b4d758fa5981 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4682,6 +4682,35 @@ out: return ret; } +static int link_update_map(struct bpf_link *link, union bpf_attr *attr) +{ + struct bpf_map *new_map, *old_map = NULL; + int ret; + + new_map = bpf_map_get(attr->link_update.new_map_fd); + if (IS_ERR(new_map)) + return -EINVAL; + + if (attr->link_update.flags & BPF_F_REPLACE) { + old_map = bpf_map_get(attr->link_update.old_map_fd); + if (IS_ERR(old_map)) { + ret = -EINVAL; + goto out_put; + } + } else if (attr->link_update.old_map_fd) { + ret = -EINVAL; + goto out_put; + } + + ret = link->ops->update_map(link, new_map, old_map); + + if (old_map) + bpf_map_put(old_map); +out_put: + bpf_map_put(new_map); + return ret; +} + #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) @@ -4702,6 +4731,11 @@ static int link_update(union bpf_attr *attr) if (IS_ERR(link)) return PTR_ERR(link); + if (link->ops->update_map) { + ret = link_update_map(link, attr); + goto out_put_link; + } + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if 
(IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index bbbd5eb94db2..e8b27826283e 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -264,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } +static int bpf_tcp_ca_update(void *kdata, void *old_kdata) +{ + return tcp_update_congestion_control(kdata, old_kdata); +} + static int bpf_tcp_ca_validate(void *kdata) { return tcp_validate_congestion_control(kdata); @@ -273,6 +278,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, + .update = bpf_tcp_ca_update, .check_member = bpf_tcp_ca_check_member, .init_member = bpf_tcp_ca_init_member, .init = bpf_tcp_ca_init, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9cf1deaf21f2..d6c5a022ae28 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1555,12 +1555,23 @@ union bpf_attr { struct { /* struct used by BPF_LINK_UPDATE command */ __u32 link_fd; /* link fd */ - /* new program fd to update link with */ - __u32 new_prog_fd; + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; __u32 flags; /* extra flags */ - /* expected link's program fd; is specified only if - * BPF_F_REPLACE flag is set in flags */ - __u32 old_prog_fd; + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; } link_update; struct { -- cgit v1.2.3-58-ga151 From 55fbae05476df65e5eee8be54f61d0257af0240b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Mar 2023 11:42:41 -0700 Subject: bpf: Check IS_ERR for the bpf_map_get() return value This patch fixes a mistake in checking NULL instead of checking IS_ERR for the bpf_map_get() return value. It also fixes the return value in link_update_map() from -EINVAL to PTR_ERR(*_map). 
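For readers less familiar with the convention, an illustrative sketch (not taken from the patch): bpf_map_get() reports failure through an ERR_PTR()-encoded pointer rather than NULL, so callers must test the result with IS_ERR() and propagate the error with PTR_ERR():

    struct bpf_map *map;

    map = bpf_map_get(fd);
    if (IS_ERR(map))
            return PTR_ERR(map);    /* e.g. -EBADF for a bad fd */
    /* ... use the map ... */
    bpf_map_put(map);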
Reported-by: syzbot+71ccc0fe37abb458406b@syzkaller.appspotmail.com Fixes: 68b04864ca42 ("bpf: Create links for BPF struct_ops maps.") Fixes: aef56f2e918b ("bpf: Update the struct_ops of a bpf_link.") Signed-off-by: Martin KaFai Lau Acked-by: Kui-Feng Lee Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230324184241.1387437-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 4 ++-- kernel/bpf/syscall.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 6401deca3b56..d3f0a4825fa6 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -871,8 +871,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) int err; map = bpf_map_get(attr->link_create.map_fd); - if (!map) - return -EINVAL; + if (IS_ERR(map)) + return PTR_ERR(map); st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b4d758fa5981..a09597c95029 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4689,12 +4689,12 @@ static int link_update_map(struct bpf_link *link, union bpf_attr *attr) new_map = bpf_map_get(attr->link_update.new_map_fd); if (IS_ERR(new_map)) - return -EINVAL; + return PTR_ERR(new_map); if (attr->link_update.flags & BPF_F_REPLACE) { old_map = bpf_map_get(attr->link_update.old_map_fd); if (IS_ERR(old_map)) { - ret = -EINVAL; + ret = PTR_ERR(old_map); goto out_put; } } else if (attr->link_update.old_map_fd) { -- cgit v1.2.3-58-ga151 From 1431d0b584a673ea690c88a5f7e1aedd9caf0e84 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 25 Mar 2023 16:31:42 -0500 Subject: bpf: Only invoke kptr dtor following non-NULL xchg When a map value is being freed, we loop over all of the fields of the corresponding BPF object and issue the appropriate cleanup calls corresponding to the field's type. If the field is a referenced kptr, we atomically xchg the value out of the map, and invoke the kptr's destructor on whatever was there before (or bpf_obj_drop() it if it was a local kptr). Currently, we always invoke the destructor (either bpf_obj_drop() or the kptr's registered destructor) on any KPTR_REF-type field in a map, even if there wasn't a value in the map. This means that any function serving as the kptr's KF_RELEASE destructor must always treat the argument as possibly NULL, as the following can and regularly does happen: void *xchgd_field; /* No value was in the map, so xchgd_field is NULL */ xchgd_field = (void *)xchg(unsigned long *field_ptr, 0); field->kptr.dtor(xchgd_field); These are odd semantics to impose on KF_RELEASE kfuncs -- BPF programs are prohibited by the verifier from passing NULL pointers to KF_RELEASE kfuncs, so it doesn't make sense to require this of BPF programs, but not the main kernel destructor path. It's also unnecessary to invoke any cleanup logic for local kptrs. If there is no object there, there's nothing to drop. So as to allow KF_RELEASE kfuncs to fully assume that an argument is non-NULL, this patch updates a KPTR_REF's destructor to only be invoked when a non-NULL value is xchg'd out of the kptr map field. 
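After this change, the destructor is only reached when a value was actually stored. A condensed sketch of the updated switch case in bpf_obj_free_fields() (the btf_is_kernel()/bpf_obj_drop() handling for local kptrs is elided here):

    case BPF_KPTR_REF:
            xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
            if (!xchgd_field)
                    /* the map slot held no kptr, nothing to release */
                    break;
            field->kptr.dtor(xchgd_field);
            break;

This is what lets the next patch drop the defensive NULL checks from KF_RELEASE kfuncs such as bpf_task_release() and bpf_cpumask_release().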
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230325213144.486885-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a09597c95029..e18ac7fdc210 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -677,6 +677,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) break; case BPF_KPTR_REF: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); + if (!xchgd_field) + break; + if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); -- cgit v1.2.3-58-ga151 From fb2211a57c110b4ced3cb7f8570bd7246acf2d04 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 25 Mar 2023 16:31:45 -0500 Subject: bpf: Remove now-unnecessary NULL checks for KF_RELEASE kfuncs Now that we're not invoking kfunc destructors when the kptr in a map was NULL, we no longer require NULL checks in many of our KF_RELEASE kfuncs. This patch removes those NULL checks. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230325213144.486885-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- drivers/hid/bpf/hid_bpf_dispatch.c | 3 --- kernel/bpf/cpumask.c | 3 --- kernel/bpf/helpers.c | 6 ------ net/bpf/test_run.c | 3 --- net/netfilter/nf_conntrack_bpf.c | 2 -- 5 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 8a034a555d4c..d9ef45fcaeab 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -342,9 +342,6 @@ hid_bpf_release_context(struct hid_bpf_ctx *ctx) { struct hid_bpf_ctx_kern *ctx_kern; - if (!ctx) - return; - ctx_kern = container_of(ctx, struct hid_bpf_ctx_kern, ctx); kfree(ctx_kern); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index db9da2194c1a..e991af7dc13c 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -102,9 +102,6 @@ static void cpumask_free_cb(struct rcu_head *head) */ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) { - if (!cpumask) - return; - if (refcount_dec_and_test(&cpumask->usage)) call_rcu(&cpumask->rcu, cpumask_free_cb); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f753676ef652..8980f6859443 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2089,9 +2089,6 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { - if (!p) - return; - put_task_struct(p); } @@ -2148,9 +2145,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { - if (!cgrp) - return; - cgroup_put(cgrp); } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8d6b31209bd6..27587f1c5f36 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -615,9 +615,6 @@ bpf_kfunc_call_memb_acquire(void) __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { - if (!p) - return; - refcount_dec(&p->cnt); } diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index cd99e6dc1f35..002e9d24a1e9 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -401,8 +401,6 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) */ __bpf_kfunc void bpf_ct_release(struct nf_conn *nfct) { - if (!nfct) - return; 
nf_ct_put(nfct); } -- cgit v1.2.3-58-ga151 From 6c831c4684124a544f73f7c9b83bc7b2eb0b23d3 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 25 Mar 2023 16:31:46 -0500 Subject: bpf: Treat KF_RELEASE kfuncs as KF_TRUSTED_ARGS KF_RELEASE kfuncs are not currently treated as having KF_TRUSTED_ARGS, even though they have a superset of the requirements of KF_TRUSTED_ARGS. Like KF_TRUSTED_ARGS, KF_RELEASE kfuncs require a 0-offset argument, and don't allow NULL-able arguments. Unlike KF_TRUSTED_ARGS which require _either_ an argument with ref_obj_id > 0, _or_ (ref->type & BPF_REG_TRUSTED_MODIFIERS) (and no unsafe modifiers allowed), KF_RELEASE only allows for ref_obj_id > 0. Because KF_RELEASE today doesn't automatically imply KF_TRUSTED_ARGS, some of these requirements are enforced in different ways that can make the behavior of the verifier feel unpredictable. For example, a KF_RELEASE kfunc with a NULL-able argument will currently fail in the verifier with a message like, "arg#0 is ptr_or_null_ expected ptr_ or socket" rather than "Possibly NULL pointer passed to trusted arg0". Our intention is the same, but the semantics are different due to implemenetation details that kfunc authors and BPF program writers should not need to care about. Let's make the behavior of the verifier more consistent and intuitive by having KF_RELEASE kfuncs imply the presence of KF_TRUSTED_ARGS. Our eventual goal is to have all kfuncs assume KF_TRUSTED_ARGS by default anyways, so this takes us a step in that direction. Note that it does not make sense to assume KF_TRUSTED_ARGS for all KF_ACQUIRE kfuncs. KF_ACQUIRE kfuncs can have looser semantics than KF_RELEASE, with e.g. KF_RCU | KF_RET_NULL. We may want to have KF_ACQUIRE imply KF_TRUSTED_ARGS _unless_ KF_RCU is specified, but that can be left to another patch set, and there are no such subtleties to address for KF_RELEASE. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230325213144.486885-4-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 7 ++++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/verifier.c | 2 +- net/bpf/test_run.c | 6 ++++++ tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c | 4 ++-- tools/testing/selftests/bpf/progs/task_kfunc_failure.c | 6 +++--- tools/testing/selftests/bpf/verifier/calls.c | 10 +++++++--- tools/testing/selftests/bpf/verifier/ref_tracking.c | 6 +++--- 8 files changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 69eccf6f98ef..bf1b85941452 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -179,9 +179,10 @@ both are orthogonal to each other. --------------------- The KF_RELEASE flag is used to indicate that the kfunc releases the pointer -passed in to it. There can be only one referenced pointer that can be passed in. -All copies of the pointer being released are invalidated as a result of invoking -kfunc with this flag. +passed in to it. There can be only one referenced pointer that can be passed +in. All copies of the pointer being released are invalidated as a result of +invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the +protection afforded by the KF_TRUSTED_ARGS flag described below. 
2.4.4 KF_KPTR_GET flag ---------------------- diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index e991af7dc13c..7efdf5d770ca 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -402,7 +402,7 @@ __diag_pop(); BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64f06f6e16bf..20eb2015842f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9307,7 +9307,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) { - return meta->kfunc_flags & KF_TRUSTED_ARGS; + return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); } static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 27587f1c5f36..f1652f5fbd2e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -606,6 +606,11 @@ bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) return &prog_test_struct; } +__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p) +{ + WARN_ON_ONCE(1); +} + __bpf_kfunc struct prog_test_member * bpf_kfunc_call_memb_acquire(void) { @@ -800,6 +805,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) BTF_SET8_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 807fb0ac41e9..48b2034cadb3 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -206,7 +206,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("expects refcounted") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value *v; @@ -234,7 +234,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value local, *v; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 27994d6b2914..2c374a7ffece 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -206,7 +206,7 @@ int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flag } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket") +__failure __msg("Possibly NULL pointer passed to trusted 
arg0") int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value *v; @@ -234,7 +234,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value local, *v; @@ -277,7 +277,7 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 5702fc9761ef..1bdf2b43e49e 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", + .errstr = "Possibly NULL pointer passed to trusted arg0", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, @@ -165,19 +165,23 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 16), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -4), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, - { "bpf_kfunc_call_test_release", 9 }, + { "bpf_kfunc_call_test_offset", 9 }, + { "bpf_kfunc_call_test_release", 12 }, }, .result_unpriv = REJECT, .result = REJECT, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 9540164712b7..5a2e154dd1e0 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -142,7 +142,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", + .errstr = "Possibly NULL pointer passed to trusted arg0", .fixup_kfunc_btf_id = { { "bpf_lookup_user_key", 2 }, { "bpf_key_put", 4 }, @@ -163,7 +163,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", + .errstr = "Possibly NULL pointer passed to trusted arg0", .fixup_kfunc_btf_id = { { "bpf_lookup_system_key", 1 }, { "bpf_key_put", 3 }, @@ -182,7 +182,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar", + .errstr = "Possibly NULL pointer passed to trusted arg0", .fixup_kfunc_btf_id = { { "bpf_key_put", 1 }, }, -- cgit v1.2.3-58-ga151 From 
e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:42 -0700 Subject: bpf: Add a few bpf mem allocator functions This patch adds a few bpf mem allocator functions which will be used in the bpf_local_storage in a later patch. bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When the flags == GFP_KERNEL, it will fallback to __alloc(..., GFP_KERNEL). bpf_local_storage knows its running context is sleepable (GFP_KERNEL) and provides a better guarantee on memory allocation. bpf_local_storage has some uncommon cases that its selem cannot be reused immediately. It handles its own rcu_head and goes through a rcu_trace gp and then free it. bpf_mem_cache_raw_free() is added for direct free purpose without leaking the LLIST_NODE_SZ internal knowledge. During free time, the 'struct bpf_mem_alloc *ma' is no longer available. However, the caller should know if it is percpu memory or not and it can call different raw_free functions. bpf_local_storage does not support percpu value, so only the non-percpu 'bpf_mem_cache_raw_free()' is added in this patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 2 ++ kernel/bpf/memalloc.c | 59 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index a7104af61ab4..3929be5743f4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_raw_free(void *ptr); +void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); #endif /* _BPF_MEM_ALLOC_H */ diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5fcdacbb8439..410637c225fb 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) return entry; } -static void *__alloc(struct bpf_mem_cache *c, int node) +static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { - /* Allocate, but don't deplete atomic reserves that typical - * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc - * will allocate from the current numa node which is what we - * want here. - */ - gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; - if (c->percpu_size) { void **obj = kmalloc_node(c->percpu_size, flags, node); void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); @@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) */ obj = __llist_del_first(&c->free_by_rcu); if (!obj) { - obj = __alloc(c, node); + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT); if (!obj) break; } @@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } + +/* Directly does a kfree() without putting 'ptr' back to the free_llist + * for reuse and without waiting for a rcu_tasks_trace gp. 
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr' + * before calling bpf_mem_cache_raw_free(). + * It could be used when the rcu_tasks_trace callback does not have + * a hold on the original bpf_mem_alloc object that allocated the + * 'ptr'. This should only be used in the uncommon code path. + * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled + * and may affect performance. + */ +void bpf_mem_cache_raw_free(void *ptr) +{ + if (!ptr) + return; + + kfree(ptr - LLIST_NODE_SZ); +} + +/* When flags == GFP_KERNEL, it signals that the caller will not cause + * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use + * kmalloc if the free_llist is empty. + */ +void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) +{ + struct bpf_mem_cache *c; + void *ret; + + c = this_cpu_ptr(ma->cache); + + ret = unit_alloc(c); + if (!ret && flags == GFP_KERNEL) { + struct mem_cgroup *memcg, *old_memcg; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + } + + return !ret ? NULL : ret + LLIST_NODE_SZ; +} -- cgit v1.2.3-58-ga151 From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:43 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem This patch uses bpf_mem_alloc for the task and cgroup local storage that the bpf prog can easily get a hold of the storage owner's PTR_TO_BTF_ID. eg. bpf_get_current_task_btf() can be used in some of the kmalloc code path which will cause deadlock/recursion. bpf_mem_cache_alloc is deadlock free and will solve a legit use case in [1]. For sk storage, its batch creation benchmark shows a few percent regression when the sk create/destroy batch size is larger than 32. The sk creation/destruction happens much more often and depends on external traffic. Considering it is hypothetical to be able to cause deadlock with sk storage, it can cross the bridge to use bpf_mem_alloc till a legit (ie. useful) use case comes up. For inode storage, bpf_local_storage_destroy() is called before waiting for a rcu gp and its memory cannot be reused immediately. inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp. A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc(). Only task and cgroup storage have 'bpf_ma == true' which means to use bpf_mem_cache_alloc/free(). This patch only changes selem to use bpf_mem_alloc for task and cgroup. The next patch will change the local_storage to use bpf_mem_alloc also for task and cgroup. Here is some more details on the changes: * memory allocation: After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because bpf_mem_cache_alloc() could return a reused selem. It is to keep the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data is zero-ed. SDATA(selem)->data is the visible part to the bpf prog. No need to use zero_map_value() to do the zeroing because bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using the selem before returning the selem through bpf_mem_cache_free(). For the internal fields of selem, they will be initialized when linking to the new smap and the new local_storage. When 'bpf_ma == false', nothing changes in this patch. It will stay with the bpf_map_kzalloc(). * memory free: The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle the bpf_ma == true case. 
For the common selem free path where its owner is also being destroyed, the mem is freed in bpf_local_storage_destroy(), the owner (task and cgroup) has gone through a rcu gp. The memory can be reused immediately, so bpf_local_storage_destroy() will call bpf_selem_free(..., reuse_now = true) which will do bpf_mem_cache_free() for immediate reuse consideration. An exception is the delete elem code path. The delete elem code path is called from the helper bpf_*_storage_delete() and the syscall bpf_map_delete_elem(). This path is an unusual case for local storage because the common use case is to have the local storage staying with its owner life time so that the bpf prog and the user space does not have to monitor the owner's destruction. For the delete elem path, the selem cannot be reused immediately because there could be bpf prog using it. It will call bpf_selem_free(..., reuse_now = false) and it will wait for a rcu tasks trace gp before freeing the elem. The rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree(). When 'bpf_ma == false', it should be the same as before. __bpf_selem_free() is added to do the kfree_rcu and call_tasks_trace_rcu(). A few words on the 'reuse_now == true'. When 'reuse_now == true', it is still racing with bpf_local_storage_map_free which is under rcu protection, so it still needs to wait for a rcu gp instead of kfree(). Otherwise, the selem may be reused by slab for a totally different struct while the bpf_local_storage_map_free() is still using it (as a rcu reader). For the inode case, there may be other rcu readers also. In short, when bpf_ma == false and reuse_now == true => vanilla rcu. [1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/ Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 ++- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 95 +++++++++++++++++++++++++++++++++++---- kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 95 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34f61467a2f..30efbcab2798 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 @@ -55,6 +56,8 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; + struct bpf_mem_alloc selem_ma; + bool bpf_ma; }; struct bpf_local_storage_data { @@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache); + struct bpf_local_storage_cache *cache, + bool bpf_ma); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index f5b016a5484d..d17d5b694668 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &cgroup_cache); + return bpf_local_storage_map_alloc(attr, &cgroup_cache, 
true); } static void cgroup_storage_map_free(struct bpf_map *map) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 9a5f05151898..e17ad581b9be 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &inode_cache); + return bpf_local_storage_map_alloc(attr, &inode_cache, false); } static void inode_storage_map_free(struct bpf_map *map) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 351d991694cb..309ea727a5cb 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; - selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - gfp_flags | __GFP_NOWARN); + if (smap->bpf_ma) { + migrate_disable(); + selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); + migrate_enable(); + if (selem) + /* Keep the original bpf_map_kzalloc behavior + * before started using the bpf_mem_cache_alloc. + * + * No need to use zero_map_value. The bpf_selem_free() + * only does bpf_mem_cache_free when there is + * no other bpf prog is using the selem. + */ + memset(SDATA(selem)->data, 0, smap->map.value_size); + } else { + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, + gfp_flags | __GFP_NOWARN); + } + if (selem) { if (value) copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -124,12 +140,34 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); } +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(selem); + else + kfree_rcu(selem, rcu); +} + +/* Handle bpf_ma == false */ +static void __bpf_selem_free(struct bpf_local_storage_elem *selem, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(selem, rcu); + else + call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); +} + static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - kfree(selem); + bpf_mem_cache_raw_free(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) @@ -145,10 +183,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!reuse_now) + + if (!smap->bpf_ma) { + __bpf_selem_free(selem, reuse_now); + return; + } + + if (!reuse_now) { call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - else - call_rcu(&selem->rcu, bpf_selem_free_rcu); + } else { + /* Instead of using the vanilla call_rcu(), + * bpf_mem_cache_free will be able to reuse selem + * immediately. + */ + migrate_disable(); + bpf_mem_cache_free(&smap->selem_ma, selem); + migrate_enable(); + } } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -654,13 +705,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } +/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. 
+ * A deadlock free allocator is useful for storage that the bpf prog can easily + * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. + * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses + * memory immediately. To be reuse-immediate safe, the owner destruction + * code path needs to go through a rcu grace period before calling + * bpf_local_storage_destroy(). + * + * When bpf_ma == false, the kmalloc and kfree are used. + */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache) + struct bpf_local_storage_cache *cache, + bool bpf_ma) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; + int err; smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) @@ -675,8 +738,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { - bpf_map_area_free(smap); - return ERR_PTR(-ENOMEM); + err = -ENOMEM; + goto free_smap; } for (i = 0; i < nbuckets; i++) { @@ -687,8 +750,20 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); + smap->bpf_ma = bpf_ma; + if (bpf_ma) { + err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); + if (err) + goto free_smap; + } + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; + +free_smap: + kvfree(smap->buckets); + bpf_map_area_free(smap); + return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, @@ -754,6 +829,8 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); + if (smap->bpf_ma) + bpf_mem_alloc_destroy(&smap->selem_ma); kvfree(smap->buckets); bpf_map_area_free(smap); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ab5bd1ef58c4..d1af0c8f9ce4 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &task_cache); + return bpf_local_storage_map_alloc(attr, &task_cache, true); } static void task_storage_map_free(struct bpf_map *map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cb0f5a105b89..085025c7130a 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &sk_cache); + return bpf_local_storage_map_alloc(attr, &sk_cache, false); } static int notsupp_get_next_key(struct bpf_map *map, void *key, -- cgit v1.2.3-58-ga151 From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:44 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage This patch uses bpf_mem_cache_alloc/free for allocating and freeing bpf_local_storage for task and cgroup storage. The changes are similar to the previous patch. A few things that worth to mention for bpf_local_storage: The local_storage is freed when the last selem is deleted. Before deleting a selem from local_storage, it needs to retrieve the local_storage->smap because the bpf_selem_unlink_storage_nolock() may have set it to NULL. 
Note that local_storage->smap may have already been NULL when the selem created this local_storage has been removed. In this case, call_rcu will be used to free the local_storage. Also, the bpf_ma (true or false) value is needed before calling bpf_local_storage_free(). The bpf_ma can either be obtained from the local_storage->smap (if available) or any of its selem's smap. A new helper check_storage_bpf_ma() is added to obtain bpf_ma for a deleting bpf_local_storage. When bpf_local_storage_alloc getting a reused memory, all fields are either in the correct values or will be initialized. 'cache[]' must already be all NULLs. 'list' must be empty. Others will be initialized. Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_local_storage.c | 130 +++++++++++++++++++++++++++++++++----- 2 files changed, 116 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 30efbcab2798..173ec7f43ed1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -57,6 +57,7 @@ struct bpf_local_storage_map { u16 elem_size; u16 cache_idx; struct bpf_mem_alloc selem_ma; + struct bpf_mem_alloc storage_ma; bool bpf_ma; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 309ea727a5cb..dab2ff4c99d9 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -111,33 +111,74 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage *local_storage; + + /* If RCU Tasks Trace grace period implies RCU grace period, do + * kfree(), else do kfree_rcu(). + */ + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(local_storage); + else + kfree_rcu(local_storage, rcu); +} + static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); - kfree(local_storage); + bpf_mem_cache_raw_free(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { - /* If RCU Tasks Trace grace period implies RCU grace period, do - * kfree(), else do kfree_rcu(). 
- */ if (rcu_trace_implies_rcu_gp()) bpf_local_storage_free_rcu(rcu); else call_rcu(rcu, bpf_local_storage_free_rcu); } +/* Handle bpf_ma == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + static void bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool reuse_now) + struct bpf_local_storage_map *smap, + bool bpf_ma, bool reuse_now) { - if (!reuse_now) + if (!bpf_ma) { + __bpf_local_storage_free(local_storage, reuse_now); + return; + } + + if (!reuse_now) { call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_trace_rcu); - else + return; + } + + if (smap) { + migrate_disable(); + bpf_mem_cache_free(&smap->storage_ma, local_storage); + migrate_enable(); + } else { + /* smap could be NULL if the selem that triggered + * this 'local_storage' creation had been long gone. + * In this case, directly do call_rcu(). + */ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + } } /* rcu tasks trace callback for bpf_ma == false */ @@ -260,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } +static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *storage_smap, + struct bpf_local_storage_elem *selem) +{ + + struct bpf_local_storage_map *selem_smap; + + /* local_storage->smap may be NULL. If it is, get the bpf_ma + * from any selem in the local_storage->list. The bpf_ma of all + * local_storage and selem should have the same value + * for the same map type. + * + * If the local_storage->list is already empty, the caller will not + * care about the bpf_ma value also because the caller is not + * responsibile to free the local_storage. 
+ */ + + if (storage_smap) + return storage_smap->bpf_ma; + + if (!selem) { + struct hlist_node *n; + + n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), + bpf_rcu_lock_held()); + if (!n) + return false; + + selem = hlist_entry(n, struct bpf_local_storage_elem, snode); + } + selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + return selem_smap->bpf_ma; +} + static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { + struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool free_local_storage = false; + bool bpf_ma, free_local_storage = false; unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -273,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + storage_smap = rcu_dereference_check(local_storage->smap, + bpf_rcu_lock_held()); + bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( @@ -280,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); + bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -400,8 +481,15 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - gfp_flags | __GFP_NOWARN); + if (smap->bpf_ma) { + migrate_disable(); + storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); + migrate_enable(); + } else { + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), + gfp_flags | __GFP_NOWARN); + } + if (!storage) { err = -ENOMEM; goto uncharge; @@ -447,7 +535,7 @@ int bpf_local_storage_alloc(void *owner, return 0; uncharge: - bpf_local_storage_free(storage, true); + bpf_local_storage_free(storage, smap, smap->bpf_ma, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -660,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { + struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool free_storage = false; + bool bpf_ma, free_storage = false; struct hlist_node *n; unsigned long flags; + storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); + bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); + /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. 
* Thus, no elem can be added to or deleted from the @@ -692,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_storage) - bpf_local_storage_free(local_storage, true); + bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -755,6 +847,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); if (err) goto free_smap; + + err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); + if (err) { + bpf_mem_alloc_destroy(&smap->selem_ma); + goto free_smap; + } } smap->cache_idx = bpf_local_storage_cache_idx_get(cache); @@ -829,8 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - if (smap->bpf_ma) + if (smap->bpf_ma) { bpf_mem_alloc_destroy(&smap->selem_ma); + bpf_mem_alloc_destroy(&smap->storage_ma); + } kvfree(smap->buckets); bpf_map_area_free(smap); } -- cgit v1.2.3-58-ga151 From e4c2acab95a5947fe7948140a83e4f4918c6c048 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 30 Mar 2023 09:52:02 -0500 Subject: bpf: Handle PTR_MAYBE_NULL case in PTR_TO_BTF_ID helper call arg When validating a helper function argument, we use check_reg_type() to ensure that the register containing the argument is of the correct type. When the register's base type is PTR_TO_BTF_ID, there is some supplemental logic where we do extra checks for various combinations of PTR_TO_BTF_ID type modifiers. For example, for PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, and PTR_TO_BTF_ID | MEM_RCU, we call map_kptr_match_type() for bpf_kptr_xchg() calls, and btf_struct_ids_match() for other helper calls. When an unhandled PTR_TO_BTF_ID type modifier combination is passed to check_reg_type(), the verifier fails with an internal verifier error message. This can currently be triggered by passing a PTR_MAYBE_NULL pointer to helper functions (currently just bpf_kptr_xchg()) with an ARG_PTR_TO_BTF_ID_OR_NULL arg type. For example, by callin bpf_kptr_xchg(&v->kptr, bpf_cpumask_create()). Whether or not passing a PTR_MAYBE_NULL arg to an ARG_PTR_TO_BTF_ID_OR_NULL argument is valid is an interesting question. In a vacuum, it seems fine. A helper function with an ARG_PTR_TO_BTF_ID_OR_NULL arg would seem to be implying that it can handle either a NULL or non-NULL arg, and has logic in place to detect and gracefully handle each. This is the case for bpf_kptr_xchg(), which of course simply does an xchg(). On the other hand, bpf_kptr_xchg() also specifies OBJ_RELEASE, and refcounting semantics for a PTR_MAYBE_NULL pointer is different than handling it for a NULL _OR_ non-NULL pointer. For example, with a non-NULL arg, we should always fail if there was not a nonzero refcount for the value in the register being passed to the helper. For PTR_MAYBE_NULL on the other hand, it's unclear. If the pointer is NULL it would be fine, but if it's not NULL, it would be incorrect to load the program. The current solution to this is to just fail if PTR_MAYBE_NULL is passed, and to instead require programs to have a NULL check to explicitly handle the NULL and non-NULL cases. This seems reasonable. Not only would it possibly be quite complicated to correctly handle PTR_MAYBE_NULL refcounting in the verifier, but it's also an arguably odd programming pattern in general to not explicitly handle the NULL case anyways. 
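Concretely, the accepted shape is to check the allocation first and only then hand the pointer to bpf_kptr_xchg(). The following is a hedged sketch of that pattern (the cpumask kfunc declarations and the array-map kptr layout are assumptions for illustration, mirroring what the BPF selftests do); the commit's own example of the rejected pattern follows right after:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
  void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;

  struct map_value {
  	struct bpf_cpumask __kptr *mask;
  };

  struct {
  	__uint(type, BPF_MAP_TYPE_ARRAY);
  	__uint(max_entries, 1);
  	__type(key, int);
  	__type(value, struct map_value);
  } mask_map SEC(".maps");

  SEC("tp_btf/task_newtask")
  int BPF_PROG(example_checked, struct task_struct *task, u64 clone_flags)
  {
  	struct bpf_cpumask *mask, *prev;
  	struct map_value *v;
  	int key = 0;

  	v = bpf_map_lookup_elem(&mask_map, &key);
  	if (!v)
  		return 0;

  	mask = bpf_cpumask_create();
  	if (!mask)	/* NULL handled explicitly, never passed to the helper */
  		return 0;

  	prev = bpf_kptr_xchg(&v->mask, mask);
  	if (prev)
  		bpf_cpumask_release(prev);

  	return 0;
  }

  char LICENSE[] SEC("license") = "GPL";
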
For example, it seems odd to not care about whether a pointer you're passing to bpf_kptr_xchg() was successfully allocated in a program such as the following: private(MASK) static struct bpf_cpumask __kptr * global_mask; SEC("tp_btf/task_newtask") int BPF_PROG(example, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *prev; /* bpf_cpumask_create() returns PTR_MAYBE_NULL */ prev = bpf_kptr_xchg(&global_mask, bpf_cpumask_create()); if (prev) bpf_cpumask_release(prev); return 0; } This patch therefore updates the verifier to explicitly check for PTR_MAYBE_NULL in check_reg_type(), and fail gracefully if it's observed. This isn't really "fixing" anything unsafe or incorrect. We're just updating the verifier to fail gracefully, and explicitly handle this pattern rather than unintentionally falling back to an internal verifier error path. A subsequent patch will update selftests. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230330145203.80506-1-void@manifault.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20eb2015842f..52738f9dcb15 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7204,6 +7204,10 @@ found: } break; } + case PTR_TO_BTF_ID | PTR_MAYBE_NULL: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: + verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + return -EACCES; case PTR_TO_BTF_ID | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { -- cgit v1.2.3-58-ga151 From d02c48fa113953aba0b330ec6c35f50c7d1d7986 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 31 Mar 2023 14:57:31 -0500 Subject: bpf: Make struct task_struct an RCU-safe type struct task_struct objects are a bit interesting in terms of how their lifetime is protected by refcounts. task structs have two refcount fields: 1. refcount_t usage: Protects the memory backing the task struct. When this refcount drops to 0, the task is immediately freed, without waiting for an RCU grace period to elapse. This is the field that most callers in the kernel currently use to ensure that a task remains valid while it's being referenced, and is what's currently tracked with bpf_task_acquire() and bpf_task_release(). 2. refcount_t rcu_users: A refcount field which, when it drops to 0, schedules an RCU callback that drops a reference held on the 'usage' field above (which is acquired when the task is first created). This field therefore provides a form of RCU protection on the task by ensuring that at least one 'usage' refcount will be held until an RCU grace period has elapsed. The qualifier "a form of" is important here, as a task can remain valid after task->rcu_users has dropped to 0 and the subsequent RCU gp has elapsed. In terms of BPF, we want to use task->rcu_users to protect tasks that function as referenced kptrs, and to allow tasks stored as referenced kptrs in maps to be accessed with RCU protection. Let's first determine whether we can safely use task->rcu_users to protect tasks stored in maps. All of the bpf_task* kfuncs can only be called from tracepoint, struct_ops, or BPF_PROG_TYPE_SCHED_CLS, program types. For tracepoint and struct_ops programs, the struct task_struct passed to a program handler will always be trusted, so it will always be safe to call bpf_task_acquire() with any task passed to a program. 
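In code, this boils down to a thin wrapper around task->rcu_users; a condensed sketch of the resulting kfunc pair (the complete change is in the diff further below):

  __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
  {
  	/* Acquisition succeeds only while p->rcu_users is still nonzero,
  	 * i.e. before the delayed put of the task has been scheduled
  	 * via RCU.
  	 */
  	if (refcount_inc_not_zero(&p->rcu_users))
  		return p;
  	return NULL;
  }

  __bpf_kfunc void bpf_task_release(struct task_struct *p)
  {
  	put_task_struct_rcu_user(p);
  }
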
Note, however, that we must update bpf_task_acquire() to be KF_RET_NULL, as it is possible that the task has exited by the time the program is invoked, even if the pointer is still currently valid because the main kernel holds a task->usage refcount. For BPF_PROG_TYPE_SCHED_CLS, tasks should never be passed as an argument to the any program handlers, so it should not be relevant. The second question is whether it's safe to use RCU to access a task that was acquired with bpf_task_acquire(), and stored in a map. Because bpf_task_acquire() now uses task->rcu_users, it follows that if the task is present in the map, that it must have had at least one task->rcu_users refcount by the time the current RCU cs was started. Therefore, it's safe to access that task until the end of the current RCU cs. With all that said, this patch makes struct task_struct is an RCU-protected object. In doing so, we also change bpf_task_acquire() to be KF_ACQUIRE | KF_RCU | KF_RET_NULL, and adjust any selftests as necessary. A subsequent patch will remove bpf_task_kptr_get(), and bpf_task_acquire_not_zero() respectively. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230331195733.699708-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +-- kernel/bpf/verifier.c | 1 + .../testing/selftests/bpf/prog_tests/task_kfunc.c | 1 + .../selftests/bpf/progs/task_kfunc_common.h | 5 ++ .../selftests/bpf/progs/task_kfunc_failure.c | 80 +++++++++++++++++++--- .../selftests/bpf/progs/task_kfunc_success.c | 26 ++++++- 6 files changed, 108 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8980f6859443..e71a4a54ce99 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -2013,7 +2014,9 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { - return get_task_struct(p); + if (refcount_inc_not_zero(&p->rcu_users)) + return p; + return NULL; } /** @@ -2089,7 +2092,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { - put_task_struct(p); + put_task_struct_rcu_user(p); } #ifdef CONFIG_CGROUPS @@ -2199,7 +2202,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) - bpf_task_acquire(p); + p = bpf_task_acquire(p); rcu_read_unlock(); return p; @@ -2371,7 +2374,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front) BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52738f9dcb15..92ae4e8ab87b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4600,6 +4600,7 @@ BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) +BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) static bool 
rcu_protected_object(const struct btf *btf, u32 btf_id) diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index f79fa5bc9a8d..330133ece3f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -78,6 +78,7 @@ static const char * const success_tests[] = { "test_task_from_pid_arg", "test_task_from_pid_current", "test_task_from_pid_invalid", + "task_kfunc_acquire_trusted_walked", }; void test_task_kfunc(void) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 4c2a4b0e3a25..bf0d1da9aff8 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -24,6 +24,8 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; struct task_struct *bpf_task_kptr_get(struct task_struct **pp) __ksym; void bpf_task_release(struct task_struct *p) __ksym; struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; static inline struct __tasks_kfunc_map_value *tasks_kfunc_map_value_lookup(struct task_struct *p) { @@ -60,6 +62,9 @@ static inline int tasks_kfunc_map_insert(struct task_struct *p) } acquired = bpf_task_acquire(p); + if (!acquired) + return -ENOENT; + old = bpf_kptr_xchg(&v->task, acquired); if (old) { bpf_task_release(old); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 2c374a7ffece..63aef547da87 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -40,6 +40,9 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f /* Can't invoke bpf_task_acquire() on an untrusted pointer. */ acquired = bpf_task_acquire(v->task); + if (!acquired) + return 0; + bpf_task_release(acquired); return 0; @@ -53,38 +56,49 @@ int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags) /* Can't invoke bpf_task_acquire() on a random frame pointer. */ acquired = bpf_task_acquire((struct task_struct *)&stack_task); + if (!acquired) + return 0; + bpf_task_release(acquired); return 0; } SEC("kretprobe/free_task") -__failure __msg("reg type unsupported for arg#0 function") +__failure __msg("calling kernel function bpf_task_acquire is not allowed") int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; + /* Can't call bpf_task_acquire() or bpf_task_release() in an untrusted prog. */ acquired = bpf_task_acquire(task); - /* Can't release a bpf_task_acquire()'d task without a NULL check. */ + if (!acquired) + return 0; bpf_task_release(acquired); return 0; } -SEC("tp_btf/task_newtask") -__failure __msg("R1 must be referenced or trusted") -int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 clone_flags) +SEC("kretprobe/free_task") +__failure __msg("calling kernel function bpf_task_acquire is not allowed") +int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; - /* Can't invoke bpf_task_acquire() on a trusted pointer obtained from walking a struct. 
*/ - acquired = bpf_task_acquire(task->group_leader); - bpf_task_release(acquired); + bpf_rcu_read_lock(); + if (!task) { + bpf_rcu_read_unlock(); + return 0; + } + /* Can't call bpf_task_acquire() or bpf_task_release() in an untrusted prog. */ + acquired = bpf_task_acquire(task); + if (acquired) + bpf_task_release(acquired); + bpf_rcu_read_unlock(); return 0; } - SEC("tp_btf/task_newtask") __failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags) @@ -137,6 +151,8 @@ int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clo struct task_struct *kptr, *acquired; acquired = bpf_task_acquire(task); + if (!acquired) + return 0; /* Cannot use bpf_task_kptr_get() on a non-kptr, even if it was acquired. */ kptr = bpf_task_kptr_get(&acquired); @@ -185,6 +201,19 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla return 0; } +SEC("tp_btf/task_newtask") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags) +{ + struct task_struct *acquired; + + acquired = bpf_task_acquire(task); + /* Can't invoke bpf_task_release() on an acquired task without a NULL check. */ + bpf_task_release(acquired); + + return 0; +} + SEC("tp_btf/task_newtask") __failure __msg("Unreleased reference") int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flags) @@ -256,12 +285,13 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) return -ENOENT; acquired = bpf_task_acquire(task); + if (!acquired) + return -EEXIST; old = bpf_kptr_xchg(&v->task, acquired); /* old cannot be passed to bpf_task_release() without a NULL check. */ bpf_task_release(old); - bpf_task_release(old); return 0; } @@ -298,6 +328,9 @@ int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task) /* the argument of lsm task_free hook is untrusted. */ acquired = bpf_task_acquire(task); + if (!acquired) + return 0; + bpf_task_release(acquired); return 0; } @@ -337,3 +370,30 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool bpf_strncmp(task->comm, 16, "foo"); return 0; } + +SEC("tp_btf/task_newtask") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags) +{ + struct task_struct *local; + struct __tasks_kfunc_map_value *v; + + if (tasks_kfunc_map_insert(task)) + return 0; + + v = tasks_kfunc_map_value_lookup(task); + if (!v) + return 0; + + bpf_rcu_read_lock(); + local = v->task; + if (!local) { + bpf_rcu_read_unlock(); + return 0; + } + /* Can't release a kptr that's still stored in a map. 
*/ + bpf_task_release(local); + bpf_rcu_read_unlock(); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index cfa7f12b84e8..a75304a5e860 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -47,7 +47,10 @@ static int test_acquire_release(struct task_struct *task) } acquired = bpf_task_acquire(task); - bpf_task_release(acquired); + if (acquired) + bpf_task_release(acquired); + else + err = 6; return 0; } @@ -166,7 +169,10 @@ int BPF_PROG(test_task_current_acquire_release, struct task_struct *task, u64 cl current = bpf_get_current_task_btf(); acquired = bpf_task_acquire(current); - bpf_task_release(acquired); + if (acquired) + bpf_task_release(acquired); + else + err = 1; return 0; } @@ -241,3 +247,19 @@ int BPF_PROG(test_task_from_pid_invalid, struct task_struct *task, u64 clone_fla return 0; } + +SEC("tp_btf/task_newtask") +int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 clone_flags) +{ + struct task_struct *acquired; + + /* task->group_leader is listed as a trusted, non-NULL field of task struct. */ + acquired = bpf_task_acquire(task->group_leader); + if (acquired) + bpf_task_release(acquired); + else + err = 1; + + + return 0; +} -- cgit v1.2.3-58-ga151 From f85671c6ef46d490a90dac719e0c0e0adbacfd9b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 31 Mar 2023 14:57:32 -0500 Subject: bpf: Remove now-defunct task kfuncs In commit 22df776a9a86 ("tasks: Extract rcu_users out of union"), the 'refcount_t rcu_users' field was extracted out of a union with the 'struct rcu_head rcu' field. This allows us to safely perform a refcount_inc_not_zero() on task->rcu_users when acquiring a reference on a task struct. A prior patch leveraged this by making struct task_struct an RCU-protected object in the verifier, and by bpf_task_acquire() to use the task->rcu_users field for synchronization. Now that we can use RCU to protect tasks, we no longer need bpf_task_kptr_get(), or bpf_task_acquire_not_zero(). bpf_task_kptr_get() is truly completely unnecessary, as we can just use RCU to get the object. bpf_task_acquire_not_zero() is now equivalent to bpf_task_acquire(). In addition to these changes, this patch also updates the associated selftests to no longer use these kfuncs. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230331195733.699708-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 69 -------------------- .../testing/selftests/bpf/prog_tests/task_kfunc.c | 2 +- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 9 +-- .../selftests/bpf/progs/task_kfunc_common.h | 1 - .../selftests/bpf/progs/task_kfunc_failure.c | 73 ---------------------- .../selftests/bpf/progs/task_kfunc_success.c | 22 +++---- 6 files changed, 14 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e71a4a54ce99..6be16db9f188 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2019,73 +2019,6 @@ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) return NULL; } -/** - * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task - * acquired by this kfunc which is not stored in a map as a kptr, must be - * released by calling bpf_task_release(). - * @p: The task on which a reference is being acquired. 
- */ -__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) -{ - /* For the time being this function returns NULL, as it's not currently - * possible to safely acquire a reference to a task with RCU protection - * using get_task_struct() and put_task_struct(). This is due to the - * slightly odd mechanics of p->rcu_users, and how task RCU protection - * works. - * - * A struct task_struct is refcounted by two different refcount_t - * fields: - * - * 1. p->usage: The "true" refcount field which tracks a task's - * lifetime. The task is freed as soon as this - * refcount drops to 0. - * - * 2. p->rcu_users: An "RCU users" refcount field which is statically - * initialized to 2, and is co-located in a union with - * a struct rcu_head field (p->rcu). p->rcu_users - * essentially encapsulates a single p->usage - * refcount, and when p->rcu_users goes to 0, an RCU - * callback is scheduled on the struct rcu_head which - * decrements the p->usage refcount. - * - * There are two important implications to this task refcounting logic - * described above. The first is that - * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as - * after the refcount goes to 0, the RCU callback being scheduled will - * cause the memory backing the refcount to again be nonzero due to the - * fields sharing a union. The other is that we can't rely on RCU to - * guarantee that a task is valid in a BPF program. This is because a - * task could have already transitioned to being in the TASK_DEAD - * state, had its rcu_users refcount go to 0, and its rcu callback - * invoked in which it drops its single p->usage reference. At this - * point the task will be freed as soon as the last p->usage reference - * goes to 0, without waiting for another RCU gp to elapse. The only - * way that a BPF program can guarantee that a task is valid is in this - * scenario is to hold a p->usage refcount itself. - * - * Until we're able to resolve this issue, either by pulling - * p->rcu_users and p->rcu out of the union, or by getting rid of - * p->usage and just using p->rcu_users for refcounting, we'll just - * return NULL here. - */ - return NULL; -} - -/** - * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_task_release(). - * @pp: A pointer to a task kptr on which a reference is being acquired. - */ -__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) -{ - /* We must return NULL here until we have clarity on how to properly - * leverage RCU for ensuring a task's lifetime. See the comment above - * in bpf_task_acquire_not_zero() for more details. - */ - return NULL; -} - /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. 
@@ -2375,8 +2308,6 @@ BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_rbtree_add) diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index 330133ece3f6..740d5f644b40 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -73,7 +73,7 @@ static const char * const success_tests[] = { "test_task_acquire_release_current", "test_task_acquire_leave_in_map", "test_task_xchg_release", - "test_task_get_release", + "test_task_map_acquire_release", "test_task_current_acquire_release", "test_task_from_pid_arg", "test_task_from_pid_current", diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 6a8c88e58df2..14fb01437fb8 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -23,7 +23,7 @@ struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym; void bpf_key_put(struct bpf_key *key) __ksym; void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; -struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; void bpf_task_release(struct task_struct *p) __ksym; SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") @@ -159,13 +159,8 @@ int task_acquire(void *ctx) goto out; /* acquire a reference which can be used outside rcu read lock region */ - gparent = bpf_task_acquire_not_zero(gparent); + gparent = bpf_task_acquire(gparent); if (!gparent) - /* Until we resolve the issues with using task->rcu_users, we - * expect bpf_task_acquire_not_zero() to return a NULL task. - * See the comment at the definition of - * bpf_task_acquire_not_zero() for more details. 
- */ goto out; (void)bpf_task_storage_get(&map_a, gparent, 0, 0); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index bf0d1da9aff8..41f2d44f49cb 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -21,7 +21,6 @@ struct hash_map { } __tasks_kfunc_map SEC(".maps"); struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -struct task_struct *bpf_task_kptr_get(struct task_struct **pp) __ksym; void bpf_task_release(struct task_struct *p) __ksym; struct task_struct *bpf_task_from_pid(s32 pid) __ksym; void bpf_rcu_read_lock(void) __ksym; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 63aef547da87..dcdea3127086 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -128,59 +128,6 @@ int BPF_PROG(task_kfunc_acquire_unreleased, struct task_struct *task, u64 clone_ return 0; } -SEC("tp_btf/task_newtask") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(task_kfunc_get_non_kptr_param, struct task_struct *task, u64 clone_flags) -{ - struct task_struct *kptr; - - /* Cannot use bpf_task_kptr_get() on a non-kptr, even on a valid task. */ - kptr = bpf_task_kptr_get(&task); - if (!kptr) - return 0; - - bpf_task_release(kptr); - - return 0; -} - -SEC("tp_btf/task_newtask") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clone_flags) -{ - struct task_struct *kptr, *acquired; - - acquired = bpf_task_acquire(task); - if (!acquired) - return 0; - - /* Cannot use bpf_task_kptr_get() on a non-kptr, even if it was acquired. */ - kptr = bpf_task_kptr_get(&acquired); - bpf_task_release(acquired); - if (!kptr) - return 0; - - bpf_task_release(kptr); - - return 0; -} - -SEC("tp_btf/task_newtask") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(task_kfunc_get_null, struct task_struct *task, u64 clone_flags) -{ - struct task_struct *kptr; - - /* Cannot use bpf_task_kptr_get() on a NULL pointer. */ - kptr = bpf_task_kptr_get(NULL); - if (!kptr) - return 0; - - bpf_task_release(kptr); - - return 0; -} - SEC("tp_btf/task_newtask") __failure __msg("Unreleased reference") int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_flags) @@ -214,26 +161,6 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, return 0; } -SEC("tp_btf/task_newtask") -__failure __msg("Unreleased reference") -int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flags) -{ - struct task_struct *kptr; - struct __tasks_kfunc_map_value *v; - - v = insert_lookup_task(task); - if (!v) - return 0; - - kptr = bpf_task_kptr_get(&v->task); - if (!kptr) - return 0; - - /* Kptr acquired above is never released. 
*/ - - return 0; -} - SEC("tp_btf/task_newtask") __failure __msg("Possibly NULL pointer passed to trusted arg0") int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index a75304a5e860..b09371bba204 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -122,7 +122,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags) +int BPF_PROG(test_task_map_acquire_release, struct task_struct *task, u64 clone_flags) { struct task_struct *kptr; struct __tasks_kfunc_map_value *v; @@ -143,18 +143,18 @@ int BPF_PROG(test_task_get_release, struct task_struct *task, u64 clone_flags) return 0; } - kptr = bpf_task_kptr_get(&v->task); - if (kptr) { - /* Until we resolve the issues with using task->rcu_users, we - * expect bpf_task_kptr_get() to return a NULL task. See the - * comment at the definition of bpf_task_acquire_not_zero() for - * more details. - */ - bpf_task_release(kptr); + bpf_rcu_read_lock(); + kptr = v->task; + if (!kptr) { err = 3; - return 0; + } else { + kptr = bpf_task_acquire(kptr); + if (!kptr) + err = 4; + else + bpf_task_release(kptr); } - + bpf_rcu_read_unlock(); return 0; } -- cgit v1.2.3-58-ga151 From 5b85575ad4280171c382e888b006cb8d12c35bde Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sat, 1 Apr 2023 20:06:02 +0000 Subject: bpf: optimize hashmap lookups when key_size is divisible by 4 The BPF hashmap uses the jhash() hash function. There is an optimized version of this hash function which may be used if hash size is a multiple of 4. Apply this optimization to the hashmap in a similar way as it is done in the bloom filter map. On practice the optimization is only noticeable for smaller key sizes, which, however, is sufficient for many applications. 
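The change itself is a small dispatch in the hashmap's hash helper, choosing the word-wise variant whenever the key length is a multiple of four (this is the whole patch; see the diff below):

  static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
  {
  	/* Keys whose length is a multiple of 4 can be hashed as an array
  	 * of u32 words, avoiding jhash()'s byte-wise tail handling.
  	 */
  	if (likely(key_len % 4 == 0))
  		return jhash2(key, key_len / 4, hashrnd);
  	return jhash(key, key_len, hashrnd);
  }
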
An example is listed in the following table of measurements (a hashmap of 65536 elements was used): -------------------------------------------------------------------- | key_size | fullness | lookups /sec | lookups (opt) /sec | gain | -------------------------------------------------------------------- | 4 | 25% | 42.990M | 46.000M | 7.0% | | 4 | 50% | 37.910M | 39.094M | 3.1% | | 4 | 75% | 34.486M | 36.124M | 4.7% | | 4 | 100% | 31.760M | 32.719M | 3.0% | -------------------------------------------------------------------- | 8 | 25% | 43.855M | 49.626M | 13.2% | | 8 | 50% | 38.328M | 42.152M | 10.0% | | 8 | 75% | 34.483M | 38.088M | 10.5% | | 8 | 100% | 31.306M | 34.686M | 10.8% | -------------------------------------------------------------------- | 12 | 25% | 38.398M | 43.770M | 14.0% | | 12 | 50% | 33.336M | 37.712M | 13.1% | | 12 | 75% | 29.917M | 34.440M | 15.1% | | 12 | 100% | 27.322M | 30.480M | 11.6% | -------------------------------------------------------------------- | 16 | 25% | 41.491M | 41.921M | 1.0% | | 16 | 50% | 36.206M | 36.474M | 0.7% | | 16 | 75% | 32.529M | 33.027M | 1.5% | | 16 | 100% | 29.581M | 30.325M | 2.5% | -------------------------------------------------------------------- | 20 | 25% | 34.240M | 36.787M | 7.4% | | 20 | 50% | 30.328M | 32.663M | 7.7% | | 20 | 75% | 27.536M | 29.354M | 6.6% | | 20 | 100% | 24.847M | 26.505M | 6.7% | -------------------------------------------------------------------- | 24 | 25% | 36.329M | 40.608M | 11.8% | | 24 | 50% | 31.444M | 35.059M | 11.5% | | 24 | 75% | 28.426M | 31.452M | 10.6% | | 24 | 100% | 26.278M | 28.741M | 9.4% | -------------------------------------------------------------------- | 28 | 25% | 31.540M | 31.944M | 1.3% | | 28 | 50% | 27.739M | 28.063M | 1.2% | | 28 | 75% | 24.993M | 25.814M | 3.3% | | 28 | 100% | 23.513M | 23.500M | -0.1% | -------------------------------------------------------------------- | 32 | 25% | 32.116M | 33.953M | 5.7% | | 32 | 50% | 28.879M | 29.859M | 3.4% | | 32 | 75% | 26.227M | 26.948M | 2.7% | | 32 | 100% | 23.829M | 24.613M | 3.3% | -------------------------------------------------------------------- | 64 | 25% | 22.535M | 22.554M | 0.1% | | 64 | 50% | 20.471M | 20.675M | 1.0% | | 64 | 75% | 19.077M | 19.146M | 0.4% | | 64 | 100% | 17.710M | 18.131M | 2.4% | -------------------------------------------------------------------- The following script was used to gather the results (SMT & frequency off): cd tools/testing/selftests/bpf for key_size in 4 8 12 16 20 24 28 32 64; do for nr_entries in `seq 16384 16384 65536`; do fullness=$(printf '%3s' $((nr_entries*100/65536))) echo -n "key_size=$key_size: $fullness% full: " sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=$key_size --nr_entries=$nr_entries --max_entries=65536 --nr_loops=2000000 --map_flags=0x40 | grep cpu done echo done Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20230401200602.3275-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 96b645bba3a4..00c253b84bf5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -607,6 +607,8 @@ free_htab: static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { + if (likely(key_len % 4 == 0)) + return jhash2(key, key_len / 4, hashrnd); return jhash(key, key_len, hashrnd); } -- cgit v1.2.3-58-ga151 From 92b2e810f0d3a2c05d8cf12a800592b238d458df Mon Sep 17 00:00:00 2001 From: Anton 
Protopopov Date: Sun, 2 Apr 2023 11:43:40 +0000 Subject: bpf: compute hashes in bloom filter similar to hashmap If the value size in a bloom filter is a multiple of 4, then the jhash2() function is used to compute hashes. The length parameter of this function equals to the number of 32-bit words in input. Compute it in the hot path instead of pre-computing it, as this is translated to one extra shift to divide the length by four vs. one extra memory load of a pre-computed length. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20230402114340.3441-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bloom_filter.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index db19784601a7..540331b610a9 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -16,13 +16,6 @@ struct bpf_bloom_filter { struct bpf_map map; u32 bitset_mask; u32 hash_seed; - /* If the size of the values in the bloom filter is u32 aligned, - * then it is more performant to use jhash2 as the underlying hash - * function, else we use jhash. This tracks the number of u32s - * in an u32-aligned value size. If the value size is not u32 aligned, - * this will be 0. - */ - u32 aligned_u32_count; u32 nr_hash_funcs; unsigned long bitset[]; }; @@ -32,9 +25,8 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value, { u32 h; - if (bloom->aligned_u32_count) - h = jhash2(value, bloom->aligned_u32_count, - bloom->hash_seed + index); + if (likely(value_size % 4 == 0)) + h = jhash2(value, value_size / 4, bloom->hash_seed + index); else h = jhash(value, value_size, bloom->hash_seed + index); @@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) bloom->nr_hash_funcs = nr_hash_funcs; bloom->bitset_mask = bitset_mask; - /* Check whether the value size is u32-aligned */ - if ((attr->value_size & (sizeof(u32) - 1)) == 0) - bloom->aligned_u32_count = - attr->value_size / sizeof(u32); - if (!(attr->map_flags & BPF_F_ZERO_SEED)) bloom->hash_seed = get_random_u32(); -- cgit v1.2.3-58-ga151 From f6a6a5a976288e4d0d94eb1c6c9e983e8e5cdb31 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 3 Apr 2023 13:00:27 -0700 Subject: bpf: Fix struct_meta lookup for bpf_obj_free_fields kfunc call bpf_obj_drop_impl has a void return type. In check_kfunc_call, the "else if" which sets insn_aux->kptr_struct_meta for bpf_obj_drop_impl is surrounded by a larger if statement which checks btf_type_is_ptr. As a result: * The bpf_obj_drop_impl-specific code will never execute * The btf_struct_meta input to bpf_obj_drop is always NULL * __bpf_obj_drop_impl will always see a NULL btf_record when called from BPF program, and won't call bpf_obj_free_fields * program-allocated kptrs which have fields that should be cleaned up by bpf_obj_free_fields may instead leak resources This patch adds a btf_type_is_void branch to the larger if and moves special handling for bpf_obj_drop_impl there, fixing the issue. 
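To make the leak concrete, consider a program shaped roughly like the following hypothetical sketch (assuming the bpf_obj_new()/bpf_obj_drop() and list kfunc declarations from the selftests' bpf_experimental.h): before this change, the element still linked on the list would not be freed when the owning object was dropped.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include "bpf_experimental.h"

  struct item {
  	struct bpf_list_node node;
  	int val;
  };

  /* A program-allocated object whose btf_record is non-trivial: the
   * bpf_list_head field must be drained by bpf_obj_free_fields() when
   * the object is dropped.
   */
  struct bag {
  	struct bpf_spin_lock lock;
  	struct bpf_list_head head __contains(item, node);
  };

  SEC("tc")
  int drop_with_special_fields(void *ctx)
  {
  	struct item *it;
  	struct bag *b;

  	b = bpf_obj_new(typeof(*b));
  	if (!b)
  		return 0;

  	it = bpf_obj_new(typeof(*it));
  	if (it) {
  		bpf_spin_lock(&b->lock);
  		bpf_list_push_back(&b->head, &it->node);
  		bpf_spin_unlock(&b->lock);
  	}

  	/* Without the struct_meta for 'struct bag' being wired through to
  	 * __bpf_obj_drop_impl(), the item still linked on b->head was
  	 * never freed by bpf_obj_free_fields().
  	 */
  	bpf_obj_drop(b);
  	return 0;
  }

  char LICENSE[] SEC("license") = "GPL";
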
Fixes: ac9f06050a35 ("bpf: Introduce bpf_obj_drop") Cc: Kumar Kartikeya Dwivedi Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230403200027.2271029-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92ae4e8ab87b..eaf9c5291cf0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10743,10 +10743,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->obj_new_size = ret_t->size; insn_aux->kptr_struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_obj_drop.btf, - meta.arg_obj_drop.btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -10875,7 +10871,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; - } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + } else if (btf_type_is_void(t)) { + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta.arg_obj_drop.btf, + meta.arg_obj_drop.btf_id); + } + } + } nargs = btf_type_vlen(meta.func_proto); args = (const struct btf_param *)(meta.func_proto + 1); -- cgit v1.2.3-58-ga151 From 7d64c513284408fee5178a0953a686e9410f2399 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:22 -0700 Subject: bpf: Invoke btf_struct_access() callback only for writes. Remove duplicated if (atype == BPF_READ) btf_struct_access() from btf_struct_access() callback and invoke it only for writes. This is possible to do because currently btf_struct_access() custom callback always delegates to generic btf_struct_access() helper for BPF_READ accesses. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 2 +- net/bpf/bpf_dummy_struct_ops.c | 2 +- net/core/filter.c | 6 ------ net/ipv4/bpf_tcp_ca.c | 3 --- 4 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eaf9c5291cf0..83984568ccb4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5504,7 +5504,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); return -EFAULT; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index ff4f89a2b02a..9535c8506cda 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -198,7 +198,7 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, if (err < 0) return err; - return atype == BPF_READ ? 
err : NOT_INIT; + return NOT_INIT; } static const struct bpf_verifier_ops bpf_dummy_verifier_ops = { diff --git a/net/core/filter.c b/net/core/filter.c index 3370efad1dda..8b9f409a2ec3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8753,9 +8753,6 @@ static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, { int ret = -EACCES; - if (atype == BPF_READ) - return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); @@ -8830,9 +8827,6 @@ static int xdp_btf_struct_access(struct bpf_verifier_log *log, { int ret = -EACCES; - if (atype == BPF_READ) - return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index ea21c96c03aa..d6465876bbf6 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -78,9 +78,6 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t; size_t end; - if (atype == BPF_READ) - return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - t = btf_type_by_id(reg->btf, reg->btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); -- cgit v1.2.3-58-ga151 From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:23 -0700 Subject: bpf: Remove unused arguments from btf_struct_access(). Remove unused arguments from btf_struct_access() callback. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +-- include/linux/filter.h | 3 +-- kernel/bpf/verifier.c | 4 ++-- net/bpf/bpf_dummy_struct_ops.c | 12 +++++------- net/core/filter.c | 13 +++++-------- net/ipv4/bpf_tcp_ca.c | 3 +-- net/netfilter/nf_conntrack_bpf.c | 3 +-- 7 files changed, 16 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2d8f3f639e68..4f689dda748f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -893,8 +893,7 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); }; struct bpf_prog_offload_ops { diff --git a/include/linux/filter.h b/include/linux/filter.h index 23c08c31bea9..5364b0c52c1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83984568ccb4..5ca520e5eddf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5459,7 +5459,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const struct btf_type *t = 
btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); enum bpf_type_flag flag = 0; - u32 btf_id; + u32 btf_id = 0; int ret; if (!env->allow_ptr_leaks) { @@ -5509,7 +5509,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); return -EFAULT; } - ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for * program allocated objects (which always have ref_obj_id > 0), diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 9535c8506cda..5918d1b32e19 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -173,14 +173,11 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t, static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + int off, int size) { const struct btf_type *state; const struct btf_type *t; s32 type_id; - int err; type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state", BTF_KIND_STRUCT); @@ -194,9 +191,10 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } - err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - if (err < 0) - return err; + if (off + size > sizeof(struct bpf_dummy_ops_state)) { + bpf_log(log, "write access at off %d with size %d\n", off, size); + return -EACCES; + } return NOT_INIT; } diff --git a/net/core/filter.c b/net/core/filter.c index 8b9f409a2ec3..1f2abf0f60e6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8742,20 +8742,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8822,14 +8820,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d6465876bbf6..4406d796cc2f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -72,8 +72,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int 
off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { const struct btf_type *t; size_t end; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 002e9d24a1e9..3f821b7ba646 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -192,8 +192,7 @@ BTF_ID(struct, nf_conn___init) /* Check writes into `struct nf_conn` */ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { const struct btf_type *ncit, *nct, *t; size_t end; -- cgit v1.2.3-58-ga151 From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:24 -0700 Subject: bpf: Refactor btf_nested_type_is_trusted(). btf_nested_type_is_trusted() tries to find a struct member at corresponding offset. It works for flat structures and falls apart in more complex structs with nested structs. The offset->member search is already performed by btf_struct_walk() including nested structs. Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted() instead of offset to make BTF_TYPE_SAFE*() logic more robust. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 ++++--- kernel/bpf/btf.c | 44 +++++++++++++++++--------------------------- kernel/bpf/verifier.c | 23 ++++++++++++----------- 3 files changed, 33 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f689dda748f..002a811b6b90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, @@ -2302,7 +2302,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix); + const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, @@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { return -EACCES; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b7e5a5510b91..593c45a294d0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6166,7 +6166,8 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const 
struct btf_type *mtype, *elem_type = NULL; @@ -6395,6 +6396,8 @@ error: if (btf_type_is_struct(stype)) { *next_btf_id = id; *flag |= tmp_flag; + if (field_name) + *field_name = mname; return WALK_PTR; } } @@ -6421,7 +6424,8 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; @@ -6453,7 +6457,7 @@ int btf_struct_access(struct bpf_verifier_log *log, t = btf_type_by_id(btf, id); do { - err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name); switch (err) { case WALK_PTR: @@ -6528,7 +6532,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL); if (err != WALK_STRUCT) return false; @@ -8488,16 +8492,15 @@ out: bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix) + const char *field_name, u32 btf_id, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; const char *tname; char safe_tname[64]; long ret, safe_id; - const struct btf_member *member, *m_walk = NULL; + const struct btf_member *member; u32 i; - const char *walk_name; walk_type = btf_type_by_id(btf, reg->btf_id); if (!walk_type) @@ -8517,30 +8520,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, if (!safe_type) return false; - for_each_member(i, walk_type, member) { - u32 moff; - - /* We're looking for the PTR_TO_BTF_ID member in the struct - * type we're walking which matches the specified offset. - * Below, we'll iterate over the fields in the safe variant of - * the struct and see if any of them has a matching type / - * name. - */ - moff = __btf_member_bit_offset(walk_type, member) / 8; - if (off == moff) { - m_walk = member; - break; - } - } - if (m_walk == NULL) - return false; - - walk_name = __btf_name_by_offset(btf, m_walk->name_off); for_each_member(i, safe_type, member) { const char *m_name = __btf_name_by_offset(btf, member->name_off); + const struct btf_type *mtype = btf_type_by_id(btf, member->type); + u32 id; + + if (!btf_type_is_ptr(mtype)) + continue; + btf_type_skip_modifiers(btf, mtype->type, &id); /* If we match on both type and name, the field is considered trusted. 
*/ - if (m_walk->type == member->type && !strcmp(walk_name, m_name)) + if (btf_id == id && !strcmp(field_name, m_name)) return true; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ca520e5eddf..2cd2e0b725cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5400,12 +5400,12 @@ BTF_TYPE_SAFE_RCU(struct css_set) { /* full trusted: these fields are trusted even outside of RCU CS and never NULL */ BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { - __bpf_md_ptr(struct seq_file *, seq); + struct seq_file *seq; }; BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct task_struct *, task); + struct bpf_iter_meta *meta; + struct task_struct *task; }; BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { @@ -5427,17 +5427,17 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) { static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } static bool type_is_trusted(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); @@ -5446,7 +5446,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5458,6 +5458,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; int ret; @@ -5526,7 +5527,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EFAULT; } - ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name); } if (ret < 0) @@ -5554,10 +5555,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * A regular RCU-protected pointer with __rcu tag can also be deemed * trusted if we are in an RCU CS. Such pointer can be NULL. 
*/ - if (type_is_trusted(env, reg, off)) { + if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { - if (type_is_rcu(env, reg, off)) { + if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ flag |= MEM_RCU; } else if (flag & MEM_RCU) { @@ -5640,7 +5641,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); - ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; -- cgit v1.2.3-58-ga151 From 91571a515d1bcdc280bb46423bb697ea7eb42ff3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:25 -0700 Subject: bpf: Teach verifier that certain helpers accept NULL pointer. bpf_[sk|inode|task|cgrp]_storage_[get|delete]() and bpf_get_socket_cookie() helpers perform run-time check that sk|inode|task|cgrp pointer != NULL. Teach verifier about this fact and allow bpf programs to pass PTR_TO_BTF_ID | PTR_MAYBE_NULL into such helpers. It will be used in the subsequent patch that will do bpf_sk_storage_get(.., skb->sk, ...); Even when 'skb' pointer is trusted the 'sk' pointer may be NULL. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-5-alexei.starovoitov@gmail.com --- kernel/bpf/bpf_cgrp_storage.c | 4 ++-- kernel/bpf/bpf_inode_storage.c | 4 ++-- kernel/bpf/bpf_task_storage.c | 8 ++++---- net/core/bpf_sk_storage.c | 4 ++-- net/core/filter.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d17d5b694668..d44fe8dd9732 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -224,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -235,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e17ad581b9be..a4d93df78c75 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -229,7 +229,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -240,6 +240,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], }; diff --git 
a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index d1af0c8f9ce4..adf6dfe0ba68 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -338,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -349,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -360,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; @@ -369,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 085025c7130a..d4172534dfa8 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -412,7 +412,7 @@ const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -424,7 +424,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .allowed = bpf_sk_storage_tracing_allowed, }; diff --git a/net/core/filter.c b/net/core/filter.c index 1f2abf0f60e6..727c5269867d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4998,7 +4998,7 @@ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) -- cgit v1.2.3-58-ga151 From add68b843f33d4e5dcbdc7ba6dffe7750a964159 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:26 -0700 Subject: bpf: Refactor NULL-ness check in check_reg_type(). check_reg_type() unconditionally disallows PTR_TO_BTF_ID | PTR_MAYBE_NULL. It's problematic for helpers that allow ARG_PTR_TO_BTF_ID_OR_NULL like bpf_sk_storage_get(). Allow passing PTR_TO_BTF_ID | PTR_MAYBE_NULL into such helpers. 
That technically includes bpf_kptr_xchg() helper, but in practice: bpf_kptr_xchg(..., bpf_cpumask_create()); is still disallowed because bpf_cpumask_create() returns ref counted pointer with ref_obj_id > 0. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-6-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cd2e0b725cd..4e7d671497f4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7168,6 +7168,8 @@ found: case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This @@ -7176,6 +7178,12 @@ found: bool strict_type_match = arg_type_is_release(arg_type) && meta->func_id != BPF_FUNC_sk_release; + if (type_may_be_null(reg->type) && + (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { + verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + return -EACCES; + } + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -7206,10 +7214,6 @@ found: } break; } - case PTR_TO_BTF_ID | PTR_MAYBE_NULL: - case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: - verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); - return -EACCES; case PTR_TO_BTF_ID | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { -- cgit v1.2.3-58-ga151 From 30ee9821f9430c34a6254134a5f0e8db227510be Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:27 -0700 Subject: bpf: Allowlist few fields similar to __rcu tag. Allow bpf program access cgrp->kn, mm->exe_file, skb->sk, req->sk. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-7-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e7d671497f4..fd90ba498ccc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5378,6 +5378,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) } #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) /* @@ -5394,10 +5395,31 @@ BTF_TYPE_SAFE_RCU(struct task_struct) { struct task_struct *group_leader; }; +BTF_TYPE_SAFE_RCU(struct cgroup) { + /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */ + struct kernfs_node *kn; +}; + BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +/* RCU trusted: these fields are trusted in RCU CS and can be NULL */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { + struct file __rcu *exe_file; +}; + +/* skb->sk, req->sk are not RCU protected, but we mark them as such + * because bpf prog accessible sockets are SOCK_RCU_FREE. 
+ */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) { + struct sock *sk; +}; + +BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) { + struct sock *sk; +}; + /* full trusted: these fields are trusted even outside of RCU CS and never NULL */ BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { struct seq_file *seq; @@ -5430,11 +5452,23 @@ static bool type_is_rcu(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } +static bool type_is_rcu_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null"); +} + static bool type_is_trusted(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -5561,9 +5595,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ flag |= MEM_RCU; - } else if (flag & MEM_RCU) { + } else if (flag & MEM_RCU || + type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ - flag |= PTR_MAYBE_NULL; + flag |= MEM_RCU | PTR_MAYBE_NULL; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { -- cgit v1.2.3-58-ga151 From afeebf9f57a4965e0be90e4dc87f8f3a1417376d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:28 -0700 Subject: bpf: Undo strict enforcement for walking untagged fields. The commit 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") broke several tracing bpf programs. Even in clang compiled kernels there are many fields that are not marked with __rcu that are safe to read and pass into helpers, but the verifier doesn't know that they're safe. Aggressively marking them as PTR_UNTRUSTED was premature. 
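For orientation, the kind of program these relaxations are meant to keep working looks roughly like the sketch below. This is not code from the series: the map name, section, and attach point are assumptions, chosen only to show a possibly-NULL skb->sk being passed straight to bpf_sk_storage_get(), which the NULL-tolerant helper arguments and the skb->sk allowlisting above now permit.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_drops SEC(".maps");

SEC("tp_btf/kfree_skb")
int BPF_PROG(count_sk_drops, struct sk_buff *skb)
{
	__u64 *cnt;

	/* skb->sk may be NULL here; the helper re-checks it at run time */
	cnt = bpf_sk_storage_get(&sk_drops, skb->sk, NULL, 0);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

Before these patches the verifier rejected a possibly-NULL sock pointer at the helper boundary; now the helper's own run-time NULL check is trusted instead.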
Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-8-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fd90ba498ccc..56f569811f70 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4974,6 +4974,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg) return reg->type & MEM_RCU; } +static void clear_trusted_flags(enum bpf_type_flag *flag) +{ + *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -5602,8 +5607,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { - /* walking unknown pointers yields untrusted pointer */ - flag = PTR_UNTRUSTED; + /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */ + clear_trusted_flags(&flag); } } else { /* @@ -5617,7 +5622,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } } else { /* Old compat. Deprecated */ - flag &= ~PTR_TRUSTED; + clear_trusted_flags(&flag); } if (atype == BPF_READ && value_regno >= 0) -- cgit v1.2.3-58-ga151 From d099f594ad5650e8d8232b5f31f5f90104e65def Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Apr 2023 00:02:54 +0200 Subject: kallsyms: Disable preemption for find_kallsyms_symbol_value Artem reported suspicious RCU usage [1]. The reason is that verifier calls find_kallsyms_symbol_value with preemption enabled which will trigger suspicious RCU usage warning in rcu_dereference_sched call. Disabling preemption in find_kallsyms_symbol_value and adding __find_kallsyms_symbol_value function. 
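The rule being leaned on is that rcu_dereference_sched() is only legal inside an RCU-sched read-side critical section, and disabling preemption provides one. A minimal, hedged illustration of the pattern (toy type and variable names, not code from this patch):

#include <linux/preempt.h>
#include <linux/rcupdate.h>

struct foo {
	unsigned long value;
};

static struct foo __rcu *foo_ptr;

static unsigned long read_foo_value(void)
{
	struct foo *p;
	unsigned long val = 0;

	preempt_disable();                  /* opens an RCU-sched read-side section */
	p = rcu_dereference_sched(foo_ptr); /* no "suspicious RCU usage" splat here */
	if (p)
		val = p->value;
	preempt_enable();

	return val;
}

The patch below applies the same idea: the __ prefixed helper keeps the rcu_dereference_sched() call, and the exported wrapper brackets it with preempt_disable()/preempt_enable() for callers such as the verifier that may run with preemption enabled.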
Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules") Reported-by: Artem Savkov Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Artem Savkov Reviewed-by: Zhen Lei Link: https://lore.kernel.org/bpf/20230403220254.2191240-1-jolsa@kernel.org [1] https://lore.kernel.org/bpf/ZBrPMkv8YVRiWwCR@samus.usersys.redhat.com/ --- kernel/module/kallsyms.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index ab2376a1be88..bdc911dbcde5 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, } /* Given a module and name of symbol, find and return the symbol's value */ -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) if (colon) { mod = find_module_all(name, colon - name, false); if (mod) - return find_kallsyms_symbol_value(mod, colon + 1); + return __find_kallsyms_symbol_value(mod, colon + 1); return 0; } @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) if (mod->state == MODULE_STATE_UNFORMED) continue; - ret = find_kallsyms_symbol_value(mod, name); + ret = __find_kallsyms_symbol_value(mod, name); if (ret) return ret; } @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name) return ret; } +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +{ + unsigned long ret; + + preempt_disable(); + ret = __find_kallsyms_symbol_value(mod, name); + preempt_enable(); + return ret; +} + int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), -- cgit v1.2.3-58-ga151 From 13fbcee55706db45ce047a7cea14811d68f94ee3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Apr 2023 09:44:55 -0700 Subject: bpf: Improve verifier JEQ/JNE insn branch taken checking Currently, for BPF_JEQ/BPF_JNE insn, verifier determines whether the branch is taken or not only if both operands are constants. Therefore, for the following code snippet, 0: (85) call bpf_ktime_get_ns#5 ; R0_w=scalar() 1: (a5) if r0 < 0x3 goto pc+2 ; R0_w=scalar(umin=3) 2: (b7) r2 = 2 ; R2_w=2 3: (1d) if r0 == r2 goto pc+2 6 At insn 3, since r0 is not a constant, verifier assumes both branch can be taken which may lead inproper verification failure. Add comparing umin/umax value and the constant. If the umin value is greater than the constant, or umax value is smaller than the constant, for JEQ the branch must be not-taken, and for JNE the branch must be taken. The jmp32 mode JEQ/JNE branch taken checking is also handled similarly. The following lists the veristat result w.r.t. 
changed number of processes insns during verification: File Program Insns (A) Insns (B) Insns (DIFF) ----------------------------------------------------- ---------------------------------------------------- --------- --------- --------------- test_cls_redirect.bpf.linked3.o cls_redirect 64980 73472 +8492 (+13.07%) test_seg6_loop.bpf.linked3.o __add_egr_x 12425 12423 -2 (-0.02%) test_tcp_hdr_options.bpf.linked3.o estab 2634 2558 -76 (-2.89%) test_parse_tcp_hdr_opt.bpf.linked3.o xdp_ingress_v6 1421 1420 -1 (-0.07%) test_parse_tcp_hdr_opt_dynptr.bpf.linked3.o xdp_ingress_v6 1238 1237 -1 (-0.08%) test_tc_dtime.bpf.linked3.o egress_fwdns_prio100 414 411 -3 (-0.72%) Mostly a small improvement but test_cls_redirect.bpf.linked3.o has a 13% regression. I checked with verifier log and found it this is due to pruning. For some JEQ/JNE branches impacted by this patch, one branch is explored and the other has state equivalence and pruned. Signed-off-by: Yonghong Song Acked-by: Dave Marchevsky Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230406164455.1045294-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56f569811f70..5c6b90e384a5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12651,10 +12651,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(subreg)) return !!tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 1; break; case BPF_JSET: if ((~subreg.mask & subreg.value) & val) @@ -12724,10 +12728,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(reg->var_off)) return !!tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 0; break; case BPF_JNE: if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 1; break; case BPF_JSET: if ((~reg->var_off.mask & reg->var_off.value) & val) -- cgit v1.2.3-58-ga151 From 953d9f5beaf75e88c69a13d70ce424cd606a29f5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Apr 2023 09:45:05 -0700 Subject: bpf: Improve handling of pattern ' ' in verifier Currently, the verifier does not handle ' ' well. For example, ... 10: (79) r1 = *(u64 *)(r10 -16) ; R1_w=scalar() R10=fp0 11: (b7) r2 = 0 ; R2_w=0 12: (2d) if r2 > r1 goto pc+2 13: (b7) r0 = 0 14: (95) exit 15: (65) if r1 s> 0x1 goto pc+3 16: (0f) r0 += r1 ... At insn 12, verifier decides both true and false branch are possible, but actually only false branch is possible. Currently, the verifier already supports patterns ' . Add support for patterns ' ' in a similar way. Also fix selftest 'verifier_bounds_mix_sign_unsign/bounds checks mixing signed and unsigned, variant 10' due to this change. 
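A small user-space model may help make the two branch-analysis changes above concrete. It is an illustrative sketch, not verifier code: a register is reduced to the unsigned range [umin, umax] the verifier tracks, a JEQ against a constant outside that range is decided as never taken, and a comparison whose constant sits in the destination register is decided by flipping it around, mirroring what flip_opcode() does in check_cond_jmp_op().

#include <stdint.h>
#include <stdio.h>

struct urange {
	uint64_t umin, umax;	/* unsigned bounds tracked per register */
};

/* 1 = branch always taken, 0 = never taken, -1 = unknown */
static int jeq_taken(struct urange r, uint64_t val)
{
	if (r.umin == r.umax)			/* register is a known constant */
		return r.umin == val;
	if (val < r.umin || val > r.umax)	/* constant cannot be in the range */
		return 0;
	return -1;
}

/* "val > reg" (unsigned) decided from the register range alone,
 * i.e. the flipped form "reg < val"
 */
static int const_gt_reg_taken(uint64_t val, struct urange r)
{
	if (r.umax < val)
		return 1;	/* every possible reg value is below val */
	if (r.umin >= val)
		return 0;	/* no possible reg value is below val */
	return -1;
}

int main(void)
{
	/* r0 after "if r0 < 3 goto ..." falls through with umin = 3 */
	struct urange r0 = { .umin = 3, .umax = UINT64_MAX };
	/* r1 is a completely unknown scalar */
	struct urange r1 = { .umin = 0, .umax = UINT64_MAX };

	printf("if r0 == 2: %d\n", jeq_taken(r0, 2));		/* 0: never taken */
	printf("if 0 > r1:  %d\n", const_gt_reg_taken(0, r1));	/* 0: never taken */
	return 0;
}

Both outcomes match the examples in the two commit messages: insn 3 in the JEQ snippet can only fall through, and insn 12 in the second snippet can never take the jump.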
Signed-off-by: Yonghong Song Acked-by: Dave Marchevsky Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230406164505.1046801-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++++++++ .../selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c6b90e384a5..3660b573048a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13356,6 +13356,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { + pred = is_branch_taken(src_reg, + tnum_subreg(dst_reg->var_off).value, + flip_opcode(opcode), + is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(dst_reg->var_off)) { + pred = is_branch_taken(src_reg, + dst_reg->var_off.value, + flip_opcode(opcode), + is_jmp32); } else if (reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg) && !is_jmp32) { diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c index 91a66357896a..4f40144748a5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds_mix_sign_unsign.c @@ -354,7 +354,7 @@ __naked void signed_and_unsigned_variant_10(void) call %[bpf_map_lookup_elem]; \ if r0 == 0 goto l0_%=; \ r1 = *(u64*)(r10 - 16); \ - r2 = 0; \ + r2 = -1; \ if r2 > r1 goto l1_%=; \ r0 = 0; \ exit; \ -- cgit v1.2.3-58-ga151 From f3f21349779776135349a8e6f114a1485b2476b7 Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Thu, 6 Apr 2023 20:18:08 -0400 Subject: bpf: ensure all memory is initialized in bpf_get_current_comm BPF helpers that take an ARG_PTR_TO_UNINIT_MEM must ensure that all of the memory is set, including beyond the end of the string. Signed-off-by: Barret Rhoden Link: https://lore.kernel.org/r/20230407001808.1622968-1-brho@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6be16db9f188..b6a5cda5bb59 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -258,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) goto err_clear; /* Verifier guarantees that size > 0 */ - strscpy(buf, task->comm, size); + strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); -- cgit v1.2.3-58-ga151 From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:47 -0700 Subject: bpf: Split off basic BPF verifier log into separate file kernel/bpf/verifier.c file is large and growing larger all the time. So it's good to start splitting off more or less self-contained parts into separate files to keep source code size (somewhat) somewhat under control. This patch is a one step in this direction, moving some of BPF verifier log routines into a separate kernel/bpf/log.c. Right now it's most low-level and isolated routines to append data to log, reset log to previous position, etc. Eventually we could probably move verifier state printing logic here as well, but this patch doesn't attempt to do that yet. 
Subsequent patches will add more logic to verifier log management, so having basics in a separate file will make sure verifier.c doesn't grow more with new changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org --- include/linux/bpf_verifier.h | 19 ++++------ kernel/bpf/Makefile | 3 +- kernel/bpf/log.c | 85 ++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 69 ----------------------------------- 4 files changed, 94 insertions(+), 82 deletions(-) create mode 100644 kernel/bpf/log.c (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 81d525d057c7..83dff25545ee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -498,11 +498,6 @@ struct bpf_verifier_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - return log->len_used >= log->len_total - 1; -} - #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 @@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) +{ + return log->len_used >= log->len_total - 1; +} + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && @@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } -static inline bool -bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) -{ - return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && - log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); -} - #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { @@ -608,12 +601,14 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; +bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 02242614dcc7..1d3892168d32 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c new file mode 100644 index 000000000000..920061e38d2e --- /dev/null +++ b/kernel/bpf/log.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* 
Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io + */ +#include +#include +#include +#include +#include + +bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); +} + +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) +{ + unsigned int n; + + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + if (log->level == BPF_LOG_KERNEL) { + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); + return; + } + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; +} + +void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program + */ +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_log); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3660b573048a..745ae0cd01d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -335,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) return &linfo[i - 1]; } -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args) -{ - unsigned int n; - - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - - if (log->level == BPF_LOG_KERNEL) { - bool newline = n > 0 && log->kbuf[n - 1] == '\n'; - - pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); - return; - } - - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) - log->len_used += n; - else - log->ubuf = NULL; -} - -static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) -{ - char zero = 0; - - if (!bpf_verifier_log_needed(log)) - return; - - log->len_used = new_pos; - if (put_user(zero, log->ubuf + new_pos)) - log->ubuf = NULL; -} - -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) 
-{ - va_list args; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(&env->log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); - __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -403,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, - const char *fmt, ...) -{ - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_log); - static const char *ltrim(const char *s) { while (isspace(*s)) -- cgit v1.2.3-58-ga151 From 03cc3aa6a53394481f01c16231f99298332066f9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:48 -0700 Subject: bpf: Remove minimum size restrictions on verifier log buffer It's not clear why we have 128 as minimum size, but it makes testing harder and seems unnecessary, as we carefully handle truncation scenarios and use proper snprintf variants. So remove this limitation and just enforce positive length for log buffer. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-3-andrii@kernel.org --- kernel/bpf/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 920061e38d2e..1974891fc324 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -11,7 +11,7 @@ bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { - return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 && log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); } -- cgit v1.2.3-58-ga151 From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:49 -0700 Subject: bpf: Switch BPF verifier log to be a rotating log by default Currently, if user-supplied log buffer to collect BPF verifier log turns out to be too small to contain full log, bpf() syscall returns -ENOSPC, fails BPF program verification/load, and preserves first N-1 bytes of the verifier log (where N is the size of user-supplied buffer). This is problematic in a bunch of common scenarios, especially when working with real-world BPF programs that tend to be pretty complex as far as verification goes and require big log buffers. Typically, it's when debugging tricky cases at log level 2 (verbose). Also, when BPF program is successfully validated, log level 2 is the only way to actually see verifier state progression and all the important details. Even with log level 1, it's possible to get -ENOSPC even if the final verifier log fits in log buffer, if there is a code path that's deep enough to fill up entire log, even if normally it would be reset later on (there is a logic to chop off successfully validated portions of BPF verifier log). In short, it's not always possible to pre-size log buffer. Also, what's worse, in practice, the end of the log most often is way more important than the beginning, but verifier stops emitting log as soon as initial log buffer is filled up. This patch switches BPF verifier log behavior to effectively behave as rotating log. 
That is, if user-supplied log buffer turns out to be too short, verifier will keep overwriting previously written log, effectively treating user's log buffer as a ring buffer. -ENOSPC is still going to be returned at the end, to notify user that log contents was truncated, but the important last N bytes of the log would be returned, which might be all that user really needs. This consistent -ENOSPC behavior, regardless of rotating or fixed log behavior, allows to prevent backwards compatibility breakage. The only user-visible change is which portion of verifier log user ends up seeing *if buffer is too small*. Given contents of verifier log itself is not an ABI, there is no breakage due to this behavior change. Specialized tools that rely on specific contents of verifier log in -ENOSPC scenario are expected to be easily adapted to accommodate old and new behaviors. Importantly, though, to preserve good user experience and not require every user-space application to adopt to this new behavior, before exiting to user-space verifier will rotate log (in place) to make it start at the very beginning of user buffer as a continuous zero-terminated string. The contents will be a chopped off N-1 last bytes of full verifier log, of course. Given beginning of log is sometimes important as well, we add BPF_LOG_FIXED (which equals 8) flag to force old behavior, which allows tools like veristat to request first part of verifier log, if necessary. BPF_LOG_FIXED flag is also a simple and straightforward way to check if BPF verifier supports rotating behavior. On the implementation side, conceptually, it's all simple. We maintain 64-bit logical start and end positions. If we need to truncate the log, start position will be adjusted accordingly to lag end position by N bytes. We then use those logical positions to calculate their matching actual positions in user buffer and handle wrap around the end of the buffer properly. Finally, right before returning from bpf_check(), we rotate user log buffer contents in-place as necessary, to make log contents contiguous. See comments in relevant functions for details. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org --- include/linux/bpf_verifier.h | 33 +++- kernel/bpf/btf.c | 3 +- kernel/bpf/log.c | 198 ++++++++++++++++++++- kernel/bpf/verifier.c | 19 +- tools/testing/selftests/bpf/prog_tests/log_fixup.c | 1 + 5 files changed, 228 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 83dff25545ee..4c926227f612 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -491,25 +491,42 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { - u32 level; - char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; + /* Logical start and end positions of a "log window" of the verifier log. + * start_pos == 0 means we haven't truncated anything. + * Once truncation starts to happen, start_pos + len_total == end_pos, + * except during log reset situations, in which (end_pos - start_pos) + * might get smaller than len_total (see bpf_vlog_reset()). + * Generally, (end_pos - start_pos) gives number of useful data in + * user log buffer. 
+ */ + u64 start_pos; + u64 end_pos; char __user *ubuf; - u32 len_used; + u32 level; u32 len_total; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 +#define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) -#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline u32 bpf_log_used(const struct bpf_verifier_log *log) +{ + return log->end_pos - log->start_pos; +} + static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { - return log->len_used >= log->len_total - 1; + if (log->level & BPF_LOG_FIXED) + return bpf_log_used(log) >= log->len_total - 1; + return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) @@ -596,7 +613,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; - u32 prev_log_len, prev_insn_print_len; + u64 prev_log_pos, prev_insn_print_pos; /* buffer used in reg_type_str() to generate reg_type string */ char type_str_buf[TYPE_STR_BUF_LEN]; }; @@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); -void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); +void bpf_vlog_finalize(struct bpf_verifier_log *log); +bool bpf_vlog_truncated(const struct bpf_verifier_log *log); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 593c45a294d0..20a05b8932db 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5593,7 +5593,8 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } } - if (log->level && bpf_verifier_log_full(log)) { + bpf_vlog_finalize(log); + if (log->level && bpf_vlog_truncated(log)) { err = -ENOSPC; goto errout_meta; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 1974891fc324..92b1c8ad6601 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -8,6 +8,7 @@ #include #include #include +#include bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { @@ -32,23 +33,202 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, return; } - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) - log->len_used += n; - else - log->ubuf = NULL; + if (log->level & BPF_LOG_FIXED) { + n = min(log->len_total - bpf_log_used(log) - 1, n); + log->kbuf[n] = '\0'; + n += 1; + + if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n)) + goto fail; + + log->end_pos += n - 1; /* don't count terminating '\0' */ + } else { + u64 new_end, new_start, cur_pos; + u32 buf_start, buf_end, new_n; + + n += 1; + + new_end = log->end_pos + n; + if (new_end - log->start_pos >= log->len_total) + new_start = new_end - log->len_total; + else + new_start = log->start_pos; + new_n = min(n, log->len_total); + cur_pos = new_end - new_n; + + div_u64_rem(cur_pos, log->len_total, &buf_start); + div_u64_rem(new_end, log->len_total, &buf_end); + /* new_end and buf_end are exclusive indices, so if buf_end is + * exactly zero, then it actually points right to 
the end of + * ubuf and there is no wrap around + */ + if (buf_end == 0) + buf_end = log->len_total; + + /* if buf_start > buf_end, we wrapped around; + * if buf_start == buf_end, then we fill ubuf completely; we + * can't have buf_start == buf_end to mean that there is + * nothing to write, because we always write at least + * something, even if terminal '\0' + */ + if (buf_start < buf_end) { + /* message fits within contiguous chunk of ubuf */ + if (copy_to_user(log->ubuf + buf_start, + log->kbuf + n - new_n, + buf_end - buf_start)) + goto fail; + } else { + /* message wraps around the end of ubuf, copy in two chunks */ + if (copy_to_user(log->ubuf + buf_start, + log->kbuf + n - new_n, + log->len_total - buf_start)) + goto fail; + if (copy_to_user(log->ubuf, + log->kbuf + n - buf_end, + buf_end)) + goto fail; + } + + log->start_pos = new_start; + log->end_pos = new_end - 1; /* don't count terminating '\0' */ + } + + return; +fail: + log->ubuf = NULL; } -void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) { char zero = 0; + u32 pos; + + if (WARN_ON_ONCE(new_pos > log->end_pos)) + return; if (!bpf_verifier_log_needed(log)) return; - log->len_used = new_pos; - if (put_user(zero, log->ubuf + new_pos)) + /* if position to which we reset is beyond current log window, + * then we didn't preserve any useful content and should adjust + * start_pos to end up with an empty log (start_pos == end_pos) + */ + log->end_pos = new_pos; + if (log->end_pos < log->start_pos) + log->start_pos = log->end_pos; + div_u64_rem(new_pos, log->len_total, &pos); + if (put_user(zero, log->ubuf + pos)) + log->ubuf = NULL; +} + +static void bpf_vlog_reverse_kbuf(char *buf, int len) +{ + int i, j; + + for (i = 0, j = len - 1; i < j; i++, j--) + swap(buf[i], buf[j]); +} + +static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) +{ + /* we split log->kbuf into two equal parts for both ends of array */ + int n = sizeof(log->kbuf) / 2, nn; + char *lbuf = log->kbuf, *rbuf = log->kbuf + n; + + /* Read ubuf's section [start, end) two chunks at a time, from left + * and right side; within each chunk, swap all the bytes; after that + * reverse the order of lbuf and rbuf and write result back to ubuf. + * This way we'll end up with swapped contents of specified + * [start, end) ubuf segment. + */ + while (end - start > 1) { + nn = min(n, (end - start ) / 2); + + if (copy_from_user(lbuf, log->ubuf + start, nn)) + return -EFAULT; + if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) + return -EFAULT; + + bpf_vlog_reverse_kbuf(lbuf, nn); + bpf_vlog_reverse_kbuf(rbuf, nn); + + /* we write lbuf to the right end of ubuf, while rbuf to the + * left one to end up with properly reversed overall ubuf + */ + if (copy_to_user(log->ubuf + start, rbuf, nn)) + return -EFAULT; + if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) + return -EFAULT; + + start += nn; + end -= nn; + } + + return 0; +} + +bool bpf_vlog_truncated(const struct bpf_verifier_log *log) +{ + if (log->level & BPF_LOG_FIXED) + return bpf_log_used(log) >= log->len_total - 1; + else + return log->start_pos > 0; +} + +void bpf_vlog_finalize(struct bpf_verifier_log *log) +{ + u32 sublen; + int err; + + if (!log || !log->level || !log->ubuf) + return; + if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL) + return; + + /* If we never truncated log, there is nothing to move around. 
*/ + if (log->start_pos == 0) + return; + + /* Otherwise we need to rotate log contents to make it start from the + * buffer beginning and be a continuous zero-terminated string. Note + * that if log->start_pos != 0 then we definitely filled up entire log + * buffer with no gaps, and we just need to shift buffer contents to + * the left by (log->start_pos % log->len_total) bytes. + * + * Unfortunately, user buffer could be huge and we don't want to + * allocate temporary kernel memory of the same size just to shift + * contents in a straightforward fashion. Instead, we'll be clever and + * do in-place array rotation. This is a leetcode-style problem, which + * could be solved by three rotations. + * + * Let's say we have log buffer that has to be shifted left by 7 bytes + * (spaces and vertical bar is just for demonstrative purposes): + * E F G H I J K | A B C D + * + * First, we reverse entire array: + * D C B A | K J I H G F E + * + * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes + * (KJIHGFE), resulting in a properly rotated array: + * A B C D | E F G H I J K + * + * We'll utilize log->kbuf to read user memory chunk by chunk, swap + * bytes, and write them back. Doing it byte-by-byte would be + * unnecessarily inefficient. Altogether we are going to read and + * write each byte twice, for total 4 memory copies between kernel and + * user space. + */ + + /* length of the chopped off part that will be the beginning; + * len(ABCD) in the example above + */ + div_u64_rem(log->start_pos, log->len_total, &sublen); + sublen = log->len_total - sublen; + + err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); + err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); + err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); + if (err) log->ubuf = NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 745ae0cd01d4..a476bb319685 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1439,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos) static void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) { - if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_len - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } @@ -1750,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -2286,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { @@ -15638,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env) print_insn_state(env, state->frame[state->curframe]); verbose_linfo(env, env->insn_idx, "; "); - env->prev_log_len = env->log.len_used; + env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", 
env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); - env->prev_insn_print_len = env->log.len_used - env->prev_log_len; - env->prev_log_len = env->log.len_used; + env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; + env->prev_log_pos = env->log.end_pos; } if (bpf_prog_is_offloaded(env->prog->aux)) { @@ -18860,7 +18860,8 @@ skip_full_check: print_verification_stats(env); env->prog->aux->verified_insns = env->insn_processed; - if (log->level && bpf_verifier_log_full(log)) + bpf_vlog_finalize(log); + if (log->level && bpf_vlog_truncated(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { ret = -EFAULT; diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c index 239e1c5753b0..bc27170bdeb0 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c +++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c @@ -24,6 +24,7 @@ static void bad_core_relo(size_t log_buf_size, enum trunc_type trunc_type) bpf_program__set_autoload(skel->progs.bad_relo, true); memset(log_buf, 0, sizeof(log_buf)); bpf_program__set_log_buf(skel->progs.bad_relo, log_buf, log_buf_size ?: sizeof(log_buf)); + bpf_program__set_log_level(skel->progs.bad_relo, 1 | 8); /* BPF_LOG_FIXED to force truncation */ err = test_log_fixup__load(skel); if (!ASSERT_ERR(err, "load_fail")) -- cgit v1.2.3-58-ga151 From 24bc80887adb4d6fc0057d4f14fabeaa4502b2a0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:53 -0700 Subject: bpf: Ignore verifier log reset in BPF_LOG_KERNEL mode Verifier log position reset is meaningless in BPF_LOG_KERNEL mode, so just exit early in bpf_vlog_reset() if log->level is BPF_LOG_KERNEL. This avoid meaningless put_user() into NULL log->ubuf. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-8-andrii@kernel.org --- kernel/bpf/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 92b1c8ad6601..d99a50f07187 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -106,7 +106,7 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) if (WARN_ON_ONCE(new_pos > log->end_pos)) return; - if (!bpf_verifier_log_needed(log)) + if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) return; /* if position to which we reset is beyond current log window, -- cgit v1.2.3-58-ga151 From 971fb5057d787d0a7e7c8cb910207c82e2db920e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:54 -0700 Subject: bpf: Fix missing -EFAULT return on user log buf error in btf_parse() btf_parse() is missing -EFAULT error return if log->ubuf was NULL-ed out due to error while copying data into user-provided buffer. Add it, but handle a special case of BPF_LOG_KERNEL in which log->ubuf is always NULL. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-9-andrii@kernel.org --- kernel/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 20a05b8932db..6372c144a294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5598,6 +5598,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, err = -ENOSPC; goto errout_meta; } + if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) { + err = -EFAULT; + goto errout_meta; + } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); -- cgit v1.2.3-58-ga151 From cbedb42a0da3bb48819b2200af4b4cb5d922c518 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:55 -0700 Subject: bpf: Avoid incorrect -EFAULT error in BPF_LOG_KERNEL mode If verifier log is in BPF_LOG_KERNEL mode, no log->ubuf is expected and it stays NULL throughout entire verification process. Don't erroneously return -EFAULT in such case. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-10-andrii@kernel.org --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476bb319685..0323149803f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18863,7 +18863,7 @@ skip_full_check: bpf_vlog_finalize(log); if (log->level && bpf_vlog_truncated(log)) ret = -ENOSPC; - if (log->level && !log->ubuf) { + if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) { ret = -EFAULT; goto err_release_maps; } -- cgit v1.2.3-58-ga151 From 8a6ca6bc553e3c878fa53c506bc6ec281efdc039 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:56 -0700 Subject: bpf: Simplify logging-related error conditions handling Move log->level == 0 check into bpf_vlog_truncated() instead of doing it explicitly. Also remove unnecessary goto in kernel/bpf/verifier.c. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-11-andrii@kernel.org --- kernel/bpf/btf.c | 2 +- kernel/bpf/log.c | 4 +++- kernel/bpf/verifier.c | 6 ++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6372c144a294..5aa540ee611f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5594,7 +5594,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } bpf_vlog_finalize(log); - if (log->level && bpf_vlog_truncated(log)) { + if (bpf_vlog_truncated(log)) { err = -ENOSPC; goto errout_meta; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index d99a50f07187..c778f3b290cb 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -169,7 +169,9 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en bool bpf_vlog_truncated(const struct bpf_verifier_log *log) { - if (log->level & BPF_LOG_FIXED) + if (!log->level) + return false; + else if (log->level & BPF_LOG_FIXED) return bpf_log_used(log) >= log->len_total - 1; else return log->start_pos > 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0323149803f5..a98cbc046d1e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18861,12 +18861,10 @@ skip_full_check: env->prog->aux->verified_insns = env->insn_processed; bpf_vlog_finalize(log); - if (log->level && bpf_vlog_truncated(log)) + if (bpf_vlog_truncated(log)) ret = -ENOSPC; - if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) { + if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) ret = -EFAULT; - goto err_release_maps; - } if (ret) goto err_release_maps; -- cgit v1.2.3-58-ga151 From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:57 -0700 Subject: bpf: Keep track of total log content size in both fixed and rolling modes Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos as *logical* log position. This means that we can go beyond physical log buffer size now and be able to tell what log buffer size should be to fit entire log contents without -ENOSPC. To do this for BPF_LOG_FIXED mode, we need to remove a short-circuiting logic of not vsnprintf()'ing further log content once we filled up user-provided buffer, which is done by bpf_verifier_log_needed() checks. We modify these checks to always keep going if log->level is non-zero (i.e., log is requested), even if log->ubuf was NULL'ed out due to copying data to user-space, or if entire log buffer is physically full. We adopt bpf_verifier_vlog() routine to work correctly with log->ubuf == NULL condition, performing log formatting into temporary kernel buffer, doing all the necessary accounting, but just avoiding copying data out if buffer is full or NULL'ed out. With these changes, it's now possible to do this sort of determination of log contents size in both BPF_LOG_FIXED and default rolling log mode. We need to keep in mind bpf_vlog_reset(), though, which shrinks log contents after successful verification of a particular code path. This log reset means that log->end_pos isn't always increasing, so to return back to users what should be the log buffer size to fit all log content without causing -ENOSPC even in the presence of log resetting, we need to keep maximum over "lifetime" of logging. We do this accounting in bpf_vlog_update_len_max() helper. 
A related and subtle aspect is that with this logical log->end_pos, even in BPF_LOG_FIXED mode we could temporarily "overflow" the buffer, but then reset it back with bpf_vlog_reset() to a position inside the user-supplied log_buf. In such a situation we still want to properly maintain the terminating zero. We will eventually return -ENOSPC even if the final log buffer is small (we detect this through the log->len_max check). This behavior is simpler to reason about and is consistent with the current behavior of the verifier log. Handling this required a small addition to the bpf_vlog_reset() logic to avoid doing put_user() beyond the physical log buffer dimensions. Another issue to keep in mind is that we limit the log buffer size to a 32-bit value and keep the log length as u32, but theoretically the verifier could produce a huge log stretching beyond 4GB. Instead of keeping (and later returning) a 64-bit log length, we cap it at UINT_MAX. The current UAPI makes it impossible to specify a log buffer size bigger than 4GB anyway, so we don't really lose anything here and keep everything consistently 32-bit in the UAPI. This property will be utilized in the next patch. Doing the same determination of the maximum log buffer for rolling mode is trivial, as log->end_pos and log->start_pos are already logical positions, so there is nothing new there. These changes incidentally fix one small issue with the previous logging logic. Previously, if the user provided a log buffer of size N and the actual log output was exactly N-1 bytes + terminating \0, the kernel couldn't distinguish this condition from the log truncation scenario, which would also end up with truncated log contents of N-1 bytes + terminating \0. But now, with log->end_pos being a logical position that can go beyond the actual log buffer size, we can distinguish these two conditions, which we do in this patch. This plays nicely with returning log_size_actual (implemented in the UAPI in the next patch), as we can now guarantee that if the user takes such log_size_actual and provides a log buffer of that exact size, they will not get -ENOSPC in return. All in all, these changes conceptually unify fixed and rolling log modes much better, and enable a nice feature requested by users: knowing what the buffer size should be to avoid -ENOSPC. We'll plumb this through the UAPI and the code in the next patch. 
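As a rough illustration (not taken from the patch; the names below are invented for the sketch), the new accounting boils down to maintaining a logical maximum content size and comparing it against the physical buffer size, which is what lets an exact fit be told apart from truncation:

/* Minimal sketch of the logical len_max accounting described above.
 * 'len_total' is the user-supplied buffer size; 'len_max' is the
 * largest logical end position ever reached (terminating '\0'
 * included); 'end_pos' may shrink again via log resets.
 */
#include <stdbool.h>
#include <stdint.h>

struct vlog_sketch {
	uint32_t len_total;	/* physical log buffer size */
	uint32_t len_max;	/* max logical content size seen so far */
	uint64_t end_pos;	/* current logical end position */
};

static void sketch_update_len_max(struct vlog_sketch *log, uint32_t add_len)
{
	uint64_t len = log->end_pos + add_len;	/* add_len includes '\0' */

	if (len > UINT32_MAX)
		log->len_max = UINT32_MAX;
	else if (len > log->len_max)
		log->len_max = (uint32_t)len;
}

static bool sketch_truncated(const struct vlog_sketch *log)
{
	/* an exact fit (len_max == len_total) is no longer reported as
	 * truncation; only going past the physical size is
	 */
	return log->len_max > log->len_total;
}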
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org --- include/linux/bpf_verifier.h | 12 ++------ kernel/bpf/log.c | 67 ++++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c926227f612..98d2eb382dbb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -504,6 +504,7 @@ struct bpf_verifier_log { char __user *ubuf; u32 level; u32 len_total; + u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; @@ -517,23 +518,16 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline u32 bpf_log_used(const struct bpf_verifier_log *log) -{ - return log->end_pos - log->start_pos; -} - static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { if (log->level & BPF_LOG_FIXED) - return bpf_log_used(log) >= log->len_total - 1; + return log->end_pos >= log->len_total; return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log && - ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL); + return log && log->level; } #define BPF_MAX_SUBPROGS 256 diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index c778f3b290cb..47bea2fad6fe 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -16,10 +16,26 @@ bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); } +static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) +{ + /* add_len includes terminal \0, so no need for +1. 
*/ + u64 len = log->end_pos + add_len; + + /* log->len_max could be larger than our current len due to + * bpf_vlog_reset() calls, so we maintain the max of any length at any + * previous point + */ + if (len > UINT_MAX) + log->len_max = UINT_MAX; + else if (len > log->len_max) + log->len_max = len; +} + void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { - unsigned int n; + u64 cur_pos; + u32 new_n, n; n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); @@ -33,21 +49,27 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, return; } - if (log->level & BPF_LOG_FIXED) { - n = min(log->len_total - bpf_log_used(log) - 1, n); - log->kbuf[n] = '\0'; - n += 1; + n += 1; /* include terminating zero */ + bpf_vlog_update_len_max(log, n); - if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n)) - goto fail; + if (log->level & BPF_LOG_FIXED) { + /* check if we have at least something to put into user buf */ + new_n = 0; + if (log->end_pos < log->len_total) { + new_n = min_t(u32, log->len_total - log->end_pos, n); + log->kbuf[new_n - 1] = '\0'; + } + cur_pos = log->end_pos; log->end_pos += n - 1; /* don't count terminating '\0' */ + + if (log->ubuf && new_n && + copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) + goto fail; } else { - u64 new_end, new_start, cur_pos; + u64 new_end, new_start; u32 buf_start, buf_end, new_n; - n += 1; - new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) new_start = new_end - log->len_total; @@ -65,6 +87,12 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, if (buf_end == 0) buf_end = log->len_total; + log->start_pos = new_start; + log->end_pos = new_end - 1; /* don't count terminating '\0' */ + + if (!log->ubuf) + return; + /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is @@ -88,9 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, buf_end)) goto fail; } - - log->start_pos = new_start; - log->end_pos = new_end - 1; /* don't count terminating '\0' */ } return; @@ -116,8 +141,13 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) log->end_pos = new_pos; if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; - div_u64_rem(new_pos, log->len_total, &pos); - if (put_user(zero, log->ubuf + pos)) + + if (log->level & BPF_LOG_FIXED) + pos = log->end_pos + 1; + else + div_u64_rem(new_pos, log->len_total, &pos); + + if (log->ubuf && pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } @@ -169,12 +199,7 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en bool bpf_vlog_truncated(const struct bpf_verifier_log *log) { - if (!log->level) - return false; - else if (log->level & BPF_LOG_FIXED) - return bpf_log_used(log) >= log->len_total - 1; - else - return log->start_pos > 0; + return log->len_max > log->len_total; } void bpf_vlog_finalize(struct bpf_verifier_log *log) -- cgit v1.2.3-58-ga151 From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size field to BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return the size of log buffer necessary to fit in all the log contents at specified log_level. 
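For illustration only (this is not part of the patch; sys_bpf() and btf_load_with_sized_log() are invented names, and the sketch assumes a kernel and UAPI headers that already carry btf_log_true_size), a userspace caller could use the new field to size the BTF verifier log in two passes:

/* Hypothetical two-pass log sizing for BPF_BTF_LOAD. */
#include <linux/bpf.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

static int btf_load_with_sized_log(const void *btf, __u32 btf_size)
{
	union bpf_attr attr;
	char small[64], *big;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.btf = (__u64)(unsigned long)btf;
	attr.btf_size = btf_size;
	attr.btf_log_level = 1;
	attr.btf_log_buf = (__u64)(unsigned long)small;
	attr.btf_log_size = sizeof(small);

	/* first attempt: tiny buffer; on return the kernel reports the
	 * size needed to hold the full log in btf_log_true_size
	 */
	fd = sys_bpf(BPF_BTF_LOAD, &attr, sizeof(attr));
	if (fd >= 0 || attr.btf_log_true_size <= sizeof(small))
		return fd;

	/* second attempt: buffer of exactly btf_log_true_size bytes */
	big = malloc(attr.btf_log_true_size);
	if (!big)
		return -1;
	attr.btf_log_buf = (__u64)(unsigned long)big;
	attr.btf_log_size = attr.btf_log_true_size;
	fd = sys_bpf(BPF_BTF_LOAD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "%s\n", big);	/* full verifier log */
	free(big);
	return fd;
}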
This is very useful for BPF loader libraries like libbpf to be able to size log buffer correctly, but could be used by users directly, if necessary, as well. This patch plumbs all this through the code, taking into account actual bpf_attr size provided by user to determine if these new fields are expected by users. And if they are, set them from kernel on return. We refactory btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review logic vs UAPI changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- include/linux/bpf.h | 2 +- include/linux/btf.h | 2 +- include/uapi/linux/bpf.h | 10 ++++++++++ kernel/bpf/btf.c | 32 ++++++++++++++++++-------------- kernel/bpf/syscall.c | 16 ++++++++-------- kernel/bpf/verifier.c | 8 +++++++- tools/include/uapi/linux/bpf.h | 12 +++++++++++- 7 files changed, 56 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 002a811b6b90..2c6095bd7d69 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/btf.h b/include/linux/btf.h index d53b10cc55f2..495250162422 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -125,7 +125,7 @@ extern const struct file_operations btf_fops; void btf_get(struct btf *btf); void btf_put(struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr); +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e3d3b5160d26..3823100b7934 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1407,6 +1407,11 @@ union bpf_attr { __aligned_u64 fd_array; /* array of FDs */ __aligned_u64 core_relos; __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1492,6 +1497,11 @@ union bpf_attr { __u32 btf_size; __u32 btf_log_size; __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). 
+ */ + __u32 btf_log_true_size; }; struct { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5aa540ee611f..0748cf4b8ab6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5504,9 +5504,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, - u32 log_level, char __user *log_ubuf, u32 log_size) +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { + bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); + char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; @@ -5514,7 +5515,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u8 *data; int err; - if (btf_data_size > BTF_MAX_SIZE) + if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); @@ -5522,13 +5523,13 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, return ERR_PTR(-ENOMEM); log = &env->log; - if (log_level || log_ubuf || log_size) { + if (attr->btf_log_level || log_ubuf || attr->btf_log_size) { /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log->level = log_level; + log->level = attr->btf_log_level; log->ubuf = log_ubuf; - log->len_total = log_size; + log->len_total = attr->btf_log_size; /* log attributes have to be sane */ if (!bpf_verifier_log_attr_valid(log)) { @@ -5544,16 +5545,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } env->btf = btf; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; goto errout; } btf->data = data; - btf->data_size = btf_data_size; + btf->data_size = attr->btf_size; - if (copy_from_bpfptr(data, btf_data, btf_data_size)) { + if (copy_from_bpfptr(data, btf_data, attr->btf_size)) { err = -EFAULT; goto errout; } @@ -5594,6 +5595,12 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } bpf_vlog_finalize(log); + if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), + &log->len_max, sizeof(log->len_max))) { + err = -EFAULT; + goto errout_meta; + } if (bpf_vlog_truncated(log)) { err = -ENOSPC; goto errout_meta; @@ -7218,15 +7225,12 @@ static int __btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { struct btf *btf; int ret; - btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel), - attr->btf_size, attr->btf_log_level, - u64_to_user_ptr(attr->btf_log_buf), - attr->btf_log_size); + btf = btf_parse(attr, uattr, uattr_size); if (IS_ERR(btf)) return PTR_ERR(btf); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e18ac7fdc210..6d575505f89c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2501,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size +#define BPF_PROG_LOAD_LAST_FIELD log_true_size -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { 
enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2653,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) goto free_prog_sec; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr); + err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; @@ -4371,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_level +#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; @@ -4381,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) if (!bpf_capable()) return -EPERM; - return btf_new_fd(attr, uattr); + return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id @@ -5059,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr); + err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -5104,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr); + err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a98cbc046d1e..308e7abeb979 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18694,7 +18694,7 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; @@ -18861,6 +18861,12 @@ skip_full_check: env->prog->aux->verified_insns = env->insn_processed; bpf_vlog_finalize(log); + if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), + &log->len_max, sizeof(log->len_max))) { + ret = -EFAULT; + goto err_release_maps; + } if (bpf_vlog_truncated(log)) ret = -ENOSPC; if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6c5a022ae28..3823100b7934 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1407,6 +1407,11 @@ union bpf_attr { __aligned_u64 fd_array; /* array of FDs */ __aligned_u64 core_relos; __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1492,6 +1497,11 @@ union bpf_attr { __u32 btf_size; __u32 btf_log_size; __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). 
+ */ + __u32 btf_log_true_size; }; struct { @@ -1513,7 +1523,7 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ union { __u32 prog_fd; /* eBPF program to attach */ - __u32 map_fd; /* eBPF struct_ops to attach */ + __u32 map_fd; /* struct_ops to attach */ }; union { __u32 target_fd; /* object to attach to */ -- cgit v1.2.3-58-ga151 From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:59 -0700 Subject: bpf: Simplify internal verifier log interface Simplify internal verifier log API down to bpf_vlog_init() and bpf_vlog_finalize(). The former handles input arguments validation in one place and makes it easier to change it. The latter subsumes -ENOSPC (truncation) and -EFAULT handling and simplifies both caller's code (bpf_check() and btf_parse()). For btf_parse(), this patch also makes sure that verifier log finalization happens even if there is some error condition during BTF verification process prior to normal finalization step. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org --- include/linux/bpf_verifier.h | 13 ++------- kernel/bpf/btf.c | 65 ++++++++++++++++++++++---------------------- kernel/bpf/log.c | 48 ++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 39 +++++++++++--------------- 4 files changed, 90 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 98d2eb382dbb..f03852b89d28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -518,13 +518,6 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - if (log->level & BPF_LOG_FIXED) - return log->end_pos >= log->len_total; - return false; -} - static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; @@ -612,16 +605,16 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; -bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); -void bpf_vlog_finalize(struct bpf_verifier_log *log); -bool bpf_vlog_truncated(const struct bpf_verifier_log *log); +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0748cf4b8ab6..ffc31a1c84af 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5504,16 +5504,30 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } +static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) +{ + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), + &log_true_size, sizeof(log_true_size))) + err = 
-EFAULT; + + return err; +} + static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; - struct bpf_verifier_log *log; struct btf *btf = NULL; u8 *data; - int err; + int err, ret; if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); @@ -5522,21 +5536,13 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat if (!env) return ERR_PTR(-ENOMEM); - log = &env->log; - if (attr->btf_log_level || log_ubuf || attr->btf_log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the verification trace - */ - log->level = attr->btf_log_level; - log->ubuf = log_ubuf; - log->len_total = attr->btf_log_size; - - /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) { - err = -EINVAL; - goto errout; - } - } + /* user could have requested verbose verifier output + * and supplied buffer to store the verification trace + */ + err = bpf_vlog_init(&env->log, attr->btf_log_level, + log_ubuf, attr->btf_log_size); + if (err) + goto errout_free; btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { @@ -5577,7 +5583,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat if (err) goto errout; - struct_meta_tab = btf_parse_struct_metas(log, btf); + struct_meta_tab = btf_parse_struct_metas(&env->log, btf); if (IS_ERR(struct_meta_tab)) { err = PTR_ERR(struct_meta_tab); goto errout; @@ -5594,21 +5600,9 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat } } - bpf_vlog_finalize(log); - if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), - &log->len_max, sizeof(log->len_max))) { - err = -EFAULT; - goto errout_meta; - } - if (bpf_vlog_truncated(log)) { - err = -ENOSPC; - goto errout_meta; - } - if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) { - err = -EFAULT; - goto errout_meta; - } + err = finalize_log(&env->log, uattr, uattr_size); + if (err) + goto errout_free; btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); @@ -5617,6 +5611,11 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat errout_meta: btf_free_struct_meta_tab(btf); errout: + /* overwrite err with -ENOSPC or -EFAULT */ + ret = finalize_log(&env->log, uattr, uattr_size); + if (ret) + err = ret; +errout_free: btf_verifier_env_free(env); if (btf) btf_free(btf); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 47bea2fad6fe..1fae2c5d7ae4 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -10,12 +10,26 @@ #include #include -bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 && log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); } +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size) +{ + log->level = log_level; + log->ubuf = log_buf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (!bpf_verifier_log_attr_valid(log)) + return -EINVAL; + + return 0; +} + static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) { /* add_len includes terminal \0, so no need for +1. 
*/ @@ -197,24 +211,25 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en return 0; } -bool bpf_vlog_truncated(const struct bpf_verifier_log *log) +static bool bpf_vlog_truncated(const struct bpf_verifier_log *log) { return log->len_max > log->len_total; } -void bpf_vlog_finalize(struct bpf_verifier_log *log) +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; int err; - if (!log || !log->level || !log->ubuf) - return; - if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL) - return; + *log_size_actual = 0; + if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) + return 0; + if (!log->ubuf) + goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ - if (log->start_pos == 0) - return; + if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0) + goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the * buffer beginning and be a continuous zero-terminated string. Note @@ -257,6 +272,21 @@ void bpf_vlog_finalize(struct bpf_verifier_log *log) err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); if (err) log->ubuf = NULL; + +skip_log_rotate: + *log_size_actual = log->len_max; + + /* properly initialized log has either both ubuf!=NULL and len_total>0 + * or ubuf==NULL and len_total==0, so if this condition doesn't hold, + * we got a fault somewhere along the way, so report it back + */ + if (!!log->ubuf != !!log->len_total) + return -EFAULT; + + if (bpf_vlog_truncated(log)) + return -ENOSPC; + + return 0; } /* log_level controls verbosity level of eBPF verifier. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 308e7abeb979..d6db6de3e9ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18698,8 +18698,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; - struct bpf_verifier_log *log; - int i, len, ret = -EINVAL; + int i, len, ret = -EINVAL, err; + u32 log_true_size; bool is_priv; /* no program is valid */ @@ -18712,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; - log = &env->log; len = (*prog)->len; env->insn_aux_data = @@ -18733,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!is_priv) mutex_lock(&bpf_verifier_lock); - if (attr->log_level || attr->log_buf || attr->log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the verification trace - */ - log->level = attr->log_level; - log->ubuf = (char __user *) (unsigned long) attr->log_buf; - log->len_total = attr->log_size; - - /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) { - ret = -EINVAL; - goto err_unlock; - } - } + /* user could have requested verbose verifier output + * and supplied buffer to store the verification trace + */ + ret = bpf_vlog_init(&env->log, attr->log_level, + (char __user *) (unsigned long) attr->log_buf, + attr->log_size); + if (ret) + goto err_unlock; mark_verifier_state_clean(env); @@ -18860,17 +18853,17 @@ skip_full_check: print_verification_stats(env); env->prog->aux->verified_insns = env->insn_processed; - bpf_vlog_finalize(log); + /* preserve original error even if log finalization is successful */ + err = bpf_vlog_finalize(&env->log, &log_true_size); + if (err) + ret = err; 
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), - &log->len_max, sizeof(log->len_max))) { + &log_true_size, sizeof(log_true_size))) { ret = -EFAULT; goto err_release_maps; } - if (bpf_vlog_truncated(log)) - ret = -ENOSPC; - if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) - ret = -EFAULT; if (ret) goto err_release_maps; -- cgit v1.2.3-58-ga151 From fac08d45e2531f91d8fb3d11fc6576f588049476 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:42:00 -0700 Subject: bpf: Relax log_buf NULL conditions when log_level>0 is requested Drop the log_size>0 and log_buf!=NULL condition when log_level>0. This allows users to request log_true_size of a full log without providing actual (even if small) log buffer. Verifier log handling code was mostly ready to handle NULL log->ubuf, so only few small changes were necessary to prevent NULL log->ubuf from causing problems. Note, that if user provided NULL log_buf with log_level>0 we don't consider this a log truncation, and thus won't return -ENOSPC. We also enforce that either (log_buf==NULL && log_size==0) or (log_buf!=NULL && log_size>0). Suggested-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-15-andrii@kernel.org --- kernel/bpf/log.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 1fae2c5d7ae4..046ddff37a76 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -12,8 +12,17 @@ static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { - return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 && - log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); + /* ubuf and len_total should both be specified (or not) together */ + if (!!log->ubuf != !!log->len_total) + return false; + /* log buf without log_level is meaningless */ + if (log->ubuf && log->level == 0) + return false; + if (log->level & ~BPF_LOG_MASK) + return false; + if (log->len_total > UINT_MAX >> 2) + return false; + return true; } int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, @@ -89,9 +98,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, new_start = new_end - log->len_total; else new_start = log->start_pos; + + log->start_pos = new_start; + log->end_pos = new_end - 1; /* don't count terminating '\0' */ + + if (!log->ubuf) + return; + new_n = min(n, log->len_total); cur_pos = new_end - new_n; - div_u64_rem(cur_pos, log->len_total, &buf_start); div_u64_rem(new_end, log->len_total, &buf_end); /* new_end and buf_end are exclusive indices, so if buf_end is @@ -101,12 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, if (buf_end == 0) buf_end = log->len_total; - log->start_pos = new_start; - log->end_pos = new_end - 1; /* don't count terminating '\0' */ - - if (!log->ubuf) - return; - /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is @@ -156,12 +165,15 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; + if (!log->ubuf) + return; + if (log->level & BPF_LOG_FIXED) pos = log->end_pos + 1; else div_u64_rem(new_pos, log->len_total, &pos); - if (log->ubuf && pos 
< log->len_total && put_user(zero, log->ubuf + pos)) + if (pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } @@ -211,11 +223,6 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en return 0; } -static bool bpf_vlog_truncated(const struct bpf_verifier_log *log) -{ - return log->len_max > log->len_total; -} - int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; @@ -228,7 +235,7 @@ int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) if (!log->ubuf) goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ - if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0) + if (log->start_pos == 0) goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the @@ -283,7 +290,8 @@ skip_log_rotate: if (!!log->ubuf != !!log->len_total) return -EFAULT; - if (bpf_vlog_truncated(log)) + /* did truncation actually happen? */ + if (log->ubuf && log->len_max > log->len_total) return -ENOSPC; return 0; -- cgit v1.2.3-58-ga151 From 91f2dc6838c19342f7f2993627c622835cc24890 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Mon, 10 Apr 2023 16:59:07 +0800 Subject: bpf/btf: Fix is_int_ptr() When tracing a kernel function with arg type is u32*, btf_ctx_access() would report error: arg2 type INT is not a struct. The commit bb6728d75611 ("bpf: Allow access to int pointer arguments in tracing programs") added support for int pointer, but did not skip modifiers before checking it's type. This patch fixes it. Fixes: bb6728d75611 ("bpf: Allow access to int pointer arguments in tracing programs") Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Feng Zhou Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20230410085908.98493-2-zhoufeng.zf@bytedance.com --- kernel/bpf/btf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ffc31a1c84af..913b9d717a4a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5921,12 +5921,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { - /* t comes in already as a pointer */ - t = btf_type_by_id(btf, t->type); - - /* allow const */ - if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) - t = btf_type_by_id(btf, t->type); + /* skip modifiers */ + t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_int(t); } -- cgit v1.2.3-58-ga151 From 10fd5f70c397782a97f411f25bfb312ea92b55bc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 12 Apr 2023 10:12:52 -0700 Subject: bpf: Handle NULL in bpf_local_storage_free. 
During OOM bpf_local_storage_alloc() may fail to allocate 'storage' and call to bpf_local_storage_free() with NULL pointer will cause a crash like: [ 271718.917646] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 271719.019620] RIP: 0010:call_rcu+0x2d/0x240 [ 271719.216274] bpf_local_storage_alloc+0x19e/0x1e0 [ 271719.250121] bpf_local_storage_update+0x33b/0x740 Fixes: 7e30a8477b0b ("bpf: Add bpf_local_storage_free()") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230412171252.15635-1-alexei.starovoitov@gmail.com --- kernel/bpf/bpf_local_storage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dab2ff4c99d9..47d9948d768f 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -157,6 +157,9 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool bpf_ma, bool reuse_now) { + if (!local_storage) + return; + if (!bpf_ma) { __bpf_local_storage_free(local_storage, reuse_now); return; -- cgit v1.2.3-58-ga151 From 1d71283987c729dceccce834a864c27301ba155e Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 10 Apr 2023 23:16:31 -0500 Subject: bpf: Make bpf_cgroup_acquire() KF_RCU | KF_RET_NULL struct cgroup is already an RCU-safe type in the verifier. We can therefore update bpf_cgroup_acquire() to be KF_RCU | KF_RET_NULL, and subsequently remove bpf_cgroup_kptr_get(). This patch does the first of these by updating bpf_cgroup_acquire() to be KF_RCU | KF_RET_NULL, and also updates selftests accordingly. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230411041633.179404-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 5 ++-- .../selftests/bpf/progs/cgrp_kfunc_common.h | 5 ++++ .../selftests/bpf/progs/cgrp_kfunc_failure.c | 35 ++++++++++++++++++---- .../selftests/bpf/progs/cgrp_kfunc_success.c | 5 +++- 4 files changed, 40 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b6a5cda5bb59..71f0604bdc97 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2037,8 +2037,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p) */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { - cgroup_get(cgrp); - return cgrp; + return cgroup_tryget(cgrp) ? 
cgrp : NULL; } /** @@ -2314,7 +2313,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_add) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index d0b7cd0d09d7..b0e279f4652b 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -61,6 +61,11 @@ static inline int cgrps_kfunc_map_insert(struct cgroup *cgrp) } acquired = bpf_cgroup_acquire(cgrp); + if (!acquired) { + bpf_map_delete_elem(&__cgrps_kfunc_map, &id); + return -ENOENT; + } + old = bpf_kptr_xchg(&v->cgrp, acquired); if (old) { bpf_cgroup_release(old); diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 48b2034cadb3..49347f12de39 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -41,6 +41,23 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path /* Can't invoke bpf_cgroup_acquire() on an untrusted pointer. */ acquired = bpf_cgroup_acquire(v->cgrp); + if (acquired) + bpf_cgroup_release(acquired); + + return 0; +} + +SEC("tp_btf/cgroup_mkdir") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path) +{ + struct cgroup *acquired; + + acquired = bpf_cgroup_acquire(cgrp); + /* + * Can't invoke bpf_cgroup_release() without checking the return value + * of bpf_cgroup_acquire(). + */ bpf_cgroup_release(acquired); return 0; @@ -54,7 +71,8 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path) /* Can't invoke bpf_cgroup_acquire() on a random frame pointer. */ acquired = bpf_cgroup_acquire((struct cgroup *)&stack_cgrp); - bpf_cgroup_release(acquired); + if (acquired) + bpf_cgroup_release(acquired); return 0; } @@ -67,7 +85,8 @@ int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp) /* Can't acquire an untrusted struct cgroup * pointer. */ acquired = bpf_cgroup_acquire(cgrp); - bpf_cgroup_release(acquired); + if (acquired) + bpf_cgroup_release(acquired); return 0; } @@ -80,7 +99,8 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char /* Can't invoke bpf_cgroup_acquire() on a pointer obtained from walking a trusted cgroup. */ acquired = bpf_cgroup_acquire(cgrp->old_dom_cgrp); - bpf_cgroup_release(acquired); + if (acquired) + bpf_cgroup_release(acquired); return 0; } @@ -93,9 +113,8 @@ int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path) /* Can't invoke bpf_cgroup_acquire() on a NULL pointer. */ acquired = bpf_cgroup_acquire(NULL); - if (!acquired) - return 0; - bpf_cgroup_release(acquired); + if (acquired) + bpf_cgroup_release(acquired); return 0; } @@ -137,6 +156,8 @@ int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char * struct cgroup *kptr, *acquired; acquired = bpf_cgroup_acquire(cgrp); + if (!acquired) + return 0; /* Cannot use bpf_cgroup_kptr_get() on a non-map-value, even if the kptr was acquired. 
*/ kptr = bpf_cgroup_kptr_get(&acquired); @@ -256,6 +277,8 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) return -ENOENT; acquired = bpf_cgroup_acquire(cgrp); + if (!acquired) + return -ENOENT; old = bpf_kptr_xchg(&v->cgrp, acquired); diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c index 030aff700084..e9dbd1af05a7 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c @@ -38,7 +38,10 @@ int BPF_PROG(test_cgrp_acquire_release_argument, struct cgroup *cgrp, const char return 0; acquired = bpf_cgroup_acquire(cgrp); - bpf_cgroup_release(acquired); + if (!acquired) + err = 1; + else + bpf_cgroup_release(acquired); return 0; } -- cgit v1.2.3-58-ga151 From 6499fe6edc4fd5b91aed4d5cd84bd113e1c58d5f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 10 Apr 2023 23:16:32 -0500 Subject: bpf: Remove bpf_cgroup_kptr_get() kfunc Now that bpf_cgroup_acquire() is KF_RCU | KF_RET_NULL, bpf_cgroup_kptr_get() is redundant. Let's remove it, and update selftests to instead use bpf_cgroup_acquire() where appropriate. The next patch will update the BPF documentation to not mention bpf_cgroup_kptr_get(). Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230411041633.179404-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 32 ---------- .../selftests/bpf/progs/cgrp_kfunc_common.h | 3 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 68 +++------------------- .../selftests/bpf/progs/cgrp_kfunc_success.c | 10 ++-- 4 files changed, 14 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 71f0604bdc97..f04e60a4847f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2040,37 +2040,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) return cgroup_tryget(cgrp) ? cgrp : NULL; } -/** - * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_cgroup_release(). - * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. - */ -__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) -{ - struct cgroup *cgrp; - - rcu_read_lock(); - /* Another context could remove the cgroup from the map and release it - * at any time, including after we've done the lookup above. This is - * safe because we're in an RCU read region, so the cgroup is - * guaranteed to remain valid until at least the rcu_read_unlock() - * below. - */ - cgrp = READ_ONCE(*cgrpp); - - if (cgrp && !cgroup_tryget(cgrp)) - /* If the cgroup had been removed from the map and freed as - * described above, cgroup_tryget() will return false. The - * cgroup will be freed at some point after the current RCU gp - * has ended, so just return NULL to the user. - */ - cgrp = NULL; - rcu_read_unlock(); - - return cgrp; -} - /** * bpf_cgroup_release - Release the reference acquired on a cgroup. 
* If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to @@ -2314,7 +2283,6 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index b0e279f4652b..22914a70db54 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -21,10 +21,11 @@ struct hash_map { } __cgrps_kfunc_map SEC(".maps"); struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym; -struct cgroup *bpf_cgroup_kptr_get(struct cgroup **pp) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; static inline struct __cgrps_kfunc_map_value *cgrps_kfunc_map_value_lookup(struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 49347f12de39..0fa564a5cc5b 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -133,59 +133,6 @@ int BPF_PROG(cgrp_kfunc_acquire_unreleased, struct cgroup *cgrp, const char *pat return 0; } -SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(cgrp_kfunc_get_non_kptr_param, struct cgroup *cgrp, const char *path) -{ - struct cgroup *kptr; - - /* Cannot use bpf_cgroup_kptr_get() on a non-kptr, even on a valid cgroup. */ - kptr = bpf_cgroup_kptr_get(&cgrp); - if (!kptr) - return 0; - - bpf_cgroup_release(kptr); - - return 0; -} - -SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char *path) -{ - struct cgroup *kptr, *acquired; - - acquired = bpf_cgroup_acquire(cgrp); - if (!acquired) - return 0; - - /* Cannot use bpf_cgroup_kptr_get() on a non-map-value, even if the kptr was acquired. */ - kptr = bpf_cgroup_kptr_get(&acquired); - bpf_cgroup_release(acquired); - if (!kptr) - return 0; - - bpf_cgroup_release(kptr); - - return 0; -} - -SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 expected pointer to map value") -int BPF_PROG(cgrp_kfunc_get_null, struct cgroup *cgrp, const char *path) -{ - struct cgroup *kptr; - - /* Cannot use bpf_cgroup_kptr_get() on a NULL pointer. 
*/ - kptr = bpf_cgroup_kptr_get(NULL); - if (!kptr) - return 0; - - bpf_cgroup_release(kptr); - - return 0; -} - SEC("tp_btf/cgroup_mkdir") __failure __msg("Unreleased reference") int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path) @@ -207,8 +154,8 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Unreleased reference") -int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path) +__failure __msg("must be referenced or trusted") +int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) { struct cgroup *kptr; struct __cgrps_kfunc_map_value *v; @@ -217,11 +164,12 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path) if (!v) return 0; - kptr = bpf_cgroup_kptr_get(&v->cgrp); - if (!kptr) - return 0; - - /* Kptr acquired above is never released. */ + bpf_rcu_read_lock(); + kptr = v->cgrp; + if (kptr) + /* Can't release a cgroup kptr stored in a map. */ + bpf_cgroup_release(kptr); + bpf_rcu_read_unlock(); return 0; } diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c index e9dbd1af05a7..5354455a01be 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c @@ -126,13 +126,11 @@ int BPF_PROG(test_cgrp_get_release, struct cgroup *cgrp, const char *path) return 0; } - kptr = bpf_cgroup_kptr_get(&v->cgrp); - if (!kptr) { + bpf_rcu_read_lock(); + kptr = v->cgrp; + if (!kptr) err = 3; - return 0; - } - - bpf_cgroup_release(kptr); + bpf_rcu_read_unlock(); return 0; } -- cgit v1.2.3-58-ga151 From c11bd046485d7bf1ca200db0e7d0bdc4bafdd395 Mon Sep 17 00:00:00 2001 From: Yafang Date: Thu, 13 Apr 2023 02:52:48 +0000 Subject: bpf: Add preempt_count_{sub,add} into btf id deny list The recursion check in __bpf_prog_enter* and __bpf_prog_exit* leave preempt_count_{sub,add} unprotected. When attaching trampoline to them we get panic as follows, [ 867.843050] BUG: TASK stack guard page was hit at 0000000009d325cf (stack is 0000000046a46a15..00000000537e7b28) [ 867.843064] stack guard page: 0000 [#1] PREEMPT SMP NOPTI [ 867.843067] CPU: 8 PID: 11009 Comm: trace Kdump: loaded Not tainted 6.2.0+ #4 [ 867.843100] Call Trace: [ 867.843101] [ 867.843104] asm_exc_int3+0x3a/0x40 [ 867.843108] RIP: 0010:preempt_count_sub+0x1/0xa0 [ 867.843135] __bpf_prog_enter_recur+0x17/0x90 [ 867.843148] bpf_trampoline_6442468108_0+0x2e/0x1000 [ 867.843154] ? preempt_count_sub+0x1/0xa0 [ 867.843157] preempt_count_sub+0x5/0xa0 [ 867.843159] ? migrate_enable+0xac/0xf0 [ 867.843164] __bpf_prog_exit_recur+0x2d/0x40 [ 867.843168] bpf_trampoline_6442468108_0+0x55/0x1000 ... [ 867.843788] preempt_count_sub+0x5/0xa0 [ 867.843793] ? migrate_enable+0xac/0xf0 [ 867.843829] __bpf_prog_exit_recur+0x2d/0x40 [ 867.843837] BUG: IRQ stack guard page was hit at 0000000099bd8228 (stack is 00000000b23e2bc4..000000006d95af35) [ 867.843841] BUG: IRQ stack guard page was hit at 000000005ae07924 (stack is 00000000ffd69623..0000000014eb594c) [ 867.843843] BUG: IRQ stack guard page was hit at 00000000028320f0 (stack is 00000000034b6438..0000000078d1bcec) [ 867.843842] bpf_trampoline_6442468108_0+0x55/0x1000 ... That is because in __bpf_prog_exit_recur, the preempt_count_{sub,add} are called after prog->active is decreased. Fixing this by adding these two functions into btf ids deny list. 
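For context, a hypothetical program of the kind that used to trigger the splat above, and that the verifier now rejects at load time, would look roughly like this (sketch only, not from the patch; assumes the usual vmlinux.h/libbpf selftest setup):

/* Attaching an fentry trampoline to preempt_count_sub() recursed
 * through __bpf_prog_enter/exit and crashed before this change; with
 * preempt_count_{add,sub} in btf_id_deny the load now fails instead.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fentry/preempt_count_sub")
int BPF_PROG(on_preempt_count_sub, int val)
{
	return 0;
}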
Suggested-by: Steven Rostedt Signed-off-by: Yafang Cc: Masami Hiramatsu Cc: Steven Rostedt Cc: Jiri Olsa Acked-by: Hao Luo Link: https://lore.kernel.org/r/20230413025248.79764-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6db6de3e9ea..e3d5a7a2f428 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18582,6 +18582,10 @@ BTF_ID(func, migrate_enable) #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU BTF_ID(func, rcu_read_unlock_strict) #endif +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) +BTF_ID(func, preempt_count_add) +BTF_ID(func, preempt_count_sub) +#endif BTF_SET_END(btf_id_deny) static bool can_be_sleepable(struct bpf_prog *prog) -- cgit v1.2.3-58-ga151 From 1cf3bfc60f9836f44da951f58b6ae24680484b35 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 13 Apr 2023 01:06:32 +0200 Subject: bpf: Support 64-bit pointers to kfuncs test_ksyms_module fails to emit a kfunc call targeting a module on s390x, because the verifier stores the difference between kfunc address and __bpf_call_base in bpf_insn.imm, which is s32, and modules are roughly (1 << 42) bytes away from the kernel on s390x. Fix by keeping BTF id in bpf_insn.imm for BPF_PSEUDO_KFUNC_CALLs, and storing the absolute address in bpf_kfunc_desc. Introduce bpf_jit_supports_far_kfunc_call() in order to limit this new behavior to the s390x JIT. Otherwise other JITs need to be modified, which is not desired. Introduce bpf_get_kfunc_addr() instead of exposing both find_kfunc_desc() and struct bpf_kfunc_desc. In addition to sorting kfuncs by imm, also sort them by offset, in order to handle conflicting imms from different modules. Do this on all architectures in order to simplify code. Factor out resolving specialized kfuncs (XPD and dynptr) from fixup_kfunc_call(). This was required in the first place, because fixup_kfunc_call() uses find_kfunc_desc(), which returns a const pointer, so it's not possible to modify kfunc addr without stripping const, which is not nice. It also removes repetition of code like: if (bpf_jit_supports_far_kfunc_call()) desc->addr = func; else insn->imm = BPF_CALL_IMM(func); and separates kfunc_desc_tab fixups from kfunc_call fixups. 
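As a rough illustration (not part of the patch; the helper below is invented), the underlying problem is simply that the relative call offset no longer fits the s32 imm field once the target is roughly 2^42 bytes away, which is what the verifier's existing overflow test detects:

/* Sketch of the range check that fails for far-away kfuncs. */
#include <stdbool.h>
#include <stdint.h>

static bool fits_in_call_imm(uintptr_t kfunc_addr, uintptr_t bpf_call_base)
{
	int64_t call_imm = (int64_t)(kfunc_addr - bpf_call_base);

	/* same idea as: (unsigned long)(s32)call_imm != call_imm */
	return call_imm == (int64_t)(int32_t)call_imm;
}

JITs that declare bpf_jit_supports_far_kfunc_call() skip this relative encoding entirely and resolve the absolute address through bpf_get_kfunc_addr() instead.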
Suggested-by: Jiri Olsa Signed-off-by: Ilya Leoshkevich Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230412230632.885985-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 5 ++ include/linux/bpf.h | 10 ++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 11 ++++ kernel/bpf/verifier.c | 123 +++++++++++++++++++++++++++++-------------- 5 files changed, 110 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index d0846ba818ee..7102e4b674a0 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2001,6 +2001,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c6095bd7d69..88845aadc47d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2295,6 +2295,9 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr); + struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; @@ -2545,6 +2548,13 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return NULL; } +static inline int +bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr) +{ + return -ENOTSUPP; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5364b0c52c1d..bbce89937fde 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -920,6 +920,7 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_far_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e2d256c82072..7421487422d4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, s16 off = insn->off; s32 imm = insn->imm; u8 *addr; + int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { @@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + bpf_jit_supports_far_kfunc_call()) { + err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); + if (err) + return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base @@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_far_kfunc_call(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
*/ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e3d5a7a2f428..4aa6d715e655 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -195,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); +static void specialize_kfunc(struct bpf_verifier_env *env, + u32 func_id, u16 offset, unsigned long *addr); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -2374,6 +2376,7 @@ struct bpf_kfunc_desc { u32 func_id; s32 imm; u16 offset; + unsigned long addr; }; struct bpf_kfunc_btf { @@ -2383,6 +2386,11 @@ struct bpf_kfunc_btf { }; struct bpf_kfunc_desc_tab { + /* Sorted by func_id (BTF ID) and offset (fd_array offset) during + * verification. JITs do lookups by bpf_insn, where func_id may not be + * available, therefore at the end of verification do_misc_fixups() + * sorts this by imm and offset. + */ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; u32 nr_descs; }; @@ -2423,6 +2431,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off); } +int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr) +{ + const struct bpf_kfunc_desc *desc; + + desc = find_kfunc_desc(prog, func_id, btf_fd_idx); + if (!desc) + return -EFAULT; + + *func_addr = (u8 *)desc->addr; + return 0; +} + static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { @@ -2602,13 +2623,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) func_name); return -EINVAL; } + specialize_kfunc(env, func_id, offset, &addr); - call_imm = BPF_CALL_IMM(addr); - /* Check whether or not the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - func_name); - return -EINVAL; + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = func_id; + } else { + call_imm = BPF_CALL_IMM(addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel function %s is out of range\n", + func_name); + return -EINVAL; + } } if (bpf_dev_bound_kfunc_id(func_id)) { @@ -2621,6 +2647,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) desc->func_id = func_id; desc->imm = call_imm; desc->offset = offset; + desc->addr = addr; err = btf_distill_func_proto(&env->log, desc_btf, func_proto, func_name, &desc->func_model); @@ -2630,19 +2657,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } -static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; const struct bpf_kfunc_desc *d1 = b; - if (d0->imm > d1->imm) - return 1; - else if (d0->imm < d1->imm) - return -1; + if (d0->imm != d1->imm) + return d0->imm < d1->imm ? -1 : 1; + if (d0->offset != d1->offset) + return d0->offset < d1->offset ? 
-1 : 1; return 0; } -static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) { struct bpf_kfunc_desc_tab *tab; @@ -2651,7 +2678,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) return; sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_imm, NULL); + kfunc_desc_cmp_by_imm_off, NULL); } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -2665,13 +2692,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { const struct bpf_kfunc_desc desc = { .imm = insn->imm, + .offset = insn->off, }; const struct bpf_kfunc_desc *res; struct bpf_kfunc_desc_tab *tab; tab = prog->aux->kfunc_tab; res = bsearch(&desc, tab->descs, tab->nr_descs, - sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off); return res ? &res->func_model : NULL; } @@ -17293,11 +17321,45 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } +/* replace a generic kfunc with a specialized version if necessary */ +static void specialize_kfunc(struct bpf_verifier_env *env, + u32 func_id, u16 offset, unsigned long *addr) +{ + struct bpf_prog *prog = env->prog; + bool seen_direct_write; + void *xdp_kfunc; + bool is_rdonly; + + if (bpf_dev_bound_kfunc_id(func_id)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); + if (xdp_kfunc) { + *addr = (unsigned long)xdp_kfunc; + return; + } + /* fallback to default kfunc when not supported by netdev */ + } + + if (offset) + return; + + if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + seen_direct_write = env->seen_direct_write; + is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); + + if (is_rdonly) + *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + + /* restore env->seen_direct_write to its original value, since + * may_access_direct_pkt_data mutates it + */ + env->seen_direct_write = seen_direct_write; + } +} + static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; - void *xdp_kfunc; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -17306,18 +17368,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, *cnt = 0; - if (bpf_dev_bound_kfunc_id(insn->imm)) { - xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); - if (xdp_kfunc) { - insn->imm = BPF_CALL_IMM(xdp_kfunc); - return 0; - } - - /* fallback to default kfunc when not supported by netdev */ - } - - /* insn->imm has the btf func_id. Replace it with - * an address (relative to __bpf_call_base). + /* insn->imm has the btf func_id. Replace it with an offset relative to + * __bpf_call_base, unless the JIT needs to call functions that are + * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). 
*/ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { @@ -17326,7 +17379,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - insn->imm = desc->imm; + if (!bpf_jit_supports_far_kfunc_call()) + insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) return 0; if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { @@ -17351,17 +17405,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { - bool seen_direct_write = env->seen_direct_write; - bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); - - if (is_rdonly) - insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly); - - /* restore env->seen_direct_write to its original value, since - * may_access_direct_pkt_data mutates it - */ - env->seen_direct_write = seen_direct_write; } return 0; } @@ -17891,7 +17934,7 @@ patch_call_imm: } } - sort_kfunc_descs_by_imm(env->prog); + sort_kfunc_descs_by_imm_off(env->prog); return 0; } -- cgit v1.2.3-58-ga151 From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:03 -0700 Subject: bpf: Remove btf_field_offs, use btf_record's fields instead The btf_field_offs struct contains (offset, size) for btf_record fields, sorted by offset. btf_field_offs is always used in conjunction with btf_record, which has btf_field 'fields' array with (offset, type), the latter of which btf_field_offs' size is derived from via btf_field_type_size. This patch adds a size field to struct btf_field and sorts btf_record's fields by offset, making it possible to get rid of btf_field_offs. Less data duplication and less code complexity results. Since btf_field_offs' lifetime closely followed the btf_record used to populate it, most complexity wins are from removal of initialization code like: if (btf_record_successfully_initialized) { foffs = btf_parse_field_offs(rec); if (IS_ERR_OR_NULL(foffs)) // free the btf_record and return err } Other changes in this patch are pretty mechanical: * foffs->field_off[i] -> rec->fields[i].offset * foffs->field_sz[i] -> rec->fields[i].size * Sort rec->fields in btf_parse_fields before returning * It's possible that this is necessary independently of other changes in this patch. btf_record_find in syscall.c expects btf_record's fields to be sorted by offset, yet there's no explicit sorting of them before this patch, record's fields are populated in the order they're read from BTF struct definition. BTF docs don't say anything about the sortedness of struct fields. * All functions taking struct btf_field_offs * input now instead take struct btf_record *. All callsites of these functions already have access to the correct btf_record. 
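To make the mechanical rewrite concrete, here is the before/after shape of bpf_obj_init
(a sketch condensed from the include/linux/bpf.h hunk below; only the iteration source
changes, not the behavior):

	/* before: parallel arrays in the separate btf_field_offs */
	for (i = 0; i < foffs->cnt; i++)
		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);

	/* after: offset and size live directly in the offset-sorted btf_record */
	for (i = 0; i < rec->cnt; i++)
		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);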
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 44 ++++++++++------------- include/linux/btf.h | 2 -- kernel/bpf/btf.c | 93 +++++++++++-------------------------------------- kernel/bpf/helpers.c | 2 +- kernel/bpf/map_in_map.c | 15 -------- kernel/bpf/syscall.c | 17 ++------- 6 files changed, 43 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88845aadc47d..7888ed497432 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -210,6 +210,7 @@ struct btf_field_graph_root { struct btf_field { u32 offset; + u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; @@ -225,12 +226,6 @@ struct btf_record { struct btf_field fields[]; }; -struct btf_field_offs { - u32 cnt; - u32 field_off[BTF_FIELDS_MAX]; - u8 field_sz[BTF_FIELDS_MAX]; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -257,7 +252,6 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -360,14 +354,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) +static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; - if (!foffs) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < foffs->cnt; i++) - memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); + for (i = 0; i < rec->cnt; i++) + memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); } /* 'dst' must be a temporary buffer and should not point to memory that is being @@ -379,7 +373,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - bpf_obj_init(map->field_offs, dst); + bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and @@ -399,14 +393,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. 
*/ -static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, +static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else @@ -414,49 +408,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); + bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); + bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } -static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) +static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { - bpf_obj_memzero(map->field_offs, dst, map->value_size); + bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 495250162422..813227bff58a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc { struct btf_struct_meta { u32 btf_id; struct btf_record *record; - struct btf_field_offs *field_offs; }; struct btf_struct_metas { @@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); -struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c2d1fb9f410..f3c998feeccb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1666,10 +1666,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab) if (!tab) return; - for (i = 0; i < tab->cnt; i++) { + for (i = 0; i < tab->cnt; i++) btf_record_free(tab->types[i].record); - kfree(tab->types[i].field_offs); - } kfree(tab); } @@ -3700,12 +3698,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, __alignof__(struct bpf_rb_node)); } +static int btf_field_cmp(const void *_a, const void 
*_b, const void *priv) +{ + const struct btf_field *a = (const struct btf_field *)_a; + const struct btf_field *b = (const struct btf_field *)_b; + + if (a->offset < b->offset) + return -1; + else if (a->offset > b->offset) + return 1; + return 0; +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; + u32 next_off = 0, field_type_size; struct btf_record *rec; - u32 next_off = 0; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); @@ -3725,7 +3735,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; for (i = 0; i < cnt; i++) { - if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + field_type_size = btf_field_type_size(info_arr[i].type); + if (info_arr[i].off + field_type_size > value_size) { WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); ret = -EFAULT; goto end; @@ -3734,11 +3745,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type ret = -EEXIST; goto end; } - next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); + next_off = info_arr[i].off + field_type_size; rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; + rec->fields[i].size = field_type_size; switch (info_arr[i].type) { case BPF_SPIN_LOCK: @@ -3808,6 +3820,9 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type goto end; } + sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp, + NULL, rec); + return rec; end: btf_record_free(rec); @@ -3889,61 +3904,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) return 0; } -static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) -{ - struct btf_field_offs *foffs = (void *)priv; - u32 *off_base = foffs->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = foffs->field_sz + (a - off_base); - sz_b = foffs->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) -{ - struct btf_field_offs *foffs; - u32 i, *off; - u8 *sz; - - BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); - if (IS_ERR_OR_NULL(rec)) - return NULL; - - foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); - if (!foffs) - return ERR_PTR(-ENOMEM); - - off = foffs->field_off; - sz = foffs->field_sz; - for (i = 0; i < rec->cnt; i++) { - off[i] = rec->fields[i].offset; - sz[i] = btf_field_type_size(rec->fields[i].type); - } - foffs->cnt = rec->cnt; - - if (foffs->cnt == 1) - return foffs; - sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), - btf_field_offs_cmp, btf_field_offs_swap, foffs); - return foffs; -} - static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) @@ -5386,7 +5346,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) for (i = 1; i < n; i++) { struct btf_struct_metas *new_tab; const struct btf_member *member; - struct btf_field_offs *foffs; 
struct btf_struct_meta *type; struct btf_record *record; const struct btf_type *t; @@ -5428,17 +5387,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; goto free; } - foffs = btf_parse_field_offs(record); - /* We need the field_offs to be valid for a valid record, - * either both should be set or both should be unset. - */ - if (IS_ERR_OR_NULL(foffs)) { - btf_record_free(record); - ret = -EFAULT; - goto free; - } type->record = record; - type->field_offs = foffs; tab->cnt++; } return tab; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f04e60a4847f..e989b6460147 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1893,7 +1893,7 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) if (!p) return NULL; if (meta) - bpf_obj_init(meta->field_offs, p); + bpf_obj_init(meta->record, p); return p; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 38136ec4e095..2c5c64c2a53b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) ret = PTR_ERR(inner_map_meta->record); goto free; } - if (inner_map_meta->record) { - struct btf_field_offs *field_offs; - /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always - * valid. - */ - field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN); - if (!field_offs) { - ret = -ENOMEM; - goto free_rec; - } - inner_map_meta->field_offs = field_offs; - } /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like * record->fields.list_head have pointers like value_rec pointing into @@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) fdput(f); return inner_map_meta; -free_rec: - btf_record_free(inner_map_meta->record); free: kfree(inner_map_meta); put: @@ -99,7 +85,6 @@ put: void bpf_map_meta_free(struct bpf_map *map_meta) { - kfree(map_meta->field_offs); bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d575505f89c..c08b7933bf8f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -717,14 +717,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct btf_field_offs *foffs = map->field_offs; struct btf_record *rec = map->record; security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - /* Delay freeing of field_offs and btf_record for maps, as map_free + /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * @@ -733,7 +732,6 @@ static void bpf_map_free_deferred(struct work_struct *work) * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. 
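For example, a program-defined local kptr type could embed the new opaque field like
this (a hypothetical BPF-side struct; the verifier caches the field's offset in
btf_record->refcount_off):

	struct node_data {
		long key;
		struct bpf_rb_node node;
		struct bpf_refcount ref;	/* opaque, sized to hold a refcount_t */
	};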
*/ - kfree(foffs); btf_record_free(rec); } @@ -1125,7 +1123,6 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1205,17 +1202,9 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - - foffs = btf_parse_field_offs(map->record); - if (IS_ERR(foffs)) { - err = PTR_ERR(foffs); - goto free_map; - } - map->field_offs = foffs; - err = security_bpf_map_alloc(map); if (err) - goto free_map_field_offs; + goto free_map; err = bpf_map_alloc_id(map); if (err) @@ -1239,8 +1228,6 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_field_offs: - kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); -- cgit v1.2.3-58-ga151 From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:04 -0700 Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types meant for use in BPF programs. Similarly to other opaque types like bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in user-defined struct types a bpf_refcount can be located, so necessary btf_record plumbing is added to enable this. bpf_refcount is sized to hold a refcount_t. Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in btf_record as refcount_off in addition to being in the field array. Caching refcount_off makes sense for this field because further patches in the series will modify functions that take local kptrs (e.g. bpf_obj_drop) to change their behavior if the type they're operating on is refcounted. So enabling fast "is this type refcounted?" checks is desirable. No such verifier behavior changes are introduced in this patch, just logic to recognize 'struct bpf_refcount' in btf_record. 
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/btf.c | 12 +++++++++++- kernel/bpf/syscall.c | 6 +++++- tools/include/uapi/linux/bpf.h | 4 ++++ 5 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7888ed497432..be44d765b7a4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,7 @@ enum btf_field_type { BPF_RB_NODE = (1 << 7), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, + BPF_REFCOUNT = (1 << 8), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -223,6 +224,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int refcount_off; struct btf_field fields[]; }; @@ -293,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; + case BPF_REFCOUNT: + return "bpf_refcount"; default: WARN_ON_ONCE(1); return "unknown"; @@ -317,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); + case BPF_REFCOUNT: + return sizeof(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; @@ -341,6 +347,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); + case BPF_REFCOUNT: + return __alignof__(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3823100b7934..4b20a7269bee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6985,6 +6985,10 @@ struct bpf_rb_node { __u64 :64; } __attribute__((aligned(8))); +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f3c998feeccb..14889fd5ba8e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3391,6 +3391,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); + field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { @@ -3439,6 +3440,7 @@ static int btf_find_struct_field(const struct btf *btf, case BPF_TIMER: case BPF_LIST_NODE: case BPF_RB_NODE: + case BPF_REFCOUNT: ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3504,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, case BPF_TIMER: case BPF_LIST_NODE: case BPF_RB_NODE: + case BPF_REFCOUNT: ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? 
&info[idx] : &tmp); if (ret < 0) @@ -3734,6 +3737,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -3763,6 +3767,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_REFCOUNT: + WARN_ON_ONCE(rec->refcount_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->refcount_off = rec->fields[i].offset; + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); @@ -5308,6 +5317,7 @@ static const char *alloc_obj_fields[] = { "bpf_list_node", "bpf_rb_root", "bpf_rb_node", + "bpf_refcount", }; static struct btf_struct_metas * @@ -5381,7 +5391,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | - BPF_RB_ROOT | BPF_RB_NODE, t->size); + BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c08b7933bf8f..28eac7434d32 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -552,6 +552,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_REFCOUNT: /* Nothing to release */ break; default: @@ -599,6 +600,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_REFCOUNT: /* Nothing to acquire */ break; default: @@ -705,6 +707,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) break; case BPF_LIST_NODE: case BPF_RB_NODE: + case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); @@ -1032,7 +1035,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT, + BPF_RB_ROOT | BPF_REFCOUNT, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1071,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3823100b7934..4b20a7269bee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6985,6 +6985,10 @@ struct bpf_rb_node { __u64 :64; } __attribute__((aligned(8))); +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3-58-ga151 From 1512217c47f0e8ea076dd0e67262e5a668a78f01 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:05 -0700 Subject: bpf: Support refcounted local kptrs in existing semantics A local kptr is considered 'refcounted' when it is of a type that has a bpf_refcount field. 
When such a kptr is created, its refcount should be initialized to 1; when destroyed, the object should be free'd only if a refcount decr results in 0 refcount. Existing logic always frees the underlying memory when destroying a local kptr, and 0-initializes all btf_record fields. This patch adds checks for "is local kptr refcounted?" and new logic for that case in the appropriate places. This patch focuses on changing existing semantics and thus conspicuously does _not_ provide a way for BPF programs in increment refcount. That follows later in the series. __bpf_obj_drop_impl is modified to do the right thing when it sees a refcounted type. Container types for graph nodes (list, tree, stashed in map) are migrated to use __bpf_obj_drop_impl as a destructor for their nodes instead of each having custom destruction code in their _free paths. Now that "drop" isn't a synonym for "free" when the type is refcounted it makes sense to centralize this logic. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/helpers.c | 21 +++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be44d765b7a4..b065de2cfcea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -370,6 +370,9 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) return; for (i = 0; i < rec->cnt; i++) memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); + + if (rec->refcount_off >= 0) + refcount_set((refcount_t *)(obj + rec->refcount_off), 1); } /* 'dst' must be a temporary buffer and should not point to memory that is being diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e989b6460147..e2dbd9644e5c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1798,6 +1798,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); + void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { @@ -1828,13 +1830,8 @@ unlock: /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ - bpf_obj_free_fields(field->graph_root.value_rec, obj); - /* bpf_mem_free requires migrate_disable(), since we can be - * called from map free path as well apart from BPF program (as - * part of map ops doing bpf_obj_free_fields). - */ migrate_disable(); - bpf_mem_free(&bpf_global_ma, obj); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec); migrate_enable(); } } @@ -1871,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, obj = pos; obj -= field->graph_root.node_offset; - bpf_obj_free_fields(field->graph_root.value_rec, obj); migrate_disable(); - bpf_mem_free(&bpf_global_ma, obj); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec); migrate_enable(); } } @@ -1897,8 +1893,17 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +/* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) { + if (rec && rec->refcount_off >= 0 && + !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { + /* Object is refcounted and refcount_dec didn't result in 0 + * refcount. 
Return without freeing the object + */ + return; + } + if (rec) bpf_obj_free_fields(rec, p); bpf_mem_free(&bpf_global_ma, p); -- cgit v1.2.3-58-ga151 From 7c50b1cb76aca4540aa917db5f2a302acddcadff Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:06 -0700 Subject: bpf: Add bpf_refcount_acquire kfunc Currently, BPF programs can interact with the lifetime of refcounted local kptrs in the following ways: bpf_obj_new - Initialize refcount to 1 as part of new object creation bpf_obj_drop - Decrement refcount and free object if it's 0 collection add - Pass ownership to the collection. No change to refcount but collection is responsible for bpf_obj_dropping it In order to be able to add a refcounted local kptr to multiple collections we need to be able to increment the refcount and acquire a new owning reference. This patch adds a kfunc, bpf_refcount_acquire, implementing such an operation. bpf_refcount_acquire takes a refcounted local kptr and returns a new owning reference to the same underlying memory as the input. The input can be either owning or non-owning. To reinforce why this is safe, consider the following code snippets: struct node *n = bpf_obj_new(typeof(*n)); // A struct node *m = bpf_refcount_acquire(n); // B In the above snippet, n will be alive with refcount=1 after (A), and since nothing changes that state before (B), it's obviously safe. If n is instead added to some rbtree, we can still safely refcount_acquire it: struct node *n = bpf_obj_new(typeof(*n)); struct node *m; bpf_spin_lock(&glock); bpf_rbtree_add(&groot, &n->node, less); // A m = bpf_refcount_acquire(n); // B bpf_spin_unlock(&glock); In the above snippet, after (A) n is a non-owning reference, and after (B) m is an owning reference pointing to the same memory as n. Although n has no ownership of that memory's lifetime, it's guaranteed to be alive until the end of the critical section, and n would be clobbered if we were past the end of the critical section, so it's safe to bump refcount. Implementation details: * From verifier's perspective, bpf_refcount_acquire handling is similar to bpf_obj_new and bpf_obj_drop. Like the former, it returns a new owning reference matching input type, although like the latter, type can be inferred from concrete kptr input. Verifier changes in {check,fixup}_kfunc_call and check_kfunc_args are largely copied from aforementioned functions' verifier changes. * An exception to the above is the new KF_ARG_PTR_TO_REFCOUNTED_KPTR arg, indicated by new "__refcounted_kptr" kfunc arg suffix. This is necessary in order to handle both owning and non-owning input without adding special-casing to "__alloc" arg handling. Also a convenient place to confirm that input type has bpf_refcount field. * The implemented kfunc is actually bpf_refcount_acquire_impl, with 'hidden' second arg that the verifier sets to the type's struct_meta in fixup_kfunc_call. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 15 ++++++ kernel/bpf/verifier.c | 74 ++++++++++++++++++++++---- tools/testing/selftests/bpf/bpf_experimental.h | 13 +++++ 3 files changed, 91 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e2dbd9644e5c..57ff8a60222c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1917,6 +1917,20 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) __bpf_obj_drop_impl(p, meta ? 
meta->record : NULL); } +__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + struct bpf_refcount *ref; + + /* Could just cast directly to refcount_t *, but need some code using + * bpf_refcount type so that it is emitted in vmlinux BTF + */ + ref = (struct bpf_refcount *)p__refcounted_kptr + meta->record->refcount_off; + + refcount_inc((refcount_t *)ref); + return (void *)p__refcounted_kptr; +} + static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) { struct list_head *n = (void *)node, *h = (void *)head; @@ -2276,6 +2290,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_list_push_front) BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4aa6d715e655..29e106f7ccaa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -273,6 +273,11 @@ struct bpf_call_arg_meta { struct btf_field *kptr_field; }; +struct btf_and_id { + struct btf *btf; + u32 btf_id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -291,10 +296,10 @@ struct bpf_kfunc_call_arg_meta { u64 value; bool found; } arg_constant; - struct { - struct btf *btf; - u32 btf_id; - } arg_obj_drop; + union { + struct btf_and_id arg_obj_drop; + struct btf_and_id arg_refcount_acquire; + }; struct { struct btf_field *field; } arg_list_head; @@ -9403,6 +9408,11 @@ static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *a return __kfunc_param_match_suffix(btf, arg, "__uninit"); } +static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -9542,15 +9552,16 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CTX, - KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ - KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ + KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ + KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */ + KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ KF_ARG_PTR_TO_DYNPTR, KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, KF_ARG_PTR_TO_LIST_NODE, - KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ + KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ KF_ARG_PTR_TO_MEM, - KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ + KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, @@ -9559,6 +9570,7 @@ enum kfunc_ptr_arg_type { enum special_kfunc_type { KF_bpf_obj_new_impl, KF_bpf_obj_drop_impl, + KF_bpf_refcount_acquire_impl, KF_bpf_list_push_front, KF_bpf_list_push_back, KF_bpf_list_pop_front, @@ -9579,6 +9591,7 @@ enum special_kfunc_type { BTF_SET_START(special_kfunc_set) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_refcount_acquire_impl) BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) @@ -9597,6 +9610,7 @@ BTF_SET_END(special_kfunc_set) 
BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_refcount_acquire_impl) BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) @@ -9649,6 +9663,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; + if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_REFCOUNTED_KPTR; + if (is_kfunc_arg_kptr_get(meta, argno)) { if (!btf_type_is_ptr(ref_t)) { verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); @@ -9952,7 +9969,8 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id) static bool is_bpf_graph_api_kfunc(u32 btf_id) { - return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); + return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || + btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -10171,6 +10189,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ const char *func_name = meta->func_name, *ref_tname; const struct btf *btf = meta->btf; const struct btf_param *args; + struct btf_record *rec; u32 i, nargs; int ret; @@ -10306,6 +10325,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: + case KF_ARG_PTR_TO_REFCOUNTED_KPTR: /* Trusted by default */ break; default: @@ -10523,6 +10543,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: meta->subprogno = reg->subprogno; break; + case KF_ARG_PTR_TO_REFCOUNTED_KPTR: + if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) { + verbose(env, "arg#%d is neither owning or non-owning ref\n", i); + return -EINVAL; + } + + rec = reg_btf_record(reg); + if (!rec) { + verbose(env, "verifier internal error: Couldn't find btf_record\n"); + return -EFAULT; + } + + if (rec->refcount_off < 0) { + verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); + return -EINVAL; + } + + meta->arg_refcount_acquire.btf = reg->btf; + meta->arg_refcount_acquire.btf_id = reg->btf_id; + break; } } @@ -10699,7 +10739,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { /* Only exception is bpf_obj_new_impl */ - if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { + if (meta.btf != btf_vmlinux || + (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && + meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -10747,6 +10789,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->obj_new_size = ret_t->size; insn_aux->kptr_struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf; + regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta.arg_refcount_acquire.btf, + meta.arg_refcount_acquire.btf_id); } else if 
(meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -17393,7 +17444,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[2] = addr[1]; insn_buf[3] = *insn; *cnt = 4; - } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index dbd2c729781a..619afcab2ab0 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -37,6 +37,19 @@ extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; /* Convenience macro to wrap over bpf_obj_drop_impl */ #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) +/* Description + * Increment the refcount on a refcounted local kptr, turning the + * non-owning reference input into an owning reference in the process. + * + * The 'meta' parameter is a hidden argument that is ignored. + * Returns + * An owning reference to the object pointed to by 'kptr' + */ +extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_refcount_acquire_impl */ +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + /* Description * Add a new entry to the beginning of the BPF linked list. * Returns -- cgit v1.2.3-58-ga151 From d2dcc67df910dd85253a701b6a5b747f955d28f5 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:07 -0700 Subject: bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail Consider this code snippet: struct node { long key; bpf_list_node l; bpf_rb_node r; bpf_refcount ref; } int some_bpf_prog(void *ctx) { struct node *n = bpf_obj_new(/*...*/), *m; bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->r, /* ... */); m = bpf_refcount_acquire(n); bpf_rbtree_add(&other_tree, &m->r, /* ... */); bpf_spin_unlock(&glock); /* ... */ } After bpf_refcount_acquire, n and m point to the same underlying memory, and that node's bpf_rb_node field is being used by the some_tree insert, so overwriting it as a result of the second insert is an error. In order to properly support refcounted nodes, the rbtree and list insert functions must be allowed to fail. This patch adds such support. The kfuncs bpf_rbtree_add, bpf_list_push_{front,back} are modified to return an int indicating success/failure, with 0 -> success, nonzero -> failure. bpf_obj_drop on failure ======================= Currently the only reason an insert can fail is the example above: the bpf_{list,rb}_node is already in use. When such a failure occurs, the insert kfuncs will bpf_obj_drop the input node. This allows the insert operations to logically fail without changing their verifier owning ref behavior, namely the unconditional release_reference of the input owning ref. With insert that always succeeds, ownership of the node is always passed to the collection, since the node always ends up in the collection. 
With a possibly-failed insert w/ bpf_obj_drop, ownership of the node is always passed either to the collection (success), or to bpf_obj_drop (failure). Regardless, it's correct to continue unconditionally releasing the input owning ref, as something is always taking ownership from the calling program on insert. Keeping owning ref behavior unchanged results in a nice default UX for insert functions that can fail. If the program's reaction to a failed insert is "fine, just get rid of this owning ref for me and let me go on with my business", then there's no reason to check for failure since that's default behavior. e.g.: long important_failures = 0; int some_bpf_prog(void *ctx) { struct node *n, *m, *o; /* all bpf_obj_new'd */ bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->node, /* ... */); bpf_rbtree_add(&some_tree, &m->node, /* ... */); if (bpf_rbtree_add(&some_tree, &o->node, /* ... */)) { important_failures++; } bpf_spin_unlock(&glock); } If we instead chose to pass ownership back to the program on failed insert - by returning NULL on success or an owning ref on failure - programs would always have to do something with the returned ref on failure. The most likely action is probably "I'll just get rid of this owning ref and go about my business", which ideally would look like: if (n = bpf_rbtree_add(&some_tree, &n->node, /* ... */)) bpf_obj_drop(n); But bpf_obj_drop isn't allowed in a critical section and inserts must occur within one, so in reality error handling would become a hard-to-parse mess. For refcounted nodes, we can replicate the "pass ownership back to program on failure" logic with this patch's semantics, albeit in an ugly way: struct node *n = bpf_obj_new(/* ... */), *m; bpf_spin_lock(&glock); m = bpf_refcount_acquire(n); if (bpf_rbtree_add(&some_tree, &n->node, /* ... */)) { /* Do something with m */ } bpf_spin_unlock(&glock); bpf_obj_drop(m); bpf_refcount_acquire is used to simulate "return owning ref on failure". This should be an uncommon occurrence, though. Addition of two verifier-fixup'd args to collection inserts =========================================================== The actual bpf_obj_drop kfunc is bpf_obj_drop_impl(void *, struct btf_struct_meta *), with bpf_obj_drop macro populating the second arg with 0 and the verifier later filling in the arg during insn fixup. Because bpf_rbtree_add and bpf_list_push_{front,back} now might do bpf_obj_drop, these kfuncs need a btf_struct_meta parameter that can be passed to bpf_obj_drop_impl. Similarly, because the 'node' param to those insert functions is the bpf_{list,rb}_node within the node type, and bpf_obj_drop expects a pointer to the beginning of the node, the insert functions need to be able to find the beginning of the node struct. A second verifier-populated param is necessary: the offset of {list,rb}_node within the node type. These two new params allow the insert kfuncs to correctly call __bpf_obj_drop_impl: beginning_of_node = bpf_rb_node_ptr - offset if (already_inserted) __bpf_obj_drop_impl(beginning_of_node, btf_struct_meta->record); Similarly to other kfuncs with "hidden" verifier-populated params, the insert functions are renamed with _impl prefix and a macro is provided for common usage. For example, bpf_rbtree_add kfunc is now bpf_rbtree_add_impl and bpf_rbtree_add is now a macro which sets "hidden" args to 0. Due to the two new args BPF progs will need to be recompiled to work with the new _impl kfuncs. 
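A sketch of the resulting wrapper pattern as seen from a BPF program (the real
declarations live in tools/testing/selftests/bpf/bpf_experimental.h, which this series
updates; the assumption here is that the convenience macro simply passes NULL and 0 for
the two hidden arguments, which the verifier overwrites during insn fixup):

	extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
				       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
				       void *meta, __u64 off) __ksym;

	/* Convenience macro: meta/off are placeholders rewritten in fixup_kfunc_call */
	#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)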
This patch also rewrites the "hidden argument" explanation to more directly say why the BPF program writer doesn't need to populate the arguments with anything meaningful. How does this new logic affect non-owning references? ===================================================== Currently, non-owning refs are valid until the end of the critical section in which they're created. We can make this guarantee because, if a non-owning ref exists, the referent was added to some collection. The collection will drop() its nodes when it goes away, but it can't go away while our program is accessing it, so that's not a problem. If the referent is removed from the collection in the same CS that it was added in, it can't be bpf_obj_drop'd until after CS end. Those are the only two ways to free the referent's memory and neither can happen until after the non-owning ref's lifetime ends. On first glance, having these collection insert functions potentially bpf_obj_drop their input seems like it breaks the "can't be bpf_obj_drop'd until after CS end" line of reasoning. But we care about the memory not being _freed_ until end of CS end, and a previous patch in the series modified bpf_obj_drop such that it doesn't free refcounted nodes until refcount == 0. So the statement can be more accurately rewritten as "can't be free'd until after CS end". We can prove that this rewritten statement holds for any non-owning reference produced by collection insert functions: * If the input to the insert function is _not_ refcounted * We have an owning reference to the input, and can conclude it isn't in any collection * Inserting a node in a collection turns owning refs into non-owning, and since our input type isn't refcounted, there's no way to obtain additional owning refs to the same underlying memory * Because our node isn't in any collection, the insert operation cannot fail, so bpf_obj_drop will not execute * If bpf_obj_drop is guaranteed not to execute, there's no risk of memory being free'd * Otherwise, the input to the insert function is refcounted * If the insert operation fails due to the node's list_head or rb_root already being in some collection, there was some previous successful insert which passed refcount to the collection * We have an owning reference to the input, it must have been acquired via bpf_refcount_acquire, which bumped the refcount * refcount must be >= 2 since there's a valid owning reference and the node is already in a collection * Insert triggering bpf_obj_drop will decr refcount to >= 1, never resulting in a free So although we may do bpf_obj_drop during the critical section, this will never result in memory being free'd, and no changes to non-owning ref logic are needed in this patch. 
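Putting this together, a short sketch of the refcounted failure path argued above
(reusing the node layout and placeholder names from the earlier snippets; error
handling elided):

	struct node *n = bpf_obj_new(typeof(*n)), *m;	/* refcount == 1, n owning       */

	bpf_spin_lock(&glock);
	bpf_rbtree_add(&some_tree, &n->r, less);	/* ownership moves, n non-owning */
	m = bpf_refcount_acquire(n);			/* refcount == 2, m owning       */
	if (bpf_rbtree_add(&other_tree, &m->r, less)) {
		/* n->r is already in use by some_tree: the kfunc bpf_obj_drops the
		 * node internally, refcount drops back to 1, nothing is freed
		 */
	}
	bpf_spin_unlock(&glock);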
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++- kernel/bpf/helpers.c | 65 +++++++++++++++------ kernel/bpf/verifier.c | 78 ++++++++++++++++++-------- tools/testing/selftests/bpf/bpf_experimental.h | 49 ++++++++++++---- 4 files changed, 148 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f03852b89d28..3dd29a53b711 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -464,7 +464,12 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; - u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + union { + /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + u64 obj_new_size; + /* remember the offset of node field within type to rewrite */ + u64 insert_off; + }; struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 57ff8a60222c..5067f8d46872 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1931,7 +1931,8 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta return (void *)p__refcounted_kptr; } -static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) +static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, + bool tail, struct btf_record *rec, u64 off) { struct list_head *n = (void *)node, *h = (void *)head; @@ -1939,17 +1940,35 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea INIT_LIST_HEAD(h); if (unlikely(!n->next)) INIT_LIST_HEAD(n); + if (!list_empty(n)) { + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl(n - off, rec); + return -EINVAL; + } + tail ? list_add_tail(n, h) : list_add(n, h); + + return 0; } -__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta__ign, u64 off) { - return __bpf_list_add(node, head, false); + struct btf_struct_meta *meta = meta__ign; + + return __bpf_list_add(node, head, false, + meta ? meta->record : NULL, off); } -__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta__ign, u64 off) { - return __bpf_list_add(node, head, true); + struct btf_struct_meta *meta = meta__ign; + + return __bpf_list_add(node, head, true, + meta ? 
meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) @@ -1989,14 +2008,23 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ -static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, - void *less) +static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; + struct rb_node *parent = NULL, *n = (struct rb_node *)node; bpf_callback_t cb = (bpf_callback_t)less; - struct rb_node *parent = NULL; bool leftmost = true; + if (!n->__rb_parent_color) + RB_CLEAR_NODE(n); + + if (!RB_EMPTY_NODE(n)) { + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl(n - off, rec); + return -EINVAL; + } + while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { @@ -2007,15 +2035,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, } } - rb_link_node((struct rb_node *)node, parent, link); - rb_insert_color_cached((struct rb_node *)node, - (struct rb_root_cached *)root, leftmost); + rb_link_node(n, parent, link); + rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); + return 0; } -__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, - bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) +__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta__ign, u64 off) { - __bpf_rbtree_add(root, node, (void *)less); + struct btf_struct_meta *meta = meta__ign; + + return __bpf_rbtree_add(root, node, (void *)less, meta ? 
meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) @@ -2291,14 +2322,14 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) -BTF_ID_FLAGS(func, bpf_list_push_front) -BTF_ID_FLAGS(func, bpf_list_push_back) +BTF_ID_FLAGS(func, bpf_list_push_front_impl) +BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) -BTF_ID_FLAGS(func, bpf_rbtree_add) +BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) #ifdef CONFIG_CGROUPS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 29e106f7ccaa..736cb7cec0bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8500,10 +8500,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx) { - /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); * - * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset + * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd * by this point, so look at 'root' */ @@ -9571,8 +9571,8 @@ enum special_kfunc_type { KF_bpf_obj_new_impl, KF_bpf_obj_drop_impl, KF_bpf_refcount_acquire_impl, - KF_bpf_list_push_front, - KF_bpf_list_push_back, + KF_bpf_list_push_front_impl, + KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, KF_bpf_cast_to_kern_ctx, @@ -9580,7 +9580,7 @@ enum special_kfunc_type { KF_bpf_rcu_read_lock, KF_bpf_rcu_read_unlock, KF_bpf_rbtree_remove, - KF_bpf_rbtree_add, + KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, @@ -9592,14 +9592,14 @@ BTF_SET_START(special_kfunc_set) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front) -BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_push_front_impl) +BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -9611,8 +9611,8 @@ BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front) -BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_push_front_impl) +BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_cast_to_kern_ctx) @@ -9620,7 +9620,7 @@ BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, 
bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -9954,15 +9954,15 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ static bool is_bpf_list_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_list_push_front] || - btf_id == special_kfunc_list[KF_bpf_list_push_back] || + return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add] || + return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || btf_id == special_kfunc_list[KF_bpf_rbtree_first]; } @@ -9975,7 +9975,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) static bool is_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add]; + return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } static bool is_rbtree_lock_required_kfunc(u32 btf_id) @@ -10016,12 +10016,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] || - kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]); + ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]); break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -10702,10 +10702,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_push_back] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { + if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + insn_aux->insert_off = regs[BPF_REG_2].off; err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", @@ -10721,7 +10722,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_rbtree_add_callback_state); if (err) { @@ -14764,7 +14765,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } @@ -17407,6 +17408,23 @@ static void specialize_kfunc(struct bpf_verifier_env *env, } } +static 
void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, + u16 struct_meta_reg, + u16 node_offset_reg, + struct bpf_insn *insn, + struct bpf_insn *insn_buf, + int *cnt) +{ + struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; + + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); + insn_buf[3] = *insn; + *cnt = 4; +} + static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { @@ -17453,6 +17471,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[1] = addr[1]; insn_buf[2] = *insn; *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + int struct_meta_reg = BPF_REG_3; + int node_offset_reg = BPF_REG_4; + + /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ + if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + struct_meta_reg = BPF_REG_4; + node_offset_reg = BPF_REG_5; + } + + __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, + node_offset_reg, insn, insn_buf, cnt); } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 619afcab2ab0..209811b1993a 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -14,7 +14,8 @@ * type ID of a struct in program BTF. * * The 'local_type_id' parameter must be a known constant. - * The 'meta' parameter is a hidden argument that is ignored. + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. * Returns * A pointer to an object of the type corresponding to the passed in * 'local_type_id', or NULL on failure. @@ -28,7 +29,8 @@ extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; * Free an allocated object. All fields of the object that require * destruction will be destructed before the storage is freed. * - * The 'meta' parameter is a hidden argument that is ignored. + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. * Returns * Void. */ @@ -41,7 +43,8 @@ extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; * Increment the refcount on a refcounted local kptr, turning the * non-owning reference input into an owning reference in the process. * - * The 'meta' parameter is a hidden argument that is ignored. + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. * Returns * An owning reference to the object pointed to by 'kptr' */ @@ -52,17 +55,35 @@ extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; /* Description * Add a new entry to the beginning of the BPF linked list. + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them * Returns - * Void. 
+ * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a list */ -extern void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +extern int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_list_push_front_impl */ +#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0) /* Description * Add a new entry to the end of the BPF linked list. + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them * Returns - * Void. + * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a list */ -extern void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +extern int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_list_push_back_impl */ +#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0) /* Description * Remove the entry at the beginning of the BPF linked list. @@ -88,11 +109,19 @@ extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, /* Description * Add 'node' to rbtree with root 'root' using comparator 'less' + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them * Returns - * Nothing + * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a tree */ -extern void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, - bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym; +extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_rbtree_add_impl */ +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) /* Description * Return the first (leftmost) node in input tree -- cgit v1.2.3-58-ga151 From 404ad75a36fb1a1008e9fe803aa7d0212df9e240 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:09 -0700 Subject: bpf: Migrate bpf_rbtree_remove to possibly fail This patch modifies bpf_rbtree_remove to account for possible failure due to the input rb_node already not being in any collection. The function can now return NULL, and does when the aforementioned scenario occurs. As before, on successful removal an owning reference to the removed node is returned. Adding KF_RET_NULL to bpf_rbtree_remove's kfunc flags - now KF_RET_NULL | KF_ACQUIRE - provides the desired verifier semantics: * retval must be checked for NULL before use * if NULL, retval's ref_obj_id is released * retval is a "maybe acquired" owning ref, not a non-owning ref, so it will live past end of critical section (bpf_spin_unlock), and thus can be checked for NULL after the end of the CS BPF programs must add checks ============================ This does change bpf_rbtree_remove's verifier behavior. BPF program writers will need to add NULL checks to their programs, but the resulting UX looks natural: bpf_spin_lock(&glock); n = bpf_rbtree_first(&ghead); if (!n) { /* ... 
*/} res = bpf_rbtree_remove(&ghead, &n->node); bpf_spin_unlock(&glock); if (!res) /* Newly-added check after this patch */ return 1; n = container_of(res, /* ... */); /* Do something else with n */ bpf_obj_drop(n); return 0; The "if (!res)" check above is the only addition necessary for the above program to pass verification after this patch. bpf_rbtree_remove no longer clobbers non-owning refs ==================================================== An issue arises when bpf_rbtree_remove fails, though. Consider this example: struct node_data { long key; struct bpf_list_node l; struct bpf_rb_node r; struct bpf_refcount ref; }; long failed_sum; void bpf_prog() { struct node_data *n = bpf_obj_new(/* ... */); struct bpf_rb_node *res; n->key = 10; bpf_spin_lock(&glock); bpf_list_push_back(&some_list, &n->l); /* n is now a non-owning ref */ res = bpf_rbtree_remove(&some_tree, &n->r, /* ... */); if (!res) failed_sum += n->key; /* not possible */ bpf_spin_unlock(&glock); /* if (res) { do something useful and drop } ... */ } The bpf_rbtree_remove in this example will always fail. Similarly to bpf_spin_unlock, bpf_rbtree_remove is a non-owning reference invalidation point. The verifier clobbers all non-owning refs after a bpf_rbtree_remove call, so the "failed_sum += n->key" line will fail verification, and in fact there's no good way to get information about the node which failed to add after the invalidation. This patch removes non-owning reference invalidation from bpf_rbtree_remove to allow the above usecase to pass verification. The logic for why this is now possible is as follows: Before this series, bpf_rbtree_add couldn't fail and thus assumed that its input, a non-owning reference, was in the tree. But it's easy to construct an example where two non-owning references pointing to the same underlying memory are acquired and passed to rbtree_remove one after another (see rbtree_api_release_aliasing in selftests/bpf/progs/rbtree_fail.c). So it was necessary to clobber non-owning refs to prevent this case and, more generally, to enforce "non-owning ref is definitely in some collection" invariant. This series removes that invariant and the failure / runtime checking added in this patch provide a clean way to deal with the aliasing issue - just fail to remove. Because the aliasing issue prevented by clobbering non-owning refs is no longer an issue, this patch removes the invalidate_non_owning_refs call from verifier handling of bpf_rbtree_remove. Note that bpf_spin_unlock - the other caller of invalidate_non_owning_refs - clobbers non-owning refs for a different reason, so its clobbering behavior remains unchanged. No BPF program changes are necessary for programs to remain valid as a result of this clobbering change. A valid program before this patch passed verification with its non-owning refs having shorter (or equal) lifetimes due to more aggressive clobbering. Also, update existing tests to check bpf_rbtree_remove retval for NULL where necessary, and move rbtree_api_release_aliasing from progs/rbtree_fail.c to progs/rbtree.c since it's now expected to pass verification. 
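
Putting the fragments above together, a minimal sketch of the post-patch usage looks like this. It assumes the scaffolding used throughout the rbtree selftests (bpf_experimental.h, a struct node_data with a bpf_rb_node member named 'node', and the glock/ghead globals); it is illustrative only, not code from this patch:

  SEC("tc")
  long remove_first(void *ctx)
  {
          struct bpf_rb_node *res;
          struct node_data *n;

          bpf_spin_lock(&glock);
          res = bpf_rbtree_first(&ghead);
          if (!res) {
                  bpf_spin_unlock(&glock);
                  return 1;
          }
          res = bpf_rbtree_remove(&ghead, res);
          bpf_spin_unlock(&glock);

          if (!res)       /* remove can now fail; must be checked */
                  return 1;

          n = container_of(res, struct node_data, node);
          bpf_obj_drop(n);
          return 0;
  }

The NULL check after bpf_spin_unlock is the only addition relative to the pre-patch pattern; the maybe-acquired owning reference returned by bpf_rbtree_remove outlives the critical section, so the check can sit after the unlock.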
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-8-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 21 +---- kernel/bpf/helpers.c | 8 +- kernel/bpf/verifier.c | 3 - .../testing/selftests/bpf/prog_tests/linked_list.c | 90 ++++++++++++++-------- tools/testing/selftests/bpf/prog_tests/rbtree.c | 25 ++++++ tools/testing/selftests/bpf/progs/rbtree.c | 74 +++++++++++++++++- tools/testing/selftests/bpf/progs/rbtree_fail.c | 77 +++++++----------- 7 files changed, 191 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 14889fd5ba8e..027f9f8a3551 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3805,25 +3805,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type goto end; } - /* need collection identity for non-owning refs before allowing this - * - * Consider a node type w/ both list and rb_node fields: - * struct node { - * struct bpf_list_node l; - * struct bpf_rb_node r; - * } - * - * Used like so: - * struct node *n = bpf_obj_new(....); - * bpf_list_push_front(&list_head, &n->l); - * bpf_rbtree_remove(&rb_root, &n->r); - * - * It should not be possible to rbtree_remove the node since it hasn't - * been added to a tree. But push_front converts n to a non-owning - * reference, and rbtree_remove accepts the non-owning reference to - * a type w/ bpf_rb_node field. - */ - if (btf_record_has_field(rec, BPF_LIST_NODE) && + if (rec->refcount_off < 0 && + btf_record_has_field(rec, BPF_LIST_NODE) && btf_record_has_field(rec, BPF_RB_NODE)) { ret = -EINVAL; goto end; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5067f8d46872..1835df333287 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2000,6 +2000,12 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = (struct rb_node *)node; + if (!n->__rb_parent_color) + RB_CLEAR_NODE(n); + + if (RB_EMPTY_NODE(n)) + return NULL; + rb_erase_cached(n, r); RB_CLEAR_NODE(n); return (struct bpf_rb_node *)n; @@ -2328,7 +2334,7 @@ BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 736cb7cec0bd..6a41b69a424e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10922,9 +10922,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ref_set_non_owning(env, ®s[BPF_REG_0]); } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove]) - invalidate_non_owning_refs(env); - if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 872e4bd500fd..f63309fd0e28 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -266,6 +266,59 @@ end: return NULL; } +static void list_and_rb_node_same_struct(bool 
refcount_field) +{ + int bpf_rb_node_btf_id, bpf_refcount_btf_id, foo_btf_id; + struct btf *btf; + int id, err; + + btf = init_btf(); + if (!ASSERT_OK_PTR(btf, "init_btf")) + return; + + bpf_rb_node_btf_id = btf__add_struct(btf, "bpf_rb_node", 24); + if (!ASSERT_GT(bpf_rb_node_btf_id, 0, "btf__add_struct bpf_rb_node")) + return; + + if (refcount_field) { + bpf_refcount_btf_id = btf__add_struct(btf, "bpf_refcount", 4); + if (!ASSERT_GT(bpf_refcount_btf_id, 0, "btf__add_struct bpf_refcount")) + return; + } + + id = btf__add_struct(btf, "bar", refcount_field ? 44 : 40); + if (!ASSERT_GT(id, 0, "btf__add_struct bar")) + return; + err = btf__add_field(btf, "a", LIST_NODE, 0, 0); + if (!ASSERT_OK(err, "btf__add_field bar::a")) + return; + err = btf__add_field(btf, "c", bpf_rb_node_btf_id, 128, 0); + if (!ASSERT_OK(err, "btf__add_field bar::c")) + return; + if (refcount_field) { + err = btf__add_field(btf, "ref", bpf_refcount_btf_id, 320, 0); + if (!ASSERT_OK(err, "btf__add_field bar::ref")) + return; + } + + foo_btf_id = btf__add_struct(btf, "foo", 20); + if (!ASSERT_GT(foo_btf_id, 0, "btf__add_struct foo")) + return; + err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); + if (!ASSERT_OK(err, "btf__add_field foo::a")) + return; + err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0); + if (!ASSERT_OK(err, "btf__add_field foo::b")) + return; + id = btf__add_decl_tag(btf, "contains:bar:a", foo_btf_id, 0); + if (!ASSERT_GT(id, 0, "btf__add_decl_tag contains:bar:a")) + return; + + err = btf__load_into_kernel(btf); + ASSERT_EQ(err, refcount_field ? 0 : -EINVAL, "check btf"); + btf__free(btf); +} + static void test_btf(void) { struct btf *btf = NULL; @@ -717,39 +770,12 @@ static void test_btf(void) } while (test__start_subtest("btf: list_node and rb_node in same struct")) { - btf = init_btf(); - if (!ASSERT_OK_PTR(btf, "init_btf")) - break; - - id = btf__add_struct(btf, "bpf_rb_node", 24); - if (!ASSERT_EQ(id, 5, "btf__add_struct bpf_rb_node")) - break; - id = btf__add_struct(btf, "bar", 40); - if (!ASSERT_EQ(id, 6, "btf__add_struct bar")) - break; - err = btf__add_field(btf, "a", LIST_NODE, 0, 0); - if (!ASSERT_OK(err, "btf__add_field bar::a")) - break; - err = btf__add_field(btf, "c", 5, 128, 0); - if (!ASSERT_OK(err, "btf__add_field bar::c")) - break; - - id = btf__add_struct(btf, "foo", 20); - if (!ASSERT_EQ(id, 7, "btf__add_struct foo")) - break; - err = btf__add_field(btf, "a", LIST_HEAD, 0, 0); - if (!ASSERT_OK(err, "btf__add_field foo::a")) - break; - err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0); - if (!ASSERT_OK(err, "btf__add_field foo::b")) - break; - id = btf__add_decl_tag(btf, "contains:bar:a", 7, 0); - if (!ASSERT_EQ(id, 8, "btf__add_decl_tag contains:bar:a")) - break; + list_and_rb_node_same_struct(true); + break; + } - err = btf__load_into_kernel(btf); - ASSERT_EQ(err, -EINVAL, "check btf"); - btf__free(btf); + while (test__start_subtest("btf: list_node and rb_node in same struct, no bpf_refcount")) { + list_and_rb_node_same_struct(false); break; } } diff --git a/tools/testing/selftests/bpf/prog_tests/rbtree.c b/tools/testing/selftests/bpf/prog_tests/rbtree.c index 156fa95c42f6..e9300c96607d 100644 --- a/tools/testing/selftests/bpf/prog_tests/rbtree.c +++ b/tools/testing/selftests/bpf/prog_tests/rbtree.c @@ -77,6 +77,29 @@ static void test_rbtree_first_and_remove(void) rbtree__destroy(skel); } +static void test_rbtree_api_release_aliasing(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct rbtree *skel; + 
int ret; + + skel = rbtree__open_and_load(); + if (!ASSERT_OK_PTR(skel, "rbtree__open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_api_release_aliasing), &opts); + ASSERT_OK(ret, "rbtree_api_release_aliasing"); + ASSERT_OK(opts.retval, "rbtree_api_release_aliasing retval"); + ASSERT_EQ(skel->data->first_data[0], 42, "rbtree_api_release_aliasing first rbtree_remove()"); + ASSERT_EQ(skel->data->first_data[1], -1, "rbtree_api_release_aliasing second rbtree_remove()"); + + rbtree__destroy(skel); +} + void test_rbtree_success(void) { if (test__start_subtest("rbtree_add_nodes")) @@ -85,6 +108,8 @@ void test_rbtree_success(void) test_rbtree_add_and_remove(); if (test__start_subtest("rbtree_first_and_remove")) test_rbtree_first_and_remove(); + if (test__start_subtest("rbtree_api_release_aliasing")) + test_rbtree_api_release_aliasing(); } #define BTF_FAIL_TEST(suffix) \ diff --git a/tools/testing/selftests/bpf/progs/rbtree.c b/tools/testing/selftests/bpf/progs/rbtree.c index 4c90aa6abddd..b09f4fffe57c 100644 --- a/tools/testing/selftests/bpf/progs/rbtree.c +++ b/tools/testing/selftests/bpf/progs/rbtree.c @@ -93,9 +93,11 @@ long rbtree_add_and_remove(void *ctx) res = bpf_rbtree_remove(&groot, &n->node); bpf_spin_unlock(&glock); + if (!res) + return 1; + n = container_of(res, struct node_data, node); removed_key = n->key; - bpf_obj_drop(n); return 0; @@ -148,9 +150,11 @@ long rbtree_first_and_remove(void *ctx) res = bpf_rbtree_remove(&groot, &o->node); bpf_spin_unlock(&glock); + if (!res) + return 5; + o = container_of(res, struct node_data, node); removed_key = o->key; - bpf_obj_drop(o); bpf_spin_lock(&glock); @@ -173,4 +177,70 @@ err_out: return 1; } +SEC("tc") +long rbtree_api_release_aliasing(void *ctx) +{ + struct node_data *n, *m, *o; + struct bpf_rb_node *res, *res2; + + n = bpf_obj_new(typeof(*n)); + if (!n) + return 1; + n->key = 41; + n->data = 42; + + bpf_spin_lock(&glock); + bpf_rbtree_add(&groot, &n->node, less); + bpf_spin_unlock(&glock); + + bpf_spin_lock(&glock); + + /* m and o point to the same node, + * but verifier doesn't know this + */ + res = bpf_rbtree_first(&groot); + if (!res) + goto err_out; + o = container_of(res, struct node_data, node); + + res = bpf_rbtree_first(&groot); + if (!res) + goto err_out; + m = container_of(res, struct node_data, node); + + res = bpf_rbtree_remove(&groot, &m->node); + /* Retval of previous remove returns an owning reference to m, + * which is the same node non-owning ref o is pointing at. + * We can safely try to remove o as the second rbtree_remove will + * return NULL since the node isn't in a tree. + * + * Previously we relied on the verifier type system + rbtree_remove + * invalidating non-owning refs to ensure that rbtree_remove couldn't + * fail, but now rbtree_remove does runtime checking so we no longer + * invalidate non-owning refs after remove. 
+ */ + res2 = bpf_rbtree_remove(&groot, &o->node); + + bpf_spin_unlock(&glock); + + if (res) { + o = container_of(res, struct node_data, node); + first_data[0] = o->data; + bpf_obj_drop(o); + } + if (res2) { + /* The second remove fails, so res2 is null and this doesn't + * execute + */ + m = container_of(res2, struct node_data, node); + first_data[1] = m->data; + bpf_obj_drop(m); + } + return 0; + +err_out: + bpf_spin_unlock(&glock); + return 1; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 46d7d18a218f..3fecf1c6dfe5 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -105,7 +105,7 @@ long rbtree_api_remove_unadded_node(void *ctx) } SEC("?tc") -__failure __msg("Unreleased reference id=2 alloc_insn=10") +__failure __msg("Unreleased reference id=3 alloc_insn=10") long rbtree_api_remove_no_drop(void *ctx) { struct bpf_rb_node *res; @@ -118,11 +118,13 @@ long rbtree_api_remove_no_drop(void *ctx) res = bpf_rbtree_remove(&groot, res); - n = container_of(res, struct node_data, node); - __sink(n); + if (res) { + n = container_of(res, struct node_data, node); + __sink(n); + } bpf_spin_unlock(&glock); - /* bpf_obj_drop(n) is missing here */ + /* if (res) { bpf_obj_drop(n); } is missing here */ return 0; unlock_err: @@ -150,35 +152,36 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") -long rbtree_api_add_release_unlock_escape(void *ctx) +__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed") +long rbtree_api_use_unchecked_remove_retval(void *ctx) { - struct node_data *n; - - n = bpf_obj_new(typeof(*n)); - if (!n) - return 1; + struct bpf_rb_node *res; bpf_spin_lock(&glock); - bpf_rbtree_add(&groot, &n->node, less); + + res = bpf_rbtree_first(&groot); + if (!res) + goto err_out; + res = bpf_rbtree_remove(&groot, res); + bpf_spin_unlock(&glock); bpf_spin_lock(&glock); - /* After add() in previous critical section, n should be - * release_on_unlock and released after previous spin_unlock, - * so should not be possible to use it here - */ - bpf_rbtree_remove(&groot, &n->node); + /* Must check res for NULL before using in rbtree_add below */ + bpf_rbtree_add(&groot, res, less); bpf_spin_unlock(&glock); return 0; + +err_out: + bpf_spin_unlock(&glock); + return 1; } SEC("?tc") __failure __msg("rbtree_remove node input must be non-owning ref") -long rbtree_api_release_aliasing(void *ctx) +long rbtree_api_add_release_unlock_escape(void *ctx) { - struct node_data *n, *m, *o; - struct bpf_rb_node *res; + struct node_data *n; n = bpf_obj_new(typeof(*n)); if (!n) @@ -189,37 +192,11 @@ long rbtree_api_release_aliasing(void *ctx) bpf_spin_unlock(&glock); bpf_spin_lock(&glock); - - /* m and o point to the same node, - * but verifier doesn't know this - */ - res = bpf_rbtree_first(&groot); - if (!res) - return 1; - o = container_of(res, struct node_data, node); - - res = bpf_rbtree_first(&groot); - if (!res) - return 1; - m = container_of(res, struct node_data, node); - - bpf_rbtree_remove(&groot, &m->node); - /* This second remove shouldn't be possible. 
Retval of previous - * remove returns owning reference to m, which is the same - * node o's non-owning ref is pointing at - * - * In order to preserve property - * * owning ref must not be in rbtree - * * non-owning ref must be in rbtree - * - * o's ref must be invalidated after previous remove. Otherwise - * we'd have non-owning ref to node that isn't in rbtree, and - * verifier wouldn't be able to use type system to prevent remove - * of ref that already isn't in any tree. Would have to do runtime - * checks in that case. + /* After add() in previous critical section, n should be + * release_on_unlock and released after previous spin_unlock, + * so should not be possible to use it here */ - bpf_rbtree_remove(&groot, &o->node); - + bpf_rbtree_remove(&groot, &n->node); bpf_spin_unlock(&glock); return 0; } -- cgit v1.2.3-58-ga151 From 3e81740a90626024a9d9c6f9bfa3d36204dafefb Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:10 -0700 Subject: bpf: Centralize btf_field-specific initialization logic All btf_fields in an object are 0-initialized by memset in bpf_obj_init. This might not be a valid initial state for some field types, in which case kfuncs that use the type will properly initialize their input if it's been 0-initialized. Some BPF graph collection types and kfuncs do this: bpf_list_{head,node} and bpf_rb_node. An earlier patch in this series added the bpf_refcount field, for which the 0 state indicates that the refcounted object should be free'd. bpf_obj_init treats this field specially, setting refcount to 1 instead of relying on scattered "refcount is 0? Must have just been initialized, let's set to 1" logic in kfuncs. This patch extends this treatment to list and rbtree field types, allowing most scattered initialization logic in kfuncs to be removed. Note that bpf_{list_head,rb_root} may be inside a BPF map, in which case they'll be 0-initialized without passing through the newly-added logic, so scattered initialization logic must remain for these collection root types. 
Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-9-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 33 +++++++++++++++++++++++++++++---- kernel/bpf/helpers.c | 14 ++++++-------- 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b065de2cfcea..18b592fde896 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -355,6 +355,34 @@ static inline u32 btf_field_type_align(enum btf_field_type type) } } +static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) +{ + memset(addr, 0, field->size); + + switch (field->type) { + case BPF_REFCOUNT: + refcount_set((refcount_t *)addr, 1); + break; + case BPF_RB_NODE: + RB_CLEAR_NODE((struct rb_node *)addr); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + INIT_LIST_HEAD((struct list_head *)addr); + break; + case BPF_RB_ROOT: + /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + break; + default: + WARN_ON_ONCE(1); + return; + } +} + static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) @@ -369,10 +397,7 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) - memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); - - if (rec->refcount_off >= 0) - refcount_set((refcount_t *)(obj + rec->refcount_off), 1); + bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1835df333287..00e5fb0682ac 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1936,10 +1936,11 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head { struct list_head *n = (void *)node, *h = (void *)head; + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); - if (unlikely(!n->next)) - INIT_LIST_HEAD(n); if (!list_empty(n)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl(n - off, rec); @@ -1975,6 +1976,9 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai { struct list_head *n, *h = (void *)head; + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) @@ -2000,9 +2004,6 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = (struct rb_node *)node; - if (!n->__rb_parent_color) - RB_CLEAR_NODE(n); - if (RB_EMPTY_NODE(n)) return NULL; @@ -2022,9 +2023,6 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; - if (!n->__rb_parent_color) - RB_CLEAR_NODE(n); - if (!RB_EMPTY_NODE(n)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl(n - off, rec); -- cgit v1.2.3-58-ga151 From 7b4ddf3920d247c2949073b9c274301c8131332a Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sun, 16 Apr 2023 03:49:27 -0500 Subject: bpf: Remove KF_KPTR_GET kfunc flag We've managed to improve 
the UX for kptrs significantly over the last 9 months. All of the existing use cases which previously had KF_KPTR_GET kfuncs (struct bpf_cpumask *, struct task_struct *, and struct cgroup *) have all been updated to be synchronized using RCU. In other words, their KF_KPTR_GET kfuncs have been removed in favor of KF_RCU | KF_ACQUIRE kfuncs, with the pointers themselves also being readable from maps in an RCU read region thanks to the types being RCU safe. While KF_KPTR_GET was a logical starting point for kptrs, it's become clear that they're not the correct abstraction. KF_KPTR_GET is a flag that essentially does nothing other than enforcing that the argument to a function is a pointer to a referenced kptr map value. At first glance, that's a useful thing to guarantee to a kfunc. It gives kfuncs the ability to try and acquire a reference on that kptr without requiring the BPF prog to do something like this: struct kptr_type *in_map, *new = NULL; in_map = bpf_kptr_xchg(&map->value, NULL); if (in_map) { new = bpf_kptr_type_acquire(in_map); in_map = bpf_kptr_xchg(&map->value, in_map); if (in_map) bpf_kptr_type_release(in_map); } That's clearly a pretty ugly (and racy) UX, and if using KF_KPTR_GET is the only alternative, it's better than nothing. However, the problem with any KF_KPTR_GET kfunc lies in the fact that it always requires some kind of synchronization in order to safely do an opportunistic acquire of the kptr in the map. This is because a BPF program running on another CPU could do a bpf_kptr_xchg() on that map value, and free the kptr after it's been read by the KF_KPTR_GET kfunc. For example, the now-removed bpf_task_kptr_get() kfunc did the following: struct task_struct *bpf_task_kptr_get(struct task_struct **pp) { struct task_struct *p; rcu_read_lock(); p = READ_ONCE(*pp); /* If p is non-NULL, it could still be freed by another CPU, * so we have to do an opportunistic refcount_inc_not_zero() * and return NULL if the task will be freed after the * current RCU read region. */ if (p && !refcount_inc_not_zero(&p->rcu_users)) p = NULL; rcu_read_unlock(); return p; } In other words, the kfunc uses RCU to ensure that the task remains valid after it's been peeked from the map. However, this is completely redundant with just defining a KF_RCU kfunc that itself does a refcount_inc_not_zero(), which is exactly what bpf_task_acquire() now does. So, the question of whether KF_KPTR_GET is useful is actually, "Are there any synchronization mechanisms / safety flags that are required by certain kptrs, but which are not provided by the verifier to kfuncs?" The answer to that question today is "No", because every kptr we currently care about is RCU protected. Even if the answer ever became "yes", the proper way to support that referenced kptr type would be to add support for whatever synchronization mechanism it requires in the verifier, rather than giving kfuncs a flag that says, "Here's a pointer to a referenced kptr in a map, do whatever you need to do." With all that said -- so as to allow us to consolidate the kfunc API, and simplify the verifier a bit, this patch removes KF_KPTR_GET, and all relevant logic from the verifier. 
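
As a rough sketch of the replacement pattern described above (not code from this patch; the map layout, section name and kfunc declarations are assumptions following the selftests' conventions), a program can now read the kptr straight out of the map value inside an RCU read section and take a reference with the KF_RCU | KF_ACQUIRE kfunc:

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  /* kfunc declarations, as carried in the selftests' common headers */
  void bpf_rcu_read_lock(void) __ksym;
  void bpf_rcu_read_unlock(void) __ksym;
  struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
  void bpf_task_release(struct task_struct *p) __ksym;

  struct map_value {
          struct task_struct __kptr *task;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct map_value);
  } tasks SEC(".maps");

  SEC("tc")
  int read_stashed_task(void *ctx)
  {
          struct task_struct *t, *acquired;
          struct map_value *v;
          int key = 0;

          v = bpf_map_lookup_elem(&tasks, &key);
          if (!v)
                  return 0;

          bpf_rcu_read_lock();
          t = v->task;                    /* readable under RCU, may be NULL */
          if (t) {
                  acquired = bpf_task_acquire(t); /* KF_RCU | KF_ACQUIRE, may return NULL */
                  if (acquired)
                          bpf_task_release(acquired);
          }
          bpf_rcu_read_unlock();
          return 0;
  }

  char _license[] SEC("license") = "GPL";
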
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230416084928.326135-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - kernel/bpf/verifier.c | 65 --------------------------------------------------- 2 files changed, 66 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 813227bff58a..508199e38415 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,7 +18,6 @@ #define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ -#define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ /* Trusted arguments are those which are guaranteed to be valid when passed to * the kfunc. It is used to enforce that pointers obtained from either acquire * kfuncs, or from the main kernel on a tracepoint or struct_ops callback diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a41b69a424e..5dae11ee01c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9339,11 +9339,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU; } -static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) -{ - return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); -} - static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -9554,7 +9549,6 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CTX, KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */ - KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ KF_ARG_PTR_TO_DYNPTR, KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, @@ -9666,21 +9660,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; - if (is_kfunc_arg_kptr_get(meta, argno)) { - if (!btf_type_is_ptr(ref_t)) { - verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); - return -EINVAL; - } - ref_t = btf_type_by_id(meta->btf, ref_t->type); - ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off); - if (!btf_type_is_struct(ref_t)) { - verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n", - meta->func_name, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - return KF_ARG_PTR_TO_KPTR; - } - if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; @@ -9794,40 +9773,6 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, return 0; } -static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - const struct btf_type *ref_t, - const char *ref_tname, - struct bpf_kfunc_call_arg_meta *meta, - int argno) -{ - struct btf_field *kptr_field; - - /* check_func_arg_reg_off allows var_off for - * PTR_TO_MAP_VALUE, but we need fixed offset to find - * off_desc. 
- */ - if (!tnum_is_const(reg->var_off)) { - verbose(env, "arg#0 must have constant offset\n"); - return -EINVAL; - } - - kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); - if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { - verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", - reg->off + reg->var_off.value); - return -EINVAL; - } - - if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, true)) { - verbose(env, "kernel function %s args#%d expected pointer to %s %s\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - return 0; -} - static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_verifier_state *state = env->cur_state; @@ -10315,7 +10260,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ /* Trusted arguments have the same offset checks as release arguments */ arg_type |= OBJ_RELEASE; break; - case KF_ARG_PTR_TO_KPTR: case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: @@ -10368,15 +10312,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->arg_obj_drop.btf_id = reg->btf_id; } break; - case KF_ARG_PTR_TO_KPTR: - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#0 expected pointer to map value\n"); - return -EINVAL; - } - ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); - if (ret < 0) - return ret; - break; case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; -- cgit v1.2.3-58-ga151 From 69a8c792cd9518071dc801bb110e0f2210d9f958 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 17 Apr 2023 09:17:48 +0100 Subject: bpf: lirc program type should not require SYS_CAP_ADMIN Make it possible to load lirc program type with just CAP_BPF. There is nothing exceptional about lirc programs that means they require SYS_CAP_ADMIN. In order to attach or detach a lirc program type you need permission to open /dev/lirc0; if you have permission to do that, you can alter all sorts of lirc receiving options. Changing the IR protocol decoder is no different. Right now on a typical distribution /dev/lirc devices are only read/write by root. Ideally we would make them group read/write like other devices so that local users can use them without becoming root. Signed-off-by: Sean Young Link: https://lore.kernel.org/r/ZD0ArKpwnDBJZsrE@gofer.mess.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28eac7434d32..bcf1a1920ddd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2454,7 +2454,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit v1.2.3-58-ga151 From 3be49f79555ee975acb4ad8b5de06fa4351264aa Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 17 Apr 2023 15:21:34 -0700 Subject: bpf: Improve verifier u32 scalar equality checking In [1], I tried to remove bpf-specific codes to prevent certain llvm optimizations, and add llvm TTI (target transform info) hooks to prevent those optimizations. 
During this process, I found if I enable llvm SimplifyCFG:shouldFoldTwoEntryPHINode transformation, I will hit the following verification failure with selftests: ... 8: (18) r1 = 0xffffc900001b2230 ; R1_w=map_value(off=560,ks=4,vs=564,imm=0) 10: (61) r1 = *(u32 *)(r1 +0) ; R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) ; if (skb->tstamp == EGRESS_ENDHOST_MAGIC) 11: (79) r2 = *(u64 *)(r6 +152) ; R2_w=scalar() R6=ctx(off=0,imm=0) ; if (skb->tstamp == EGRESS_ENDHOST_MAGIC) 12: (55) if r2 != 0xb9fbeef goto pc+10 ; R2_w=195018479 13: (bc) w2 = w1 ; R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R2_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) ; if (test < __NR_TESTS) 14: (a6) if w1 < 0x9 goto pc+1 16: R0=2 R1_w=scalar(umax=8,var_off=(0x0; 0xf)) R2_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6=ctx(off=0,imm=0) R10=fp0 ; 16: (27) r2 *= 28 ; R2_w=scalar(umax=120259084260,var_off=(0x0; 0x1ffffffffc),s32_max=2147483644,u32_max=-4) 17: (18) r3 = 0xffffc900001b2118 ; R3_w=map_value(off=280,ks=4,vs=564,imm=0) 19: (0f) r3 += r2 ; R2_w=scalar(umax=120259084260,var_off=(0x0; 0x1ffffffffc),s32_max=2147483644,u32_max=-4) R3_w=map_value(off=280,ks=4,vs=564,umax=120259084260,var_off=(0x0; 0x1ffffffffc),s32_max=2147483644,u32_max=-4) 20: (61) r2 = *(u32 *)(r3 +0) R3 unbounded memory access, make sure to bounds check any such access processed 97 insns (limit 1000000) max_states_per_insn 1 total_states 10 peak_states 10 mark_read 6 -- END PROG LOAD LOG -- libbpf: prog 'ingress_fwdns_prio100': failed to load: -13 libbpf: failed to load object 'test_tc_dtime' libbpf: failed to load BPF skeleton 'test_tc_dtime': -13 ... At insn 14, with condition 'w1 < 9', register r1 is changed from an arbitrary u32 value to `scalar(umax=8,var_off=(0x0; 0xf))`. Register r2, however, remains as an arbitrary u32 value. Current verifier won't claim r1/r2 equality if the previous mov is alu32 ('w2 = w1'). If r1 upper 32bit value is not 0, we indeed cannot clamin r1/r2 equality after 'w2 = w1'. But in this particular case, we know r1 upper 32bit value is 0, so it is safe to claim r1/r2 equality. This patch exactly did this. For a 32bit subreg mov, if the src register upper 32bit is 0, it is okay to claim equality between src and dst registers. With this patch, the above verification sequence becomes ... 8: (18) r1 = 0xffffc9000048e230 ; R1_w=map_value(off=560,ks=4,vs=564,imm=0) 10: (61) r1 = *(u32 *)(r1 +0) ; R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) ; if (skb->tstamp == EGRESS_ENDHOST_MAGIC) 11: (79) r2 = *(u64 *)(r6 +152) ; R2_w=scalar() R6=ctx(off=0,imm=0) ; if (skb->tstamp == EGRESS_ENDHOST_MAGIC) 12: (55) if r2 != 0xb9fbeef goto pc+10 ; R2_w=195018479 13: (bc) w2 = w1 ; R1_w=scalar(id=6,umax=4294967295,var_off=(0x0; 0xffffffff)) R2_w=scalar(id=6,umax=4294967295,var_off=(0x0; 0xffffffff)) ; if (test < __NR_TESTS) 14: (a6) if w1 < 0x9 goto pc+1 ; R1_w=scalar(id=6,umin=9,umax=4294967295,var_off=(0x0; 0xffffffff)) ... from 14 to 16: R0=2 R1_w=scalar(id=6,umax=8,var_off=(0x0; 0xf)) R2_w=scalar(id=6,umax=8,var_off=(0x0; 0xf)) R6=ctx(off=0,imm=0) R10=fp0 16: (27) r2 *= 28 ; R2_w=scalar(umax=224,var_off=(0x0; 0xfc)) 17: (18) r3 = 0xffffc9000048e118 ; R3_w=map_value(off=280,ks=4,vs=564,imm=0) 19: (0f) r3 += r2 20: (61) r2 = *(u32 *)(r3 +0) ; R2_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R3_w=map_value(off=280,ks=4,vs=564,umax=224,var_off=(0x0; 0xfc),s32_max=252,u32_max=252) ... and eventually the bpf program can be verified successfully. 
[1] https://reviews.llvm.org/D147968 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230417222134.359714-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5dae11ee01c3..1e05355facdc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12421,12 +12421,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { + bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; + + if (is_src_reg_u32 && !src_reg->id) + src_reg->id = ++env->id_gen; copy_register_state(dst_reg, src_reg); - /* Make sure ID is cleared otherwise + /* Make sure ID is cleared if src_reg is not in u32 range otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() */ - dst_reg->id = 0; + if (!is_src_reg_u32) + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { -- cgit v1.2.3-58-ga151 From 2569c7b8726fc06d946a4f999fb1be15b68f3f3c Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Thu, 20 Apr 2023 11:27:34 +0800 Subject: bpf: support access variable length array of integer type After this commit: bpf: Support variable length array in tracing programs (9c5f8a1008a1) Trace programs can access variable length array, but for structure type. This patch adds support for integer type. Example: Hook load_balance struct sched_domain { ... unsigned long span[]; } The access: sd->span[0]. Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20230420032735.27760-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 027f9f8a3551..a0887ee44e89 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6157,11 +6157,13 @@ again: if (off < moff) goto error; - /* Only allow structure for now, can be relaxed for - * other types later. - */ + /* allow structure and integer */ t = btf_type_skip_modifiers(btf, array_elem->type, NULL); + + if (btf_type_is_int(t)) + return WALK_SCALAR; + if (!btf_type_is_struct(t)) goto error; -- cgit v1.2.3-58-ga151 From acf1c3d68e9a31f10d92bc67ad4673cdae5e8d92 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 20 Apr 2023 18:49:01 -0700 Subject: bpf: Fix race between btf_put and btf_idr walk. Florian and Eduard reported hard dead lock: [ 58.433327] _raw_spin_lock_irqsave+0x40/0x50 [ 58.433334] btf_put+0x43/0x90 [ 58.433338] bpf_find_btf_id+0x157/0x240 [ 58.433353] btf_parse_fields+0x921/0x11c0 This happens since btf->refcount can be 1 at the time of btf_put() and btf_put() will call btf_free_id() which will try to grab btf_idr_lock and will dead lock. Avoid the issue by doing btf_put() without locking. 
Fixes: 3d78417b60fb ("bpf: Add bpf_btf_find_by_name_kind() helper.") Fixes: 1e89106da253 ("bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn().") Reported-by: Florian Westphal Reported-by: Eduard Zingerman Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20230421014901.70908-1-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a0887ee44e89..7db4ec125fbd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -577,8 +577,8 @@ static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) *btf_p = btf; return ret; } - spin_lock_bh(&btf_idr_lock); btf_put(btf); + spin_lock_bh(&btf_idr_lock); } spin_unlock_bh(&btf_idr_lock); return ret; @@ -8354,12 +8354,10 @@ check_modules: btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); - if (IS_ERR(cands)) { - btf_put(mod_btf); + btf_put(mod_btf); + if (IS_ERR(cands)) return ERR_CAST(cands); - } spin_lock_bh(&btf_idr_lock); - btf_put(mod_btf); } spin_unlock_bh(&btf_idr_lock); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 -- cgit v1.2.3-58-ga151 From 4ab07209d5cc8cb6d2a5324c07b3efc3b2fde494 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 21 Apr 2023 00:44:31 -0700 Subject: bpf: Fix bpf_refcount_acquire's refcount_t address calculation When calculating the address of the refcount_t struct within a local kptr, bpf_refcount_acquire_impl should add refcount_off bytes to the address of the local kptr. Due to some missing parens, the function is incorrectly adding sizeof(refcount_t) * refcount_off bytes. This patch fixes the calculation. Due to the incorrect calculation, bpf_refcount_acquire_impl was trying to refcount_inc some memory well past the end of local kptrs, resulting in kasan and refcount complaints, as reported in [0]. In that thread, Florian and Eduard discovered that bpf selftests written in the new style - with __success and an expected __retval, specifically - were not actually being run. As a result, selftests added in bpf_refcount series weren't really exercising this behavior, and thus didn't unearth the bug. With this fixed behavior it's safe to revert commit 7c4b96c00043 ("selftests/bpf: disable program test run for progs/refcounted_kptr.c"), this patch does so. 
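
The operator-precedence pitfall behind the miscalculation can be shown with a tiny standalone snippet (generic types, not the kernel's code; illustrative only):

  #include <stdio.h>

  struct refcnt { int v; };       /* stand-in for refcount_t */

  int main(void)
  {
          char obj[64] = { 0 };
          void *p = obj;
          unsigned long refcount_off = 8;   /* byte offset of the field */

          /* cast binds tighter than '+': advances by 8 * sizeof(struct refcnt) */
          struct refcnt *wrong = (struct refcnt *)p + refcount_off;
          /* parenthesized: advances by 8 bytes, as intended */
          struct refcnt *right = (struct refcnt *)((char *)p + refcount_off);

          printf("wrong: +%td bytes, right: +%td bytes\n",
                 (char *)wrong - obj, (char *)right - obj);
          return 0;
  }

With a 4-byte refcount_t and a nonzero refcount_off, the unparenthesized form lands well past the intended field, which matches the out-of-bounds refcount_inc the report describes.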
[0] https://lore.kernel.org/bpf/ZEEp+j22imoN6rn9@strlen.de/ Fixes: 7c50b1cb76ac ("bpf: Add bpf_refcount_acquire kfunc") Reported-by: Florian Westphal Reported-by: Eduard Zingerman Signed-off-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Tested-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20230421074431.3548349-1-davemarchevsky@fb.com --- kernel/bpf/helpers.c | 2 +- tools/testing/selftests/bpf/progs/refcounted_kptr.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 00e5fb0682ac..8d368fa353f9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1925,7 +1925,7 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ - ref = (struct bpf_refcount *)p__refcounted_kptr + meta->record->refcount_off; + ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); refcount_inc((refcount_t *)ref); return (void *)p__refcounted_kptr; diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index b6b2d4f97b19..1d348a225140 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -219,7 +219,7 @@ static long __read_from_unstash(int idx) #define INSERT_READ_BOTH(rem_tree, rem_list, desc) \ SEC("tc") \ __description(desc) \ -__success /* __retval(579) temporarily disabled */ \ +__success __retval(579) \ long insert_and_remove_tree_##rem_tree##_list_##rem_list(void *ctx) \ { \ long err, tree_data, list_data; \ @@ -258,7 +258,7 @@ INSERT_READ_BOTH(false, true, "insert_read_both: remove from list"); #define INSERT_READ_BOTH(rem_tree, rem_list, desc) \ SEC("tc") \ __description(desc) \ -__success /* __retval(579) temporarily disabled */ \ +__success __retval(579) \ long insert_and_remove_lf_tree_##rem_tree##_list_##rem_list(void *ctx) \ { \ long err, tree_data, list_data; \ @@ -296,7 +296,7 @@ INSERT_READ_BOTH(false, true, "insert_read_both_list_first: remove from list"); #define INSERT_DOUBLE_READ_AND_DEL(read_fn, read_root, desc) \ SEC("tc") \ __description(desc) \ -__success /* temporarily __retval(-1) disabled */ \ +__success __retval(-1) \ long insert_double_##read_fn##_and_del_##read_root(void *ctx) \ { \ long err, list_data; \ @@ -329,7 +329,7 @@ INSERT_DOUBLE_READ_AND_DEL(__read_from_list, head, "insert_double_del: 2x read-a #define INSERT_STASH_READ(rem_tree, desc) \ SEC("tc") \ __description(desc) \ -__success /* __retval(84) temporarily disabled */ \ +__success __retval(84) \ long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx) \ { \ long err, tree_data, map_data; \ -- cgit v1.2.3-58-ga151 From 00e74ae0863827d944e36e56a4ce1e77e50edb91 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 18 Apr 2023 15:53:38 -0700 Subject: bpf: Don't EFAULT for getsockopt with optval=NULL Some socket options do getsockopt with optval=NULL to estimate the size of the final buffer (which is returned via optlen). This breaks BPF getsockopt assumptions about permitted optval buffer size. Let's enforce these assumptions only when non-NULL optval is provided. 
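
For context, a rough userspace sketch of the probing pattern described above (the option name is a placeholder; whether a given option supports NULL-optval probing, and what the probing call returns, is option-specific and not taken from this patch):

  #include <stdlib.h>
  #include <sys/socket.h>

  static void *get_variable_len_opt(int fd, int level, int optname,
                                    socklen_t *len)
  {
          void *buf;

          *len = 0;
          /* Probe: optval == NULL, kernel reports the required size in *len */
          getsockopt(fd, level, optname, NULL, len);
          if (*len == 0)
                  return NULL;

          buf = malloc(*len);
          if (!buf)
                  return NULL;

          /* Real call with a correctly sized buffer */
          if (getsockopt(fd, level, optname, buf, len) != 0) {
                  free(buf);
                  return NULL;
          }
          return buf;
  }
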
Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Reported-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/ZD7Js4fj5YyI2oLd@google.com/T/#mb68daf700f87a9244a15d01d00c3f0e5b08f49f7 Link: https://lore.kernel.org/bpf/20230418225343.553806-2-sdf@google.com --- kernel/bpf/cgroup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 53edb8ad2471..a06e118a9be5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1921,14 +1921,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret < 0) goto out; - if (ctx.optlen > max_optlen || ctx.optlen < 0) { + if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { ret = -EFAULT; goto out; } if (ctx.optlen != 0) { - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { + if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) { + ret = -EFAULT; + goto out; + } + if (put_user(ctx.optlen, optlen)) { ret = -EFAULT; goto out; } -- cgit v1.2.3-58-ga151 From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:54 +0200 Subject: bpf: add bpf_link support for BPF_NETFILTER programs Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked yet, if a program is attached only a c-stub is called and not the actual bpf program. Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = 0; /* unused */ attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Will also be used to disallow registration of two bpf programs with same priority in a followup patch. v4: arm32 cmpxchg only supports 32bit operand s/prio/priority/ v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if more use cases pop up (arptables, ebtables, netdev ingress/egress etc). 
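
Filling in the boilerplate around the uapi example above (a hedged sketch: it assumes uapi headers that already carry the link_create.netfilter fields added by this patch, and open-codes the bpf(2) wrapper since libc does not provide one):

  #include <linux/bpf.h>
  #include <linux/netfilter.h>        /* NFPROTO_IPV4, NF_INET_LOCAL_IN */
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int nf_link_create(int progfd)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.link_create.prog_fd = progfd;
          attr.link_create.attach_type = 0;     /* unused for netfilter */
          attr.link_create.netfilter.pf = NFPROTO_IPV4;
          attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN;
          attr.link_create.netfilter.priority = -128;

          /* returns a link fd; the hook goes away when the fd is released */
          return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
  }

As the patch's checks show, only NFPROTO_IPV4/IPV6 with a valid NF_INET_* hook number are accepted for now, and a nonzero flags value is rejected with -EOPNOTSUPP.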
Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/netfilter.h | 1 + include/net/netfilter/nf_bpf_link.h | 10 +++ include/uapi/linux/bpf.h | 14 ++++ kernel/bpf/syscall.c | 6 ++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 1 + net/netfilter/nf_bpf_link.c | 159 ++++++++++++++++++++++++++++++++++++ 7 files changed, 194 insertions(+) create mode 100644 include/net/netfilter/nf_bpf_link.h create mode 100644 net/netfilter/nf_bpf_link.c (limited to 'kernel') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c8e03bcaecaa..0762444e3767 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv, enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, + NF_HOOK_OP_BPF, }; struct nf_hook_ops { diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h new file mode 100644 index 000000000000..eeaeaf3d15de --- /dev/null +++ b/include/net/netfilter/nf_bpf_link.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK) +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +#else +static inline int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4b20a7269bee..1bb11a6ee667 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -986,6 +986,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, }; enum bpf_attach_type { @@ -1050,6 +1051,7 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, MAX_BPF_LINK_TYPE, }; @@ -1560,6 +1562,12 @@ union bpf_attr { */ __u64 cookie; } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; }; } link_create; @@ -6410,6 +6418,12 @@ struct bpf_link_info { struct { __u32 map_id; } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bcf1a1920ddd..14f39c1e573e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2462,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ @@ -4588,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_NETFILTER: break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: @@ -4654,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case BPF_PROG_TYPE_NETFILTER: + ret = bpf_nf_link_attach(attr, prog); + break; #endif case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: diff --git a/net/netfilter/Kconfig 
b/net/netfilter/Kconfig index d0bf630482c1..441d1f134110 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_BPF_LINK + def_bool BPF_SYSCALL + config NETFILTER_NETLINK_HOOK tristate "Netfilter base hook dump support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5ffef1cd6143..d4958e7e7631 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o endif obj-$(CONFIG_NETFILTER) = netfilter.o +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c new file mode 100644 index 000000000000..efa4f3390742 --- /dev/null +++ b/net/netfilter/nf_bpf_link.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, + const struct nf_hook_state *s) +{ + return NF_ACCEPT; +} + +struct bpf_nf_link { + struct bpf_link link; + struct nf_hook_ops hook_ops; + struct net *net; + u32 dead; +}; + +static void bpf_nf_link_release(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + if (nf_link->dead) + return; + + /* prevent hook-not-found warning splat from netfilter core when + * .detach was already called + */ + if (!cmpxchg(&nf_link->dead, 0, 1)) + nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); +} + +static void bpf_nf_link_dealloc(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + kfree(nf_link); +} + +static int bpf_nf_link_detach(struct bpf_link *link) +{ + bpf_nf_link_release(link); + return 0; +} + +static void bpf_nf_link_show_info(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", + nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, + nf_link->hook_ops.priority); +} + +static int bpf_nf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + info->netfilter.pf = nf_link->hook_ops.pf; + info->netfilter.hooknum = nf_link->hook_ops.hooknum; + info->netfilter.priority = nf_link->hook_ops.priority; + info->netfilter.flags = 0; + + return 0; +} + +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + return -EOPNOTSUPP; +} + +static const struct bpf_link_ops bpf_nf_link_lops = { + .release = bpf_nf_link_release, + .dealloc = bpf_nf_link_dealloc, + .detach = bpf_nf_link_detach, + .show_fdinfo = bpf_nf_link_show_info, + .fill_link_info = bpf_nf_link_fill_link_info, + .update_prog = bpf_nf_link_update, +}; + +static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) +{ + switch (attr->link_create.netfilter.pf) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS) + return -EPROTO; + break; + default: + return -EAFNOSUPPORT; + } + + if (attr->link_create.netfilter.flags) + return -EOPNOTSUPP; + + /* make sure conntrack confirm is always last. + * + * In the future, if userspace can e.g. 
request defrag, then + * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG" + * should fail. + */ + switch (attr->link_create.netfilter.priority) { + case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */ + case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */ + } + + return 0; +} + +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_nf_link *link; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + err = bpf_nf_check_pf_and_hooks(attr); + if (err) + return err; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog); + + link->hook_ops.hook = nf_hook_run_bpf; + link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; + link->hook_ops.priv = prog; + + link->hook_ops.pf = attr->link_create.netfilter.pf; + link->hook_ops.priority = attr->link_create.netfilter.priority; + link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; + + link->net = net; + link->dead = false; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + err = nf_register_net_hook(net, &link->hook_ops); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); +} -- cgit v1.2.3-58-ga151 From fd9c663b9ad67dedfc9a3fd3429ddd3e83782b4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:55 +0200 Subject: bpf: minimal support for programs hooked into netfilter framework This adds minimal support for BPF_PROG_TYPE_NETFILTER bpf programs that will be invoked via the NF_HOOK() points in the ip stack. Invocation incurs an indirect call. This is not a necessity: Its possible to add 'DEFINE_BPF_DISPATCHER(nf_progs)' and handle the program invocation with the same method already done for xdp progs. This isn't done here to keep the size of this chunk down. Verifier restricts verdicts to either DROP or ACCEPT. 
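As a rough sketch of what such a program looks like against the bpf_nf_ctx added below; the SEC("netfilter") section name and the vmlinux.h/libbpf usage follow later tooling conventions and are assumptions here, not something this patch provides.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* verdict values from uapi/linux/netfilter.h; the macros are not in BTF */
  #define NF_DROP   0
  #define NF_ACCEPT 1

  char LICENSE[] SEC("license") = "GPL";

  SEC("netfilter")
  int sample_filter(struct bpf_nf_ctx *ctx)
  {
          const struct nf_hook_state *state = ctx->state; /* trusted btf pointer */
          struct sk_buff *skb = ctx->skb;                  /* trusted btf pointer */

          /* direct BTF-typed field reads are allowed on both pointers */
          if (state->pf == 2 /* NFPROTO_IPV4 */ && skb->len == 0)
                  return NF_DROP;

          /* the verifier only accepts NF_DROP or NF_ACCEPT as the verdict */
          return NF_ACCEPT;
  }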
Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-3-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 4 +++ include/net/netfilter/nf_bpf_link.h | 5 +++ kernel/bpf/btf.c | 6 ++++ kernel/bpf/verifier.c | 3 ++ net/core/filter.c | 1 + net/netfilter/nf_bpf_link.c | 70 ++++++++++++++++++++++++++++++++++++- 6 files changed, 88 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d4ee3ccd3753..39a999abb0ce 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,6 +79,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) +#ifdef CONFIG_NETFILTER +BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, + struct bpf_nf_ctx, struct bpf_nf_ctx) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h index eeaeaf3d15de..6c984b0ea838 100644 --- a/include/net/netfilter/nf_bpf_link.h +++ b/include/net/netfilter/nf_bpf_link.h @@ -1,5 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ +struct bpf_nf_ctx { + const struct nf_hook_state *state; + struct sk_buff *skb; +}; + #if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK) int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7db4ec125fbd..6b682b8e4b50 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,9 @@ #include #include #include + +#include + #include #include "../tools/lib/bpf/relo_core.h" @@ -212,6 +215,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_SK_SKB, BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, + BTF_KFUNC_HOOK_NETFILTER, BTF_KFUNC_HOOK_MAX, }; @@ -7802,6 +7806,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: return BTF_KFUNC_HOOK_LWT; + case BPF_PROG_TYPE_NETFILTER: + return BTF_KFUNC_HOOK_NETFILTER; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e05355facdc..fc7281d39e46 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13816,6 +13816,9 @@ static int check_return_code(struct bpf_verifier_env *env) } break; + case BPF_PROG_TYPE_NETFILTER: + range = tnum_range(NF_DROP, NF_ACCEPT); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. 
diff --git a/net/core/filter.c b/net/core/filter.c index 44fb997434ad..d9ce04ca22ce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11717,6 +11717,7 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); } late_initcall(bpf_kfunc_init); diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index efa4f3390742..49cfc5215386 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -8,7 +9,13 @@ static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, const struct nf_hook_state *s) { - return NF_ACCEPT; + const struct bpf_prog *prog = bpf_prog; + struct bpf_nf_ctx ctx = { + .state = s, + .skb = skb, + }; + + return bpf_prog_run(prog, &ctx); } struct bpf_nf_link { @@ -157,3 +164,64 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +const struct bpf_prog_ops netfilter_prog_ops = { +}; + +static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name) +{ + struct btf *btf; + s32 type_id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf)) + return false; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (WARN_ON_ONCE(type_id < 0)) + return false; + + info->btf = btf; + info->btf_id = type_id; + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + return true; +} + +static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_nf_ctx)) + return false; + + if (type == BPF_WRITE) + return false; + + switch (off) { + case bpf_ctx_range(struct bpf_nf_ctx, skb): + if (size != sizeof_field(struct bpf_nf_ctx, skb)) + return false; + + return nf_ptr_to_btf_id(info, "sk_buff"); + case bpf_ctx_range(struct bpf_nf_ctx, state): + if (size != sizeof_field(struct bpf_nf_ctx, state)) + return false; + + return nf_ptr_to_btf_id(info, "nf_hook_state"); + default: + return false; + } + + return false; +} + +static const struct bpf_func_proto * +bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +const struct bpf_verifier_ops netfilter_verifier_ops = { + .is_valid_access = nf_is_valid_access, + .get_func_proto = bpf_nf_func_proto, +}; -- cgit v1.2.3-58-ga151 From 7deca5eae83389ca40ac1b1bde96e4af17cca84f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 24 Apr 2023 13:43:21 -0700 Subject: bpf: Disable bpf_refcount_acquire kfunc calls until race conditions are fixed As reported by Kumar in [0], the shared ownership implementation for BPF programs has some race conditions which need to be addressed before it can safely be used. This patch does so in a minimal way instead of ripping out shared ownership entirely, as proper fixes for the issues raised will follow ASAP, at which point this patch's commit can be reverted to re-enable shared ownership. The patch removes the ability to call bpf_refcount_acquire_impl from BPF programs. 
Programs can only bump refcount and obtain a new owning reference using this kfunc, so removing the ability to call it effectively disables shared ownership. Instead of changing success / failure expectations for bpf_refcount-related selftests, this patch just disables them from running for now. [0]: https://lore.kernel.org/bpf/d7hyspcow5wtjcmw4fugdgyp3fwhljwuscp3xyut5qnwivyeru@ysdq543otzv2/ Reported-by: Kumar Kartikeya Dwivedi Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230424204321.2680232-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++- tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d73139ee4d8..5c4aa393f65a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10509,7 +10509,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); return -EINVAL; } - + if (rec->refcount_off >= 0) { + verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); + return -EINVAL; + } meta->arg_refcount_acquire.btf = reg->btf; meta->arg_refcount_acquire.btf_id = reg->btf_id; break; diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index 2ab23832062d..595cbf92bff5 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -9,10 +9,8 @@ void test_refcounted_kptr(void) { - RUN_TESTS(refcounted_kptr); } void test_refcounted_kptr_fail(void) { - RUN_TESTS(refcounted_kptr_fail); } -- cgit v1.2.3-58-ga151 From a0c109dcafb15b8bee187c49fb746779374f60f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 24 Apr 2023 16:11:03 +0000 Subject: bpf: Add __rcu_read_{lock,unlock} into btf id deny list The tracing recursion prevention mechanism must be protected by rcu, that leaves __rcu_read_{lock,unlock} unprotected by this mechanism. If we trace them, the recursion will happen. Let's add them into the btf id deny list. When CONFIG_PREEMPT_RCU is enabled, it can be reproduced with a simple bpf program as such: SEC("fentry/__rcu_read_lock") int fentry_run() { return 0; } Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230424161104.3737-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c4aa393f65a..fbcf5a4e2fcd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18671,6 +18671,10 @@ BTF_ID(func, rcu_read_unlock_strict) BTF_ID(func, preempt_count_add) BTF_ID(func, preempt_count_sub) #endif +#ifdef CONFIG_PREEMPT_RCU +BTF_ID(func, __rcu_read_lock) +BTF_ID(func, __rcu_read_unlock) +#endif BTF_SET_END(btf_id_deny) static bool can_be_sleepable(struct bpf_prog *prog) -- cgit v1.2.3-58-ga151
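For completeness, a self-contained version of the reproducer quoted in the commit message above; the license and section conventions follow the usual libbpf style and are assumptions, not part of the patch.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  /* With __rcu_read_lock in btf_id_deny, the verifier now rejects this
   * program at load time on CONFIG_PREEMPT_RCU kernels instead of letting
   * it recurse as described above.
   */
  SEC("fentry/__rcu_read_lock")
  int BPF_PROG(fentry_run)
  {
          return 0;
  }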