From 21b172ee11b6ec260bd7e6a27b11a8a8d392fce5 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 13 Aug 2018 12:21:58 +0300 Subject: net/xdp: Fix suspicious RCU usage warning Fix the warning below by calling rhashtable_lookup_fast. Also, make some code movements for better quality and human readability. [ 342.450870] WARNING: suspicious RCU usage [ 342.455856] 4.18.0-rc2+ #17 Tainted: G O [ 342.462210] ----------------------------- [ 342.467202] ./include/linux/rhashtable.h:481 suspicious rcu_dereference_check() usage! [ 342.476568] [ 342.476568] other info that might help us debug this: [ 342.476568] [ 342.486978] [ 342.486978] rcu_scheduler_active = 2, debug_locks = 1 [ 342.495211] 4 locks held by modprobe/3934: [ 342.500265] #0: 00000000e23116b2 (mlx5_intf_mutex){+.+.}, at: mlx5_unregister_interface+0x18/0x90 [mlx5_core] [ 342.511953] #1: 00000000ca16db96 (rtnl_mutex){+.+.}, at: unregister_netdev+0xe/0x20 [ 342.521109] #2: 00000000a46e2c4b (&priv->state_lock){+.+.}, at: mlx5e_close+0x29/0x60 [mlx5_core] [ 342.531642] #3: 0000000060c5bde3 (mem_id_lock){+.+.}, at: xdp_rxq_info_unreg+0x93/0x6b0 [ 342.541206] [ 342.541206] stack backtrace: [ 342.547075] CPU: 12 PID: 3934 Comm: modprobe Tainted: G O 4.18.0-rc2+ #17 [ 342.556621] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 1.5.4 10/002/2015 [ 342.565606] Call Trace: [ 342.568861] dump_stack+0x78/0xb3 [ 342.573086] xdp_rxq_info_unreg+0x3f5/0x6b0 [ 342.578285] ? __call_rcu+0x220/0x300 [ 342.582911] mlx5e_free_rq+0x38/0xc0 [mlx5_core] [ 342.588602] mlx5e_close_channel+0x20/0x120 [mlx5_core] [ 342.594976] mlx5e_close_channels+0x26/0x40 [mlx5_core] [ 342.601345] mlx5e_close_locked+0x44/0x50 [mlx5_core] [ 342.607519] mlx5e_close+0x42/0x60 [mlx5_core] [ 342.613005] __dev_close_many+0xb1/0x120 [ 342.617911] dev_close_many+0xa2/0x170 [ 342.622622] rollback_registered_many+0x148/0x460 [ 342.628401] ? __lock_acquire+0x48d/0x11b0 [ 342.633498] ? 
unregister_netdev+0xe/0x20 [ 342.638495] rollback_registered+0x56/0x90 [ 342.643588] unregister_netdevice_queue+0x7e/0x100 [ 342.649461] unregister_netdev+0x18/0x20 [ 342.654362] mlx5e_remove+0x2a/0x50 [mlx5_core] [ 342.659944] mlx5_remove_device+0xe5/0x110 [mlx5_core] [ 342.666208] mlx5_unregister_interface+0x39/0x90 [mlx5_core] [ 342.673038] cleanup+0x5/0xbfc [mlx5_core] [ 342.678094] __x64_sys_delete_module+0x16b/0x240 [ 342.683725] ? do_syscall_64+0x1c/0x210 [ 342.688476] do_syscall_64+0x5a/0x210 [ 342.693025] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 8d5d88527587 ("xdp: rhashtable with allocator ID to pointer mapping") Signed-off-by: Tariq Toukan Suggested-by: Daniel Borkmann Cc: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/xdp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/xdp.c b/net/core/xdp.c index 3dd99e1c04f5..89b6785cef2a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -98,23 +98,15 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; int id = xdp_rxq->mem.id; - int err; if (id == 0) return; mutex_lock(&mem_id_lock); - xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; - } - - err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); - WARN_ON(err); - - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); mutex_unlock(&mem_id_lock); } -- cgit v1.2.3-58-ga151 From 037b0b86ecf5646f8eae777d8b52ff8b401692ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:06 +0200 Subject: tcp, ulp: add alias for all ulp modules Let's not turn the TCP ULP lookup into an arbitrary module loader as we only 
intend to load ULP modules through this mechanism, not other unrelated kernel modules: [root@bar]# cat foo.c #include <sys/types.h> #include <sys/socket.h> #include <linux/tcp.h> #include <linux/in.h> int main(void) { int sock = socket(PF_INET, SOCK_STREAM, 0); setsockopt(sock, IPPROTO_TCP, TCP_ULP, "sctp", sizeof("sctp")); return 0; } [root@bar]# gcc foo.c -O2 -Wall [root@bar]# lsmod | grep sctp [root@bar]# ./a.out [root@bar]# lsmod | grep sctp sctp 1077248 4 libcrc32c 16384 3 nf_conntrack,nf_nat,sctp [root@bar]# Fix it by adding module alias to TCP ULP modules, so probing module via request_module() will be limited to tcp-ulp-[name]. The existing modules like kTLS will load fine given tcp-ulp-tls alias, but others will fail to load: [root@bar]# lsmod | grep sctp [root@bar]# ./a.out [root@bar]# lsmod | grep sctp [root@bar]# Sockmap is not affected by this since it's either built-in or not. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 ++++ net/ipv4/tcp_ulp.c | 2 +- net/tls/tls_main.c | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d196901c9dba..770917d0caa7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2065,6 +2065,10 @@ int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +#define MODULE_ALIAS_TCP_ULP(name) \ + __MODULE_INFO(alias, alias_userspace, name); \ + __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) + /* Call BPF_SOCK_OPS program that returns an int. 
If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 622caa4039e0..7dd44b6156c7 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -51,7 +51,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); - request_module("%s", name); + request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b09867c8b817..93c0c225ab34 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,6 +45,7 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_TCP_ULP("tls"); enum { TLSV4, -- cgit v1.2.3-58-ga151 From 90545cdc3f2b2ea700e24335610cd181e73756da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:07 +0200 Subject: tcp, ulp: fix leftover icsk_ulp_ops preventing sock from reattach I found that in BPF sockmap programs once we either delete a socket from the map or we updated a map slot and the old socket was purged from the map that these sockets can never get reattached into a map even though their related psock has been dropped entirely at that point. Reason is that tcp_cleanup_ulp() leaves the old icsk->icsk_ulp_ops intact, so that on the next tcp_set_ulp_id() the kernel returns an -EEXIST thinking there is still some active ULP attached. BPF sockmap is the only one that has this issue as the other user, kTLS, only calls tcp_cleanup_ulp() from tcp_v4_destroy_sock() whereas sockmap semantics allow dropping the socket from the map with all related psock state being cleaned up. 
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/ipv4/tcp_ulp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 7dd44b6156c7..a5995bb2eaca 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -129,6 +129,8 @@ void tcp_cleanup_ulp(struct sock *sk) if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); + + icsk->icsk_ulp_ops = NULL; } /* Change upper layer protocol for socket */ -- cgit v1.2.3-58-ga151 From f6069b9aa9934ede26f41ac0781fce241279ad43 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 17 Aug 2018 23:26:14 +0200 Subject: bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. 
A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use its map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +- include/trace/events/xdp.h | 5 ++-- kernel/bpf/cpumap.c | 2 ++ kernel/bpf/devmap.c | 1 + kernel/bpf/verifier.c | 21 -------------- kernel/bpf/xskmap.c | 1 + net/core/filter.c | 68 ++++++++++++++++++++-------------------------- 7 files changed, 38 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 5d565c50bcb2..6791a0ac0139 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,7 +543,6 @@ struct bpf_redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - unsigned long map_owner; u32 kern_flags; }; @@ -781,6 +780,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_clear_redirect_map(struct bpf_map *map); + static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1ecf4c67fcf7..e95cb86b65cf 100644 --- a/include/trace/events/xdp.h +++ 
b/include/trace/events/xdp.h @@ -147,9 +147,8 @@ struct _bpf_dtab_netdev { #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ - (!map ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 620bc5024d7d..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ac1df79f3788..141710b82a6c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca90679a7fe5..92246117d2b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5844,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. 
- */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4ddf61e158f6..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { diff --git a/net/core/filter.c b/net/core/filter.c index fd423ce3da34..c25eb36f1320 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3246,31 +3246,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) } } -static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, - unsigned long aux) +void bpf_clear_redirect_map(struct bpf_map *map) { - return (unsigned long)xdp_prog->aux != aux; + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. 
+ */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; - - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } + WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { @@ -3296,12 +3298,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); struct net_device *fwd; u32 index = ri->ifindex; int err; - if (ri->map) - return xdp_do_redirect_map(dev, xdp, xdp_prog); + if (map) + return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; @@ -3325,24 +3328,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, + struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err = 0; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3379,13 +3375,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, 
struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; - if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); - + if (map) + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, + map); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { @@ -3416,8 +3413,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; } @@ -3430,8 +3426,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - unsigned long, map_owner) +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, + u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3440,15 +3436,11 @@ BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; - ri->map = map; - ri->map_owner = map_owner; + WRITE_ONCE(ri->map, map); return XDP_REDIRECT; } -/* Note, arg4 is hidden from users and populated by the verifier - * with the right pointer. - */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit v1.2.3-58-ga151