From 6596a0229541270fb8d38d989f91b78838e5e9da Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Jan 2022 10:22:53 +0100 Subject: xfrm: fix MTU regression Commit 749439bfac6e1a2932c582e2699f91d329658196 ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") breaks PMTU for xfrm. A Packet Too Big ICMPv6 message received in response to an ESP packet will prevent all further communication through the tunnel if the reported MTU minus the ESP overhead is smaller than 1280. E.g. in a case of a tunnel-mode ESP with sha256/aes the overhead is 92 bytes. Receiving a PTB with MTU of 1371 or less will result in all further packets in the tunnel dropped. A ping through the tunnel fails with "ping: sendmsg: Invalid argument". Apparently the MTU on the xfrm route is smaller than 1280 and fails the check inside ip6_setup_cork() added by 749439bf. We found this by debugging USGv6/ipv6ready failures. Failing tests are: "Phase-2 Interoperability Test Scenario IPsec" / 5.3.11 and 5.4.11 (Tunnel Mode: Fragmentation). Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") attempted to fix this but caused another regression in TCP MSS calculations and had to be reverted. The patch below fixes the situation by dropping the MTU check and instead checking for the underflows described in the 749439bf commit message. Signed-off-by: Jiri Bohac Fixes: 749439bfac6e ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..4ff110214dea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1408,8 +1408,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1471,8 +1469,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + @@ -1480,6 +1476,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ -- cgit v1.2.3-58-ga151 From c1aca3080e382886e2e58e809787441984a2f89b Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: xfrm: Check if_id in xfrm_migrate This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. 
Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with same selector, direction, type and net, "Stage 1" might rertun a wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find a xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2 since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 14 ++++++++------ net/xfrm/xfrm_state.c | 7 ++++++- net/xfrm/xfrm_user.c | 6 +++++- 5 files changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..743dd1da506e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,14 +1681,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index de24a7d474df..9bf52a09b5ff 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2623,7 +2623,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL); + kma ? 
&k : NULL, net, NULL, 0); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 04d1ce9b510f..882526159d3a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4256,7 +4256,7 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type, struct net *net) + u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; @@ -4265,7 +4265,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; @@ -4277,7 +4278,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if ((pol->priority >= priority) && ret) break; - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; @@ -4393,7 +4395,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4412,14 +4414,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp, net))) { + if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ca6bee18346d..b0eeb0aef493 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1606,7 +1606,8 @@ out: return NULL; } -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; @@ -1622,6 +1623,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n continue; if (m->reqid && x->props.reqid != m->reqid) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1637,6 +1640,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (x->props.mode != m->mode || x->id.proto != m->proto) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8cd6c8129004..a4fb596e87af 100644 --- a/net/xfrm/xfrm_user.c +++ 
b/net/xfrm/xfrm_user.c @@ -2608,6 +2608,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + u32 if_id = 0; if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2632,7 +2633,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOMEM; } - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); kfree(encap); -- cgit v1.2.3-58-ga151 From e03c3bba351f99ad932e8f06baa9da1afc418e02 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:14 -0800 Subject: xfrm: Fix xfrm migrate issues when address family changes xfrm_migrate cannot handle address family change of an xfrm_state. The symptons are the xfrm_state will be migrated to a wrong address, and sending as well as receiving packets wil be broken. This commit fixes it by breaking the original xfrm_state_clone method into two steps so as to update the props.family before running xfrm_init_state. As the result, xfrm_state's inner mode, outer mode, type and IP header length in xfrm_state_migrate can be updated with the new address family. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1885354 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b0eeb0aef493..1ba6fbfe8cdb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1579,9 +1579,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1668,6 +1665,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); -- cgit v1.2.3-58-ga151 From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. 
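To put numbers on the MSS regression, here is a minimal user-space sketch of the two formulas; the 92-byte ESP overhead is borrowed from the tunnel-mode example in the MTU-regression patch above and is purely illustrative (real transport-mode overhead depends on the algorithms in use):

  /* mss_demo.c - compare the intended and the clamped MSS formula for an
   * IPv6 TCP flow protected by ESP.  Build: cc -o mss_demo mss_demo.c
   */
  #include <stdio.h>

  int main(void)
  {
          unsigned int pmtu = 1280;       /* path MTU at the IPv6 minimum */
          unsigned int esp_overhead = 92; /* illustrative value only */
          unsigned int ip6_hdr = 40, tcp_hdr = 20;

          /* intended: MSS = (MTU - ESP_overhead) - IP header - TCP header */
          unsigned int mss_ok = pmtu - esp_overhead - ip6_hdr - tcp_hdr;

          /* reverted clamp: MSS = max(MTU - ESP_overhead, 1280) - headers */
          unsigned int inner = pmtu - esp_overhead;
          unsigned int mss_bad = (inner < 1280 ? 1280 : inner) - ip6_hdr - tcp_hdr;

          printf("intended MSS %u -> %u bytes on the wire (fits PMTU %u)\n",
                 mss_ok, mss_ok + tcp_hdr + ip6_hdr + esp_overhead, pmtu);
          printf("clamped  MSS %u -> %u bytes on the wire (exceeds PMTU %u)\n",
                 mss_bad, mss_bad + tcp_hdr + ip6_hdr + esp_overhead, pmtu);
          return 0;
  }

With a PMTU of 1280 the intended formula yields 1128-byte segments that fit exactly, while the clamped formula yields 1220-byte segments that become 1372-byte ESP packets and never reach the destination.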
The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++------------ 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..e1b1d080e908 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -671,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bb2c407b46b..7591160edce1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -707,7 +707,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1ba6fbfe8cdb..b749935152ba 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2579,7 +2579,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2610,17 +2610,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit v1.2.3-58-ga151 From 6d0d95a1c2b07270870e7be16575c513c29af3f1 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Tue, 1 Feb 2022 07:51:57 +0100 Subject: xfrm: fix the if_id check in changelink if_id will be always 0, because it was not yet initialized. 
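The ordering problem is small but easy to miss; a minimal stand-alone sketch, where parse_parms() and struct if_parms are illustrative stand-ins for xfrmi_netlink_parms() and struct xfrm_if_parms:

  /* changelink_order.c - why the pre-fix if_id check always failed.
   * Build: cc -o changelink_order changelink_order.c
   */
  #include <stdio.h>

  struct if_parms { unsigned int if_id; };

  static void parse_parms(struct if_parms *p) { p->if_id = 42; }

  int main(void)
  {
          struct if_parms p = { 0 };

          /* pre-fix order: validate before parsing, p.if_id is still 0 */
          printf("check before parse: if_id=%u -> %s\n",
                 p.if_id, !p.if_id ? "rejected (-EINVAL)" : "accepted");

          /* post-fix order: parse first, then validate what was parsed */
          parse_parms(&p);
          printf("check after parse:  if_id=%u -> %s\n",
                 p.if_id, !p.if_id ? "rejected (-EINVAL)" : "accepted");
          return 0;
  }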
Fixes: 8dce43919566 ("xfrm: interface with if_id 0 should return error") Reported-by: Pavel Machek Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 57448fc519fc..4e3c62d1ad9e 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -673,12 +673,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); -- cgit v1.2.3-58-ga151 From 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: xfrm: enforce validity of offload input flags struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3fa066419d37..39bce5d764de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -262,7 +265,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { -- cgit v1.2.3-58-ga151 From 610d086d6df0b15c3732a7b4a5b0f1c3e1b84d4c Mon Sep 17 00:00:00 2001 From: Deren Wu Date: Sun, 13 Feb 2022 00:20:15 +0800 Subject: mac80211: fix EAPoL rekey fail in 802.3 rx path mac80211 set capability NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211 to upper layer by default. That means we should pass EAPoL packets through nl80211 path only, and should not send the EAPoL skb to netdevice diretly. 
At the meanwhile, wpa_supplicant would not register sock to listen EAPoL skb on the netdevice. However, there is no control_port_protocol handler in mac80211 for 802.3 RX packets, mac80211 driver would pass up the EAPoL rekey frame to netdevice and wpa_supplicant would be never interactive with this kind of packets, if SUPPORTS_RX_DECAP_OFFLOAD is enabled. This causes STA always rekey fail if EAPoL frame go through 802.3 path. To avoid this problem, align the same process as 802.11 type to handle this frame before put it into network stack. This also addresses a potential security issue in 802.3 RX mode that was previously fixed in commit a8c4d76a8dd4 ("mac80211: do not accept/forward invalid EAPOL frames"). Cc: stable@vger.kernel.org # 5.12+ Fixes: 80a915ec4427 ("mac80211: add rx decapsulation offload support") Signed-off-by: Deren Wu Link: https://lore.kernel.org/r/6889c9fced5859ebb088564035f84fd0fa792a49.1644680751.git.deren.wu@mediatek.com [fix typos, update comment and add note about security issue] Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 93680af62c47..7b699b2f4d89 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2607,7 +2607,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, * address, so that the authenticator (e.g. hostapd) will see * the frame, but bridge won't forward it anywhere else. Note * that due to earlier filtering, the only other address can - * be the PAE group address. + * be the PAE group address, unless the hardware allowed them + * through in 802.3 offloaded mode. */ if (unlikely(skb->protocol == sdata->control_port_protocol && !ether_addr_equal(ehdr->h_dest, sdata->vif.addr))) @@ -4514,12 +4515,7 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); - + ieee80211_deliver_skb_to_local_stack(skb, rx); } static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, -- cgit v1.2.3-58-ga151 From a6bce78262f5dd4b50510f0aa47f3995f7b185f3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 3 Feb 2022 20:15:29 +0100 Subject: mac80211: refuse aggregations sessions before authorized If an MFP station isn't authorized, the receiver will (or at least should) drop the action frame since it's a robust management frame, but if we're not authorized we haven't installed keys yet. Refuse attempts to start a session as they'd just time out. 
Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220203201528.ff4d5679dce9.I34bb1f2bc341e161af2d6faf74f91b332ba11285@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 74a878f213d3..1deb3d874a4b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include @@ -626,6 +626,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; } + if (test_sta_flag(sta, WLAN_STA_MFP) && + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { + ht_dbg(sdata, + "MFP STA not authorized - deny BA session request %pM tid %d\n", + sta->sta.addr, tid); + return -EINVAL; + } + /* * 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a * member of an IBSS, and has no other existing Block Ack agreement -- cgit v1.2.3-58-ga151 From 859ae7018316daa4adbc496012dcbbb458d7e510 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Mon, 14 Feb 2022 18:32:14 +0100 Subject: mac80211: fix forwarded mesh frames AC & queue selection There are two problems with the current code that have been highlighted with the AQL feature that is now enbaled by default. First problem is in ieee80211_rx_h_mesh_fwding(), ieee80211_select_queue_80211() is used on received packets to choose the sending AC queue of the forwarding packet although this function should only be called on TX packet (it uses ieee80211_tx_info). This ends with forwarded mesh packets been sent on unrelated random AC queue. To fix that, AC queue can directly be infered from skb->priority which has been extracted from QOS info (see ieee80211_parse_qos()). Second problem is the value of queue_mapping set on forwarded mesh frames via skb_set_queue_mapping() is not the AC of the packet but a hardware queue index. This may or may not work depending on AC to HW queue mapping which is driver specific. Both of these issues lead to improper AC selection while forwarding mesh packets but more importantly due to improper airtime accounting (which is done on a per STA, per AC basis) caused traffic stall with the introduction of AQL. 
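For reference, skb->priority is the 802.1D user priority filled in by ieee80211_parse_qos(), and ieee802_1d_to_ac[] is the standard priority-to-access-category table exported by cfg80211; a self-contained sketch of that mapping, with the table contents reproduced here only for illustration:

  /* up_to_ac.c - the 802.1D user-priority (0-7) to access-category mapping
   * behind "ac = ieee802_1d_to_ac[skb->priority]" in the fix below.
   * Build: cc -o up_to_ac up_to_ac.c
   */
  #include <stdio.h>

  enum { AC_VO = 0, AC_VI = 1, AC_BE = 2, AC_BK = 3 };

  static const int up_to_ac[8] = {
          AC_BE,  /* UP 0: best effort */
          AC_BK,  /* UP 1: background  */
          AC_BK,  /* UP 2: background  */
          AC_BE,  /* UP 3: best effort */
          AC_VI,  /* UP 4: video       */
          AC_VI,  /* UP 5: video       */
          AC_VO,  /* UP 6: voice       */
          AC_VO,  /* UP 7: voice       */
  };

  int main(void)
  {
          static const char *names[] = { "VO", "VI", "BE", "BK" };
          int up;

          for (up = 0; up < 8; up++)
                  printf("priority %d -> AC_%s\n", up, names[up_to_ac[up]]);
          return 0;
  }

The fix below then stores this AC itself, rather than the hardware queue index from sdata->vif.hw_queue[ac], in the skb queue mapping, so per-STA/per-AC airtime accounting sees the right class.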
Fixes: cf44012810cc ("mac80211: fix unnecessary frame drops in mesh fwding") Fixes: d3c1597b8d1b ("mac80211: fix forwarded mesh frame queue mapping") Co-developed-by: Remi Pommarel Signed-off-by: Remi Pommarel Signed-off-by: Nicolas Escande Link: https://lore.kernel.org/r/20220214173214.368862-1-nico.escande@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7b699b2f4d89..48d9553dafe3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2923,13 +2923,13 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) ether_addr_equal(sdata->vif.addr, hdr->addr3)) return RX_CONTINUE; - ac = ieee80211_select_queue_80211(sdata, skb, hdr); + ac = ieee802_1d_to_ac[skb->priority]; q = sdata->vif.hw_queue[ac]; if (ieee80211_queue_stopped(&local->hw, q)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); return RX_DROP_MONITOR; } - skb_set_queue_mapping(skb, q); + skb_set_queue_mapping(skb, ac); if (!--mesh_hdr->ttl) { if (!is_multicast_ether_addr(hdr->addr1)) -- cgit v1.2.3-58-ga151 From ae089831ff28a115908b8d796f667c2dadef1637 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Feb 2022 10:13:31 -0800 Subject: netfilter: nf_tables: prefer kfree_rcu(ptr, rcu) variant While kfree_rcu(ptr) _is_ supported, it has some limitations. Given that 99.99% of kfree_rcu() users [1] use the legacy two parameters variant, and @catchall objects do have an rcu head, simply use it. Choice of kfree_rcu(ptr) variant was probably not intentional. [1] including calls from net/netfilter/nf_tables_api.c Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Eric Dumazet Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9cd1d7a62804..c86748b3873b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4502,7 +4502,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nft_set_elem_destroy(set, catchall->elem, true); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); } } @@ -5669,7 +5669,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem->priv) { list_del_rcu(&catchall->list); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); break; } } -- cgit v1.2.3-58-ga151 From dd3b1dc3dd050f1f47cd13e300732852414270f8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Feb 2022 13:12:35 -0800 Subject: Bluetooth: hci_core: Fix leaking sent_cmd skb sent_cmd memory is not freed before freeing hci_dev causing it to leak it contents. 
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2b7bd3655b07..2882bc7d79d7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2738,6 +2738,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree_skb(hdev->sent_cmd); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); -- cgit v1.2.3-58-ga151 From fa78d2d1d64f147062e384a4a10a26a5f89944b5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Feb 2022 12:37:14 +0800 Subject: Bluetooth: fix data races in smp_unregister(), smp_del_chan() Previous commit e04480920d1e ("Bluetooth: defer cleanup of resources in hci_unregister_dev()") defers all destructive actions to hci_release_dev() to prevent cocurrent problems like NPD, UAF. However, there are still some exceptions that are ignored. The smp_unregister() in hci_dev_close_sync() (previously in hci_dev_do_close) will release resources like the sensitive channel and the smp_dev objects. Consider the situations the device is detaching or power down while the kernel is still operating on it, the following data race could take place. thread-A hci_dev_close_sync | thread-B read_local_oob_ext_data | hci_dev_unlock() | ... | hci_dev_lock() if (hdev->smp_data) | chan = hdev->smp_data | | chan = hdev->smp_data (3) | hdev->smp_data = NULL (1) | if (!chan || !chan->data) (4) ... | smp = chan->data | smp = chan->data if (smp) | chan->data = NULL (2) | ... | kfree_sensitive(smp) | | // dereference smp trigger UFA That is, the objects hdev->smp_data and chan->data both suffer from the data races. In a preempt-enable kernel, the above schedule (when (3) is before (1) and (4) is before (2)) leads to UAF bugs. It can be reproduced in the latest kernel and below is part of the report: [ 49.097146] ================================================================ [ 49.097611] BUG: KASAN: use-after-free in smp_generate_oob+0x2dd/0x570 [ 49.097611] Read of size 8 at addr ffff888006528360 by task generate_oob/155 [ 49.097611] [ 49.097611] Call Trace: [ 49.097611] [ 49.097611] dump_stack_lvl+0x34/0x44 [ 49.097611] print_address_description.constprop.0+0x1f/0x150 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] kasan_report.cold+0x7f/0x11b [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] smp_generate_oob+0x2dd/0x570 [ 49.097611] read_local_oob_ext_data+0x689/0xc30 [ 49.097611] ? hci_event_packet+0xc80/0xc80 [ 49.097611] ? sysvec_apic_timer_interrupt+0x9b/0xc0 [ 49.097611] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 49.097611] ? mgmt_init_hdev+0x1c/0x240 [ 49.097611] ? mgmt_init_hdev+0x28/0x240 [ 49.097611] hci_sock_sendmsg+0x1880/0x1e70 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] sock_sendmsg+0xdf/0x110 [ 49.097611] __sys_sendto+0x19e/0x270 [ 49.097611] ? __ia32_sys_getpeername+0xa0/0xa0 [ 49.097611] ? kernel_fpu_begin_mask+0x1c0/0x1c0 [ 49.097611] __x64_sys_sendto+0xd8/0x1b0 [ 49.097611] ? syscall_exit_to_user_mode+0x1d/0x40 [ 49.097611] do_syscall_64+0x3b/0x90 [ 49.097611] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 49.097611] RIP: 0033:0x7f5a59f51f64 ... 
[ 49.097611] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5a59f51f64 [ 49.097611] RDX: 0000000000000007 RSI: 00007f5a59d6ac70 RDI: 0000000000000006 [ 49.097611] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 49.097611] R10: 0000000000000040 R11: 0000000000000246 R12: 00007ffec26916ee [ 49.097611] R13: 00007ffec26916ef R14: 00007f5a59d6afc0 R15: 00007f5a59d6b700 To solve these data races, this patch places the smp_unregister() function in the protected area by the hci_dev_lock(). That is, the smp_unregister() function can not be concurrently executed when operating functions (most of them are mgmt operations in mgmt.c) hold the device lock. This patch is tested with kernel LOCK DEBUGGING enabled. The price from the extended holding time of the device lock is supposed to be low as the smp_unregister() function is fairly short and efficient. Signed-off-by: Lin Ma Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0feb68f12545..e34fc15b7d2c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4106,9 +4106,9 @@ int hci_dev_close_sync(struct hci_dev *hdev) hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - + /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); + hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); -- cgit v1.2.3-58-ga151 From 2e8ecb4bbc13d4752d64a9f8f5512d59125cab25 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 14 Feb 2022 18:01:56 -0800 Subject: Bluetooth: assign len after null check len should be assigned after a null check Signed-off-by: Wang Qing Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index edee60bbc7b4..37eef2ce55ae 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -77,11 +77,12 @@ int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, { struct hci_dev *hdev; struct mgmt_hdr *hdr; - int len = skb->len; + int len; if (!skb) return -EINVAL; + len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ -- cgit v1.2.3-58-ga151 From 80740ebb7e1ad15ab9c11425dcd26e073f86d74b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Feb 2022 07:11:47 -0800 Subject: Bluetooth: hci_sync: Fix hci_update_accept_list_sync hci_update_accept_list_sync is returning the filter based on the error but that gets overwritten by hci_le_set_addr_resolution_enable_sync return instead of using the actual result of the likes of hci_le_add_accept_list_sync which was intended. 
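The pattern being fixed is a return value recomputed from a variable that a later, unrelated call has already clobbered; a minimal sketch with stand-in functions (add_accept_list_entry() and set_addr_resolution() are illustrative, not the hci_sync API):

  /* latch_result.c
   * Build: cc -o latch_result latch_result.c
   */
  #include <stdio.h>

  static int add_accept_list_entry(void) { return -22; } /* fails, -EINVAL */
  static int set_addr_resolution(void)   { return 0; }   /* succeeds */

  static unsigned char filter_policy_buggy(void)
  {
          int err = add_accept_list_entry();

          err = set_addr_resolution();  /* clobbers the result we care about */
          return err ? 0x00 : 0x01;     /* wrongly reports accept list usable */
  }

  static unsigned char filter_policy_fixed(void)
  {
          int err = add_accept_list_entry();
          unsigned char filter_policy = err ? 0x00 : 0x01;  /* latch it first */

          err = set_addr_resolution();  /* no longer affects the return value */
          (void)err;
          return filter_policy;
  }

  int main(void)
  {
          printf("buggy: 0x%02x, fixed: 0x%02x\n",
                 filter_policy_buggy(), filter_policy_fixed());
          return 0;
  }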
Fixes: ad383c2c65a5b ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e34fc15b7d2c..9d8490570b42 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1841,6 +1841,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; + u8 filter_policy; int err; /* Pause advertising if resolving list can be used as controllers are @@ -1927,6 +1928,8 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) err = -EINVAL; done: + filter_policy = err ? 0x00 : 0x01; + /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) @@ -1937,7 +1940,7 @@ done: hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ - return err ? 0x00 : 0x01; + return filter_policy; } /* Returns true if an le connection is in the scanning state */ -- cgit v1.2.3-58-ga151 From a56a1138cbd85e4d565356199d60e1cb94e5a77a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 17 Feb 2022 13:10:38 -0800 Subject: Bluetooth: hci_sync: Fix not using conn_timeout When using hci_le_create_conn_sync it shall wait for the conn_timeout since the connection complete may take longer than just 2 seconds. Also fix the masking of HCI_EV_LE_ENHANCED_CONN_COMPLETE and HCI_EV_LE_CONN_COMPLETE so they are never both set so we can predict which one the controller will use in case of HCI_OP_LE_CREATE_CONN. Fixes: 6cd29ec6ae5e3 ("Bluetooth: hci_sync: Wait for proper events when connecting LE") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++++++++ net/bluetooth/hci_sync.c | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..e336e9c1dda4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1489,6 +1489,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9d8490570b42..9ba2a1a7d481 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3265,10 +3265,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ - /* If the controller supports LL Privacy feature, enable - * the corresponding event. + /* If the controller supports LL Privacy feature or LE Extended Adv, + * enable the corresponding event. 
*/ - if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter @@ -5188,7 +5188,7 @@ int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + conn->conn_timeout, NULL); } int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -5273,9 +5273,18 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: + * + * If this event is unmasked and the HCI_LE_Connection_Complete event + * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is + * sent when a new connection has been created. + */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, - sizeof(cp), &cp, HCI_EV_LE_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + sizeof(cp), &cp, + use_enhanced_conn_complete(hdev) ? + HCI_EV_LE_ENHANCED_CONN_COMPLETE : + HCI_EV_LE_CONN_COMPLETE, + conn->conn_timeout, NULL); done: /* Re-enable advertising after the connection attempt is finished. */ -- cgit v1.2.3-58-ga151 From 07c2c7a3b622e109ba4d2efd916da0477617ce81 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:57 -0800 Subject: mptcp: accurate SIOCOUTQ for fallback socket The MPTCP SIOCOUTQ implementation is not very accurate in case of fallback: it only measures the data in the MPTCP-level write queue, but it does not take in account the subflow write queue utilization. In case of fallback the first can be empty, while the latter is not. The above produces sporadic self-tests issues and can foul legit user-space application. Fix the issue additionally querying the subflow in case of fallback. Fixes: 644807e3e462 ("mptcp: add SIOCINQ, OUTQ and OUTQNSD ioctls") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/260 Reported-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f60f01b14fac..12bb28c5007e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3294,6 +3294,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return 0; delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. 
+ */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } if (delta > INT_MAX) delta = INT_MAX; -- cgit v1.2.3-58-ga151 From 877d11f0332cd2160e19e3313e262754c321fa36 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 24 Feb 2022 16:52:59 -0800 Subject: mptcp: Correctly set DATA_FIN timeout when number of retransmits is large Syzkaller with UBSAN uncovered a scenario where a large number of DATA_FIN retransmits caused a shift-out-of-bounds in the DATA_FIN timeout calculation: ================================================================================ UBSAN: shift-out-of-bounds in net/mptcp/protocol.c:470:29 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 1 PID: 13059 Comm: kworker/1:0 Not tainted 5.17.0-rc2-00630-g5fbf21c90c60 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: events mptcp_worker Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds.cold+0xb2/0x20e lib/ubsan.c:330 mptcp_set_datafin_timeout net/mptcp/protocol.c:470 [inline] __mptcp_retrans.cold+0x72/0x77 net/mptcp/protocol.c:2445 mptcp_worker+0x58a/0xa70 net/mptcp/protocol.c:2528 process_one_work+0x9df/0x16d0 kernel/workqueue.c:2307 worker_thread+0x95/0xe10 kernel/workqueue.c:2454 kthread+0x2f4/0x3b0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ================================================================================ This change limits the maximum timeout by limiting the size of the shift, which keeps all intermediate values in-bounds. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/259 Fixes: 6477dd39e62c ("mptcp: Retransmit DATA_FIN") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 12bb28c5007e..1c72f25f083e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,9 +466,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); + + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) -- cgit v1.2.3-58-ga151 From 9f1c50cf39167ff71dc5953a3234f3f6eeb8fcb5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 24 Feb 2022 23:26:19 +0800 Subject: net/smc: fix connection leak There's a potential leak issue under following execution sequence : smc_release smc_connect_work if (sk->sk_state == SMC_INIT) send_clc_confirim tcp_abort(); ... sk.sk_state = SMC_ACTIVE smc_close_active switch(sk->sk_state) { ... case SMC_ACTIVE: smc_close_final() // then wait peer closed Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are still in the tcp send buffer, in which case our connection token cannot be delivered to the server side, which means that we cannot get a passive close message at all. Therefore, it is impossible for the to be disconnected at all. 
This patch tries a very simple way to avoid this issue, once the state has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the smc connection, considering that the state is SMC_INIT before tcp_abort(), abandoning the complete disconnection process should not cause too much problem. In fact, this problem may exist as long as the CLC CONFIRM message is not received by the server. Whether a timer should be added after smc_close_final() needs to be discussed in the future. But even so, this patch provides a faster release for connection in above case, it should also be valuable. Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets") Signed-off-by: D. Wythe Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 306d9e8cd1dd..81984c1c0e78 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -183,7 +183,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -191,8 +191,10 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) @@ -206,6 +208,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ -- cgit v1.2.3-58-ga151 From 91b0383fef06f20b847fa9e4f0e3054ead0b1a1b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 24 Feb 2022 18:01:54 +0200 Subject: net: dcb: flush lingering app table entries for unregistered devices If I'm not mistaken (and I don't think I am), the way in which the dcbnl_ops work is that drivers call dcb_ieee_setapp() and this populates the application table with dynamically allocated struct dcb_app_type entries that are kept in the module-global dcb_app_list. However, nobody keeps exact track of these entries, and although dcb_ieee_delapp() is supposed to remove them, nobody does so when the interface goes away (example: driver unbinds from device). So the dcb_app_list will contain lingering entries with an ifindex that no longer matches any device in dcb_app_lookup(). Reclaim the lost memory by listening for the NETDEV_UNREGISTER event and flushing the app table entries of interfaces that are now gone. In fact something like this used to be done as part of the initial commit (blamed below), but it was done in dcbnl_exit() -> dcb_flushapp(), essentially at module_exit time. That became dead code after commit 7a6b6f515f77 ("DCB: fix kconfig option") which essentially merged "tristate config DCB" and "bool config DCBNL" into a single "bool config DCB", so net/dcb/dcbnl.c could not be built as a module anymore. Commit 36b9ad8084bd ("net/dcb: make dcbnl.c explicitly non-modular") recognized this and deleted dcbnl_exit() and dcb_flushapp() altogether, leaving us with the version we have today. Since flushing application table entries can and should be done as soon as the netdevice disappears, fundamentally the commit that is to blame is the one that introduced the design of this API. 
Fixes: 9ab933ab2cc8 ("dcbnl: add appliction tlv handlers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index b441ab330fd3..36c91273daac 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2073,8 +2073,52 @@ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); +static void dcbnl_flush_dev(struct net_device *dev) +{ + struct dcb_app_type *itr, *tmp; + + spin_lock(&dcb_lock); + + list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { + if (itr->ifindex == dev->ifindex) { + list_del(&itr->list); + kfree(itr); + } + } + + spin_unlock(&dcb_lock); +} + +static int dcbnl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_UNREGISTER: + if (!dev->dcbnl_ops) + return NOTIFY_DONE; + + dcbnl_flush_dev(dev); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block dcbnl_nb __read_mostly = { + .notifier_call = dcbnl_netdevice_event, +}; + static int __init dcbnl_init(void) { + int err; + + err = register_netdevice_notifier(&dcbnl_nb); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); -- cgit v1.2.3-58-ga151 From 9995b408f17ff8c7f11bc725c8aa225ba3a63b1c Mon Sep 17 00:00:00 2001 From: "j.nixdorf@avm.de" Date: Thu, 24 Feb 2022 10:06:49 +0100 Subject: net: ipv6: ensure we call ipv6_mc_down() at most once There are two reasons for addrconf_notify() to be called with NETDEV_DOWN: either the network device is actually going down, or IPv6 was disabled on the interface. If either of them stays down while the other is toggled, we repeatedly call the code for NETDEV_DOWN, including ipv6_mc_down(), while never calling the corresponding ipv6_mc_up() in between. This will cause a new entry in idev->mc_tomb to be allocated for each multicast group the interface is subscribed to, which in turn leaks one struct ifmcaddr6 per nontrivial multicast group the interface is subscribed to. The following reproducer will leak at least $n objects: ip addr add ff2e::4242/32 dev eth0 autojoin sysctl -w net.ipv6.conf.eth0.disable_ipv6=1 for i in $(seq 1 $n); do ip link set up eth0; ip link set down eth0 done Joining groups with IPV6_ADD_MEMBERSHIP (unprivileged) or setting the sysctl net.ipv6.conf.eth0.forwarding to 1 (=> subscribing to ff02::2) can also be used to create a nontrivial idev->mc_list, which will the leak objects with the right up-down-sequence. Based on both sources for NETDEV_DOWN events the interface IPv6 state should be considered: - not ready if the network interface is not ready OR IPv6 is disabled for it - ready if the network interface is ready AND IPv6 is enabled for it The functions ipv6_mc_up() and ipv6_down() should only be run when this state changes. Implement this by remembering when the IPv6 state is ready, and only run ipv6_mc_down() if it actually changed from ready to not ready. 
The other direction (not ready -> ready) already works correctly, as: - the interface notification triggered codepath for NETDEV_UP / NETDEV_CHANGE returns early if ipv6 is disabled, and - the disable_ipv6=0 triggered codepath skips fully initializing the interface as long as addrconf_link_ready(dev) returns false - calling ipv6_mc_up() repeatedly does not leak anything Fixes: 3ce62a84d53c ("ipv6: exit early in addrconf_notify() if IPv6 is disabled") Signed-off-by: Johannes Nixdorf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8ab3e6e6fe..f908e2fd30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3732,6 +3732,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; bool keep_addr = false; + bool was_ready; int state, i; ASSERT_RTNL(); @@ -3797,7 +3798,10 @@ restart: addrconf_del_rs_timer(idev); - /* Step 2: clear flags for stateless addrconf */ + /* Step 2: clear flags for stateless addrconf, repeated down + * detection + */ + was_ready = idev->if_flags & IF_READY; if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3871,7 +3875,7 @@ restart: if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - } else { + } else if (was_ready) { ipv6_mc_down(idev); } -- cgit v1.2.3-58-ga151 From 4d08b7b57ece83a1c31c633a7e4e27f121157f9c Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 25 Feb 2022 14:56:57 +0800 Subject: net/smc: Fix cleanup when register ULP fails This patch calls smc_ib_unregister_client() when tcp_register_ulp() fails, and make sure to clean it up. Fixes: d7cd421da9da ("net/smc: Introduce TCP ULP support") Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 81984c1c0e78..284befa90967 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3087,12 +3087,14 @@ static int __init smc_init(void) rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_ib; } static_branch_enable(&tcp_have_smc); return 0; +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: -- cgit v1.2.3-58-ga151 From 56763f12b0f02706576a088e85ef856deacc98a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 27 Feb 2022 10:01:41 -0800 Subject: netfilter: fix use-after-free in __nf_register_net_hook() We must not dereference @new_hooks after nf_hook_mutex has been released, because other threads might have freed our allocated hooks already. 
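A loose user-space analogy of the rule being restored (all names here are illustrative; in the kernel the published pointer is the nf_hook_entries array and the validation is hooks_validate()):

  /* publish_then_read.c - once the new entries are published and the lock
   * is dropped, another thread may replace and free them, so any
   * validation must happen before the unlock.
   * Build: cc -o publish_then_read publish_then_read.c -lpthread
   */
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static int *published;

  static void *replacer(void *arg)
  {
          int *old;

          pthread_mutex_lock(&lock);
          old = published;
          published = malloc(sizeof(int));  /* grow/replace the table */
          *published = 2;
          pthread_mutex_unlock(&lock);
          free(old);                        /* the old entries are gone now */
          return NULL;
  }

  int main(void)
  {
          pthread_t t;
          int *mine = malloc(sizeof(int));

          *mine = 1;
          pthread_mutex_lock(&lock);
          published = mine;                 /* rcu_assign_pointer() analogue */
          /* correct: validate "mine" here, while still holding the lock */
          pthread_mutex_unlock(&lock);

          pthread_create(&t, NULL, replacer, NULL);
          sleep(1);                         /* let the replacer run */

          /* buggy: dereferencing "mine" here would be a use-after-free,
           * which is what hooks_validate() after the unlock amounted to */
          pthread_join(t, NULL);
          free(published);
          return 0;
  }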
BUG: KASAN: use-after-free in nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] BUG: KASAN: use-after-free in hooks_validate net/netfilter/core.c:171 [inline] BUG: KASAN: use-after-free in __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 Read of size 2 at addr ffff88801c1a8000 by task syz-executor237/4430 CPU: 1 PID: 4430 Comm: syz-executor237 Not tainted 5.17.0-rc5-syzkaller-00306-g2293be58d6a1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] hooks_validate net/netfilter/core.c:171 [inline] __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 ipv6_setsockopt+0x122/0x180 net/ipv6/ipv6_sockglue.c:1024 rawv6_setsockopt+0xd3/0x6a0 net/ipv6/raw.c:1084 __sys_setsockopt+0x2db/0x610 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f65a1ace7d9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 71 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f65a1a7f308 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f65a1ace7d9 RDX: 0000000000000040 RSI: 0000000000000029 RDI: 0000000000000003 RBP: 00007f65a1b574c8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 00007f65a1b55130 R13: 00007f65a1b574c0 R14: 00007f65a1b24090 R15: 0000000000022000 The buggy address belongs to the page: page:ffffea0000706a00 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1c1a8 flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000000 ffffea0001c1b108 ffffea000046dd08 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 2, migratetype Unmovable, gfp_mask 0x52dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_ZERO), pid 4430, ts 1061781545818, free_ts 1061791488993 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 __alloc_pages_node include/linux/gfp.h:572 [inline] alloc_pages_node 
include/linux/gfp.h:595 [inline] kmalloc_large_node+0x62/0x130 mm/slub.c:4438 __kmalloc_node+0x35a/0x4a0 mm/slub.c:4454 kmalloc_node include/linux/slab.h:604 [inline] kvmalloc_node+0x97/0x100 mm/util.c:580 kvmalloc include/linux/slab.h:731 [inline] kvzalloc include/linux/slab.h:739 [inline] allocate_hook_entries_size net/netfilter/core.c:61 [inline] nf_hook_entries_grow+0x140/0x780 net/netfilter/core.c:128 __nf_register_net_hook+0x144/0x820 net/netfilter/core.c:429 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 kvfree+0x42/0x50 mm/util.c:613 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b1/0x1820 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Memory state around the buggy address: ffff88801c1a7f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a7f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88801c1a8000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88801c1a8080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a8100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 2420b79f8c18 ("netfilter: debug: check for sorted array") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 354cb472f386..8a77a3fd69bc 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -428,14 +428,15 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); -- cgit v1.2.3-58-ga151 From 6ad27f522cb3b210476daf63ce6ddb6568c0508b Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 1 Mar 2022 18:00:20 +0800 Subject: nl80211: Handle nla_memdup failures in handle_nan_filter As there's potential for failure of the nla_memdup(), check the return value. 
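The shape of the fix is the usual partial-allocation unwind. Below is a self-contained sketch of that pattern with toy types and plain malloc() standing in for nla_memdup(); it illustrates the error path, it is not the cfg80211 code.

  #include <stdlib.h>
  #include <string.h>

  struct filter { void *data; size_t len; };

  /* duplicate every source element; on the first failure free every copy
   * made so far plus the array itself, the same unwind idea as the new
   * error label in handle_nan_filter() */
  static int dup_filters(const char *const *src, size_t n, struct filter **out)
  {
      struct filter *f = calloc(n, sizeof(*f));
      size_t i;

      if (!f)
          return -1;

      for (i = 0; i < n; i++) {
          f[i].len = strlen(src[i]) + 1;
          f[i].data = malloc(f[i].len);
          if (!f[i].data)
              goto err;
          memcpy(f[i].data, src[i], f[i].len);
      }
      *out = f;
      return 0;

  err:
      while (i--)
          free(f[i].data);    /* only the copies that were actually made */
      free(f);
      return -1;
  }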
Fixes: a442b761b24b ("cfg80211: add add_nan_func / del_nan_func") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20220301100020.3801187-1-jiasheng@iscas.ac.cn Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 578bff9c378b..b1909ce2b739 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13411,6 +13411,9 @@ static int handle_nan_filter(struct nlattr *attr_filter, i = 0; nla_for_each_nested(attr, attr_filter, rem) { filter[i].filter = nla_memdup(attr, GFP_KERNEL); + if (!filter[i].filter) + goto err; + filter[i].len = nla_len(attr); i++; } @@ -13423,6 +13426,15 @@ static int handle_nan_filter(struct nlattr *attr_filter, } return 0; + +err: + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + kfree(filter[i].filter); + i++; + } + kfree(filter); + return -ENOMEM; } static int nl80211_nan_add_func(struct sk_buff *skb, -- cgit v1.2.3-58-ga151 From 94d9864cc86f572f881db9b842a78e9d075493ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Feb 2022 10:39:34 +0100 Subject: mac80211: treat some SAE auth steps as final When we get anti-clogging token required (added by the commit mentioned below), or the other status codes added by the later commit 4e56cde15f7d ("mac80211: Handle special status codes in SAE commit") we currently just pretend (towards the internal state machine of authentication) that we didn't receive anything. This has the undesirable consequence of retransmitting the prior frame, which is not expected, because the timer is still armed. If we just disarm the timer at that point, it would result in the undesirable side effect of being in this state indefinitely if userspace crashes, or so. So to fix this, reset the timer and set a new auth_data->waiting in order to have no more retransmissions, but to have the data destroyed when the timer actually fires, which will only happen if userspace didn't continue (i.e. crashed or abandoned it.) 
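In rough state-machine form the change looks like the sketch below. The field and constant names follow the patch, everything else is simplified and stubbed; it is not the mac80211 code.

  #include <stdbool.h>
  #include <stddef.h>

  struct auth_data {
      bool done;              /* authenticated, waiting for assoc */
      bool waiting;           /* waiting for a userspace continuation (SAE) */
      bool timeout_started;
      unsigned long timeout;
  };

  #define AUTH_WAIT_SAE_RETRY (2 * 1000)   /* roughly HZ * 2 */

  /* anti-clogging token / H2E / PK status: userspace must send the next
   * commit, so stop retransmitting but keep a timer for cleanup */
  static void sae_needs_userspace(struct auth_data *ad, unsigned long now)
  {
      ad->waiting = true;
      ad->timeout = now + AUTH_WAIT_SAE_RETRY;
      ad->timeout_started = true;
  }

  /* timeout handler: returns true when the auth data should be destroyed */
  static bool auth_timeout(struct auth_data *ad, unsigned long now,
                           bool (*retransmit)(void))
  {
      if (!ad->timeout_started || now < ad->timeout)
          return false;
      if (ad->done || ad->waiting)
          return true;          /* userspace never followed up */
      return !retransmit();     /* retransmission failed: give up */
  }

  int main(void)
  {
      struct auth_data ad = { 0 };

      sae_needs_userspace(&ad, 0);
      return auth_timeout(&ad, AUTH_WAIT_SAE_RETRY + 1, NULL) ? 0 : 1;
  }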
Fixes: a4055e74a2ff ("mac80211: Don't destroy auth data in case of anti-clogging") Reported-by: Jouni Malinen Link: https://lore.kernel.org/r/20220224103932.75964e1d7932.Ia487f91556f29daae734bf61f8181404642e1eec@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 330ea62231fa..e87bccaab561 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -376,7 +376,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; - bool done; + bool done, waiting; bool peer_confirmed; bool timeout_started; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e5ccf17618ab..744842c4513b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -37,6 +37,7 @@ #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) +#define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) @@ -3011,8 +3012,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || - status_code == WLAN_STATUS_SAE_PK)))) + status_code == WLAN_STATUS_SAE_PK)))) { + /* waiting for userspace now */ + ifmgd->auth_data->waiting = true; + ifmgd->auth_data->timeout = + jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; + ifmgd->auth_data->timeout_started = true; + run_again(sdata, ifmgd->auth_data->timeout); goto notify_driver; + } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -4603,10 +4611,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { - if (ifmgd->auth_data->done) { + if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* - * ok ... we waited for assoc but userspace didn't, - * so let's just kill the auth data + * ok ... we waited for assoc or continuation but + * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { -- cgit v1.2.3-58-ga151 From 747670fd9a2d1b7774030dba65ca022ba442ce71 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Feb 2022 14:02:41 +0100 Subject: netfilter: nf_queue: don't assume sk is full socket There is no guarantee that state->sk refers to a full socket. If refcount transitions to 0, sock_put calls sk_free which then ends up with garbage fields. I'd like to thank Oleksandr Natalenko and Jiri Benc for considerable debug work and pointing out state->sk oddities. 
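The idea behind the new nf_queue_sock_put() helper can be shown with a toy refcount/teardown sketch; the types are made up, and in the kernel the state-aware release is sock_gen_put(), which picks the teardown based on sk_state instead of assuming a full socket.

  #include <stdlib.h>

  enum sk_kind { SK_FULL, SK_REQUEST, SK_TIMEWAIT };

  struct mini_sock {              /* header shared by every socket kind */
      enum sk_kind kind;
      int refcnt;
  };

  struct full_sock {              /* only full sockets carry this state */
      struct mini_sock common;
      void *rx_queue;
  };

  static void put_full(struct mini_sock *s)
  {
      struct full_sock *fs = (struct full_sock *)s;

      free(fs->rx_queue);         /* on a mini socket this would be garbage */
      free(fs);
  }

  /* analogue of sock_gen_put(): look at the socket kind before choosing a
   * destructor, instead of assuming a full socket like plain sock_put() */
  static void gen_put(struct mini_sock *s)
  {
      if (--s->refcnt)
          return;
      if (s->kind == SK_FULL)
          put_full(s);
      else
          free(s);                /* request/timewait: minimal teardown */
  }

  int main(void)
  {
      struct mini_sock *req = calloc(1, sizeof(*req));

      if (!req)
          return 1;
      req->kind = SK_REQUEST;
      req->refcnt = 1;
      gen_put(req);               /* put_full() here would read past the object */
      return 0;
  }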
Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Tested-by: Oleksandr Natalenko Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 6d12afabfe8a..5ab0680db445 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -54,7 +63,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->in); dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); -- cgit v1.2.3-58-ga151 From c3873070247d9e3c7a6b0cf9bf9b45e8018427b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: netfilter: nf_queue: fix possible use-after-free Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 13 +++++++++---- net/netfilter/nfnetlink_queue.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5ab0680db445..e39549c55945 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + dev_hold(state->in); dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -196,7 +198,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + 
kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ea2d9c2a44cf..64a6acb6aeae 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -710,9 +710,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -- cgit v1.2.3-58-ga151 From 3b836da4081fa585cf6c392f62557496f2cb0efe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Mar 2022 00:46:19 +0100 Subject: netfilter: nf_queue: handle socket prefetch In case someone combines bpf socket assign and nf_queue, then we will queue an skb who references a struct sock that did not have its reference count incremented. As we leave rcu protection, there is no guarantee that skb->sk is still valid. For refcount-less skb->sk case, try to increment the reference count and then override the destructor. In case of failure we have two choices: orphan the skb and 'delete' preselect or let nf_queue() drop the packet. Do the latter, it should not happen during normal operation. Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Acked-by: Joe Stringer Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index e39549c55945..63d1516816b1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -180,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; -- cgit v1.2.3-58-ga151 From a12f76345e026f1b300a0d17c56f020b6949b093 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 21 Feb 2022 15:55:12 +0100 Subject: cfg80211: fix CONFIG_CFG80211_EXTRA_REGDB_KEYDIR typo The kbuild change here accidentally removed not only the unquoting, but also the last character of the variable name. Fix that. 
Fixes: 129ab0d2d9f3 ("kbuild: do not quote string values in include/config/auto.conf") Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20220221155512.1d25895f7c5f.I50fa3d4189fcab90a2896fe8cae215035dae9508@changeid Signed-off-by: Johannes Berg --- net/wireless/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1e9be50469ce..527ae669f6f7 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -33,7 +33,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ ) > $@ -$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDI) \ +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509) @$(kecho) " GEN $@" $(Q)(set -e; \ -- cgit v1.2.3-58-ga151 From db6140e5e35a48405e669353bd54042c1d4c3841 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 28 Feb 2022 11:23:49 +0200 Subject: net/sched: act_ct: Fix flow table lookup failure with no originating ifindex After cited commit optimizted hw insertion, flow table entries are populated with ifindex information which was intended to only be used for HW offload. This tuple ifindex is hashed in the flow table key, so it must be filled for lookup to be successful. But tuple ifindex is only relevant for the netfilter flowtables (nft), so it's not filled in act_ct flow table lookup, resulting in lookup failure, and no SW offload and no offload teardown for TCP connection FIN/RST packets. To fix this, add new tc ifindex field to tuple, which will only be used for offloading, not for lookup, as it will not be part of the tuple hash. Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +++++- net/netfilter/nf_flow_table_offload.c | 6 +++++- net/sched/act_ct.c | 13 +++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..bd59e950f4d6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -96,6 +96,7 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 @@ -127,7 +128,7 @@ struct flow_offload_tuple { struct { } __hash; u8 dir:2, - xmit_type:2, + xmit_type:3, encap_num:2, in_vlan_ingress:2; u16 mtu; @@ -142,6 +143,9 @@ struct flow_offload_tuple { u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; + struct { + u32 iifidx; + } tc; }; }; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b561e0a44a45..fc4265acd9c4 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -110,7 +110,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, nf_flow_rule_lwt_match(match, tun_info); } - key->meta.ingress_ifindex = tuple->iifidx; + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC) + key->meta.ingress_ifindex = tuple->tc.iifidx; + else + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 33e70d60f0bf..ec19f625863a 100644 --- a/net/sched/act_ct.c +++ 
b/net/sched/act_ct.c @@ -361,6 +361,13 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) } } +static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, + struct nf_conn_act_ct_ext *act_ct_ext, u8 dir) +{ + entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC; + entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; +} + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, bool tcp) @@ -385,10 +392,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { - entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL]; - entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_REPLY]; + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); } err = flow_offload_add(&ct_ft->nf_ft, entry); -- cgit v1.2.3-58-ga151 From 275f3f64870245b06188f24bdf917e55a813d294 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Tue, 1 Mar 2022 14:34:57 -0800 Subject: Bluetooth: Fix not checking MGMT cmd pending queue A number of places in the MGMT handlers we examine the command queue for other commands (in progress but not yet complete) that will interact with the process being performed. However, not all commands go into the queue if one of: 1. There is no negative side effect of consecutive or redundent commands 2. The command is entirely perform "inline". This change examines each "pending command" check, and if it is not needed, deletes the check. Of the remaining pending command checks, we make sure that the command is in the pending queue by using the mgmt_pending_add/mgmt_pending_remove pair rather than the mgmt_pending_new/mgmt_pending_free pair. Link: https://lore.kernel.org/linux-bluetooth/f648f2e11bb3c2974c32e605a85ac3a9fac944f1.camel@redhat.com/T/ Tested-by: Maxim Levitsky Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 99 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 37087cf7dc5a..533cf60673a3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1218,7 +1218,13 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. 
*/ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + cp = cmd->param; bt_dev_dbg(hdev, "err %d", err); @@ -1242,7 +1248,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) @@ -1281,7 +1287,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_POWERED, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1290,6 +1296,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1383,6 +1392,10 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1402,7 +1415,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -1511,7 +1524,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1538,6 +1551,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd, mgmt_set_discoverable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1550,6 +1566,10 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1562,7 +1582,9 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + if (cmd) + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); } @@ -1634,7 +1656,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1654,6 +1676,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd, mgmt_set_connectable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1774,6 +1799,10 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) u8 enable = cp->val; bool changed; + /* Make sure cmd still outstanding. 
*/ + if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + return; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -3321,6 +3350,9 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); + if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + return; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3493,6 +3525,9 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); + if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -3759,13 +3794,6 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_STATUS_BUSY); - goto unlock; - } - if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { @@ -5036,12 +5064,6 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); - goto unlock; - } - cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0); if (!cmd) err = -ENOMEM; @@ -5261,11 +5283,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_discovery_set_state(hdev, err ? 
DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5327,7 +5354,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, else hdev->discovery.limited = false; - cmd = mgmt_pending_new(sk, op, hdev, data, len); + cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -5336,7 +5363,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5430,7 +5457,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_START_SERVICE_DISCOVERY, + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; @@ -5463,7 +5490,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5495,11 +5522,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -5535,7 +5565,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - cmd = mgmt_pending_new(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; @@ -5544,7 +5574,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd, stop_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto unlock; } @@ -7474,6 +7504,9 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; + if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -7969,11 +8002,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) static bool adv_busy(struct hci_dev *hdev) { - return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); + return pending_find(MGMT_OP_SET_LE, hdev); } static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance, @@ -8563,9 +8592,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; -- cgit v1.2.3-58-ga151 From 0b0e2ff10356e7e2ffd66ecdd6eee69a2f03449b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 28 Feb 2022 16:17:15 +0200 Subject: net: dsa: restore error path of dsa_tree_change_tag_proto When 
the DSA_NOTIFIER_TAG_PROTO returns an error, the user space process which initiated the protocol change exits the kernel processing while still holding the rtnl_mutex. So any other process attempting to lock the rtnl_mutex would deadlock after such event. The error handling of DSA_NOTIFIER_TAG_PROTO was inadvertently changed by the blamed commit, introducing this regression. We must still call rtnl_unlock(), and we must still call DSA_NOTIFIER_TAG_PROTO for the old protocol. The latter is due to the limiting design of notifier chains for cross-chip operations, which don't have a built-in error recovery mechanism - we should look into using notifier_call_chain_robust for that. Fixes: dc452a471dba ("net: dsa: introduce tagger-owned storage for private and shared data") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220228141715.146485-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcad3100b164..209496996cdf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - return err; + goto out_unlock; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) -- cgit v1.2.3-58-ga151 From 690bb6fb64f5dc7437317153902573ecad67593d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: batman-adv: Request iflink once in batadv-on-batadv check There is no need to call dev_get_iflink multiple times for the same net_device in batadv_is_on_batman_iface. And since some of the .ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8a2b78f9c4b2..35aa1122043b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -149,22 +149,23 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; + int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; + iflink = dev_get_iflink(net_dev); + /* no more parents..stop recursion */ - if (dev_get_iflink(net_dev) == 0 || - dev_get_iflink(net_dev) == net_dev->ifindex) + if (iflink == 0 || iflink == net_dev->ifindex) return false; parent_net = batadv_getlink_net(net_dev, net); /* recurse over the parent device */ - parent_dev = __dev_get_by_index((struct net *)parent_net, - dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. 
*/ if (!parent_dev) { pr_err("Cannot find parent device\n"); -- cgit v1.2.3-58-ga151 From 6116ba09423f7d140f0460be6a1644dceaad00da Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: batman-adv: Request iflink once in batadv_get_real_netdevice There is no need to call dev_get_iflink multiple times for the same net_device in batadv_get_real_netdevice. And since some of the ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 35aa1122043b..e2760cfce190 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -215,14 +215,16 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; - int ifindex; + int iflink; ASSERT_RTNL(); if (!netdev) return NULL; - if (netdev->ifindex == dev_get_iflink(netdev)) { + iflink = dev_get_iflink(netdev); + + if (netdev->ifindex == iflink) { dev_hold(netdev); return netdev; } @@ -232,9 +234,8 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) goto out; net = dev_net(hard_iface->soft_iface); - ifindex = dev_get_iflink(netdev); real_net = batadv_getlink_net(netdev, net); - real_netdev = dev_get_by_index(real_net, ifindex); + real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); -- cgit v1.2.3-58-ga151 From 6c1f41afc1dbe59d9d3c8bb0d80b749c119aa334 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 27 Feb 2022 23:23:49 +0100 Subject: batman-adv: Don't expect inter-netns unique iflink indices The ifindex doesn't have to be unique for multiple network namespaces on the same machine. $ ip netns add test1 $ ip -net test1 link add dummy1 type dummy $ ip netns add test2 $ ip -net test2 link add dummy2 type dummy $ ip -net test1 link show dev dummy1 6: dummy1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 96:81:55:1e:dd:85 brd ff:ff:ff:ff:ff:ff $ ip -net test2 link show dev dummy2 6: dummy2: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 5a:3c:af:35:07:c3 brd ff:ff:ff:ff:ff:ff But the batman-adv code to walk through the various layers of virtual interfaces uses this assumption because dev_get_iflink handles it internally and doesn't return the actual netns of the iflink. And dev_get_iflink only documents the situation where ifindex == iflink for physical devices. But only checking for dev->netdev_ops->ndo_get_iflink is also not an option because ipoib_get_iflink implements it even when it sometimes returns an iflink != ifindex and sometimes iflink == ifindex. The caller must therefore make sure itself to check both netns and iflink + ifindex for equality. Only when they are equal, a "physical" interface was detected which should stop the traversal. On the other hand, vxcan_get_iflink can also return 0 in case there was currently no valid peer. In this case, it is still necessary to stop. 
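Condensed into a compilable sketch, the termination rule this series converges on looks as follows; the types are toys, and the caller is assumed to query dev_get_iflink() exactly once and to resolve the parent namespace itself, so this is an illustration rather than the batman-adv code.

  #include <stdbool.h>
  #include <stdio.h>

  struct netns  { int id; };
  struct netdev { int ifindex; const struct netns *ns; };

  /* stop walking towards the "real" device when ... */
  static bool stop_recursion(const struct netdev *dev, int iflink,
                             const struct netns *parent_ns)
  {
      if (iflink == 0)            /* no current parent, e.g. vxcan without a peer */
          return true;

      /* iflink pointing back at itself only means "physical device" when the
       * parent lives in the same namespace; the same index in another
       * namespace is a different device */
      return parent_ns == dev->ns && iflink == dev->ifindex;
  }

  int main(void)
  {
      struct netns ns_a = { 1 }, ns_b = { 2 };
      struct netdev dummy1 = { .ifindex = 6, .ns = &ns_a };

      printf("%d\n", stop_recursion(&dummy1, 6, &ns_a));   /* 1: stop here */
      printf("%d\n", stop_recursion(&dummy1, 6, &ns_b));   /* 0: keep walking */
      return 0;
  }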
Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Reported-by: Sabrina Dubroca Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e2760cfce190..35fadb924849 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -157,13 +157,15 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; iflink = dev_get_iflink(net_dev); - - /* no more parents..stop recursion */ - if (iflink == 0 || iflink == net_dev->ifindex) + if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); + /* iflink to itself, most likely physical device */ + if (net == parent_net && iflink == net_dev->ifindex) + return false; + /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ @@ -223,8 +225,7 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) return NULL; iflink = dev_get_iflink(netdev); - - if (netdev->ifindex == iflink) { + if (iflink == 0) { dev_hold(netdev); return netdev; } @@ -235,6 +236,14 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); + + /* iflink to itself, most likely physical device */ + if (net == real_net && netdev->ifindex == iflink) { + real_netdev = netdev; + dev_hold(real_netdev); + goto out; + } + real_netdev = dev_get_by_index(real_net, iflink); out: -- cgit v1.2.3-58-ga151 From e50b88c4f076242358b66ddb67482b96947438f2 Mon Sep 17 00:00:00 2001 From: Sreeramya Soratkal Date: Tue, 1 Mar 2022 11:33:20 +0530 Subject: nl80211: Update bss channel on channel switch for P2P_CLIENT The wdev channel information is updated post channel switch only for the station mode and not for the other modes. Due to this, the P2P client still points to the old value though it moved to the new channel when the channel change is induced from the P2P GO. Update the bss channel after CSA channel switch completion for P2P client interface as well. 
Signed-off-by: Sreeramya Soratkal Link: https://lore.kernel.org/r/1646114600-31479-1-git-send-email-quic_ssramya@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1909ce2b739..c01fbcc848e8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17828,7 +17828,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - if (wdev->iftype == NL80211_IFTYPE_STATION && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && !WARN_ON(!wdev->current_bss)) cfg80211_update_assoc_bss_entry(wdev, chandef->chan); -- cgit v1.2.3-58-ga151 From 224102de2ff105a2c05695e66a08f4b5b6b2d19c Mon Sep 17 00:00:00 2001 From: lena wang Date: Tue, 1 Mar 2022 19:17:09 +0800 Subject: net: fix up skbs delta_truesize in UDP GRO frag_list The truesize for a UDP GRO packet is added by main skb and skbs in main skb's frag_list: skb_gro_receive_list p->truesize += skb->truesize; The commit 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") introduced a truesize increase for frag_list skbs. When uncloning skb, it will call pskb_expand_head and trusesize for frag_list skbs may increase. This can occur when allocators uses __netdev_alloc_skb and not jump into __alloc_skb. This flow does not use ksize(len) to calculate truesize while pskb_expand_head uses. skb_segment_list err = skb_unclone(nskb, GFP_ATOMIC); pskb_expand_head if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; If we uses increased truesize adding as delta_truesize, it will be larger than before and even larger than previous total truesize value if skbs in frag_list are abundant. The main skb truesize will become smaller and even a minus value or a huge value for an unsigned int parameter. Then the following memory check will drop this abnormal skb. To avoid this error we should use the original truesize to segment the main skb. Fixes: 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") Signed-off-by: lena wang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1646133431-8948-1-git-send-email-lena.wang@mediatek.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8138c372535..ea51e23e9247 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3876,6 +3876,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, list_skb = list_skb->next; err = 0; + delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -3900,7 +3901,6 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, tail = nskb; delta_len += nskb->len; - delta_truesize += nskb->truesize; skb_push(nskb, -skb_network_offset(nskb) + offset); -- cgit v1.2.3-58-ga151 From 60ce37b03917e593d8e5d8bcc7ec820773daf81d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:22 -0800 Subject: bpf, sockmap: Do not ignore orig_len parameter Currently, sk_psock_verdict_recv() returns skb->len This is problematic because tcp_read_sock() might have passed orig_len < skb->len, due to the presence of TCP urgent data. This causes an infinite loop from tcp_read_sock() Followup patch will make tcp_read_sock() more robust vs bad actors. 
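The failure mode is easy to reproduce in a toy model of the tcp_read_sock()/recv_actor() contract (standalone C, not the kernel code): when the actor claims to have consumed more than it was offered, the old loop never advances, while the clamping added in the next patch guarantees progress.

  #include <stdio.h>

  static int actor_ok(int len)  { return len; }             /* consumes what it was given */
  static int actor_bad(int len) { (void)len; return 100; }  /* reports "skb->len" instead of orig_len */

  static int read_loop(int total, int (*actor)(int), int robust)
  {
      int copied = 0, iterations = 0;

      while (copied < total) {
          int len = total - copied;
          int used = actor(len);

          if (++iterations > 1000)
              return -1;                   /* stuck: no forward progress */

          if (robust) {
              if (used > len)              /* the fix: warn once and clamp */
                  used = len;
              copied += used;
          } else if (used <= len) {        /* old code: silently skips the update */
              copied += used;
          }
      }
      return copied;
  }

  int main(void)
  {
      printf("well-behaved actor:     %d\n", read_loop(50, actor_ok, 0));   /* 50 */
      printf("bad actor, old loop:    %d\n", read_loop(50, actor_bad, 0));  /* -1 */
      printf("bad actor, robust loop: %d\n", read_loop(50, actor_bad, 1));  /* 50 */
      return 0;
  }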
Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..929a2b096b04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1153,7 +1153,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = skb->len; + int len = orig_len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); -- cgit v1.2.3-58-ga151 From e3d5ea2c011ecb16fb94c56a659364e6b30fac94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:23 -0800 Subject: tcp: make tcp_read_sock() more robust If recv_actor() returns an incorrect value, tcp_read_sock() might loop forever. Instead, issue a one-time warning and make sure to make progress. Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02cb275e5487..28ff2a820f7c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1684,11 +1684,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it -- cgit v1.2.3-58-ga151 From 0537f0a2151375dcf90c1bbfda6a0aaf57164e89 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:11 +0800 Subject: net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error generated by client The main reason for this unexpected SMC_CLC_DECL_ERR_REGRMB on the client is the following execution sequence: Server Conn A: Server Conn B: Client Conn B: smc_lgr_unregister_conn smc_lgr_register_conn smc_clc_send_accept -> smc_rtoken_add smcr_buf_unuse -> Client Conn A: smc_rtoken_delete smc_lgr_unregister_conn() makes the current link available to be assigned to a new incoming connection while smcr_buf_unuse() has not executed yet, which means that smc_rtoken_add may fail because of an insufficient rtoken_entry; reversing their execution order avoids this problem. Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") Signed-off-by: D. Wythe Signed-off-by: David S.
Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 29525d03b253..f8c9675f4af3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1161,8 +1161,8 @@ void smc_conn_free(struct smc_connection *conn) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) -- cgit v1.2.3-58-ga151 From 4940a1fdf31c39f0806ac831cde333134862030b Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:12 +0800 Subject: net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error cause by server The problem of SMC_CLC_DECL_ERR_REGRMB on the server is very clear. Based on the fact that whether a new SMC connection can be accepted or not depends on not only the limit of conn nums, but also the available entries of rtoken. Since the rtoken release is trigger by peer, while the conn nums is decrease by local, tons of thing can happen in this time difference. This only thing that needs to be mentioned is that now all connection creations are completely protected by smc_server_lgr_pending lock, it's enough to check only the available entries in rtokens_used_mask. Fixes: cd6851f30386 ("smc: remote memory buffers (RMBs)") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f8c9675f4af3..be7d704976ff 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1864,7 +1864,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; -- cgit v1.2.3-58-ga151 From 10b6bb62ae1a49ee818fc479cf57b8900176773e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Mar 2022 21:39:39 +0200 Subject: net: dcb: disable softirqs in dcbnl_flush_dev() Ido Schimmel points out that since commit 52cff74eef5d ("dcbnl : Disable software interrupts before taking dcb_lock"), the DCB API can be called by drivers from softirq context. One such in-tree example is the chelsio cxgb4 driver: dcb_rpl -> cxgb4_dcb_handle_fw_update -> dcb_ieee_setapp If the firmware for this driver happened to send an event which resulted in a call to dcb_ieee_setapp() at the exact same time as another DCB-enabled interface was unregistering on the same CPU, the softirq would deadlock, because the interrupted process was already holding the dcb_lock in dcbnl_flush_dev(). Fix this unlikely event by using spin_lock_bh() in dcbnl_flush_dev() as in the rest of the dcbnl code. 
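The rule being applied is the usual one for locks that are also taken from softirq context: every other acquisition must disable bottom halves first. The sketch below only illustrates the calling convention; the lock and BH primitives are no-op stand-ins, not the kernel ones.

  struct toy_lock { int locked; };

  static void raw_lock(struct toy_lock *l)    { l->locked = 1; }   /* stub */
  static void raw_unlock(struct toy_lock *l)  { l->locked = 0; }   /* stub */
  static void toy_bh_disable(void)            { }                  /* stub */
  static void toy_bh_enable(void)             { }                  /* stub */

  /* the spin_lock_bh()/spin_unlock_bh() shape */
  static void lock_bh(struct toy_lock *l)   { toy_bh_disable(); raw_lock(l); }
  static void unlock_bh(struct toy_lock *l) { raw_unlock(l); toy_bh_enable(); }

  static struct toy_lock dcb_lock;

  /* process context (device unregister): taking the lock without disabling
   * bottom halves is the bug, because a softirq on the same CPU (for example
   * a firmware event that ends in dcb_ieee_setapp()) could interrupt the
   * holder and spin forever on a lock its own CPU already owns */
  static void flush_dev_example(void)
  {
      lock_bh(&dcb_lock);
      /* ... walk and delete the app entries of the device ... */
      unlock_bh(&dcb_lock);
  }

  int main(void)
  {
      flush_dev_example();
      return 0;
  }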
Fixes: 91b0383fef06 ("net: dcb: flush lingering app table entries for unregistered devices") Reported-by: Ido Schimmel Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220302193939.1368823-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 36c91273daac..dc4fb699b56c 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2077,7 +2077,7 @@ static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { @@ -2086,7 +2086,7 @@ static void dcbnl_flush_dev(struct net_device *dev) } } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, -- cgit v1.2.3-58-ga151 From e1bec7fa1cee311a6d3fb9161037c7675904134d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Mar 2022 17:42:49 +0200 Subject: net: dsa: make dsa_tree_change_tag_proto actually unwind the tag proto change The blamed commit said one thing but did another. It explains that we should restore the "return err" to the original "goto out_unwind_tagger", but instead it replaced it with "goto out_unlock". When DSA_NOTIFIER_TAG_PROTO fails after the first switch of a multi-switch tree, the switches would end up not using the same tagging protocol. Fixes: 0b0e2ff10356 ("net: dsa: restore error path of dsa_tree_change_tag_proto") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220303154249.1854436-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 209496996cdf..b4e67758e104 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - goto out_unlock; + goto out_unwind_tagger; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) -- cgit v1.2.3-58-ga151 From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. 
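The fix boils down to an ownership rule: the handler always consumes the skb, either by queueing it and clearing the local pointer, or by freeing it at a single exit point. A standalone sketch of that shape, with toy buffer and queue types rather than the kernel ones:

  #include <stdlib.h>

  struct buf   { int len; };
  struct queue { struct buf *slot; };

  static int enqueue(struct buf **slot, struct buf *b)
  {
      if (*slot)                /* "queue" already full */
          return 0;
      *slot = b;
      return 1;
  }

  /* always takes ownership of "b": it is either queued or freed here, so
   * callers can neither leak it nor double-free it */
  static void event_input(struct queue *q, struct buf *b, int dev_alive)
  {
      if (!dev_alive)
          goto out;

      if (enqueue(&q->slot, b))
          b = NULL;             /* queued: ownership handed over */
  out:
      free(b);                  /* free(NULL) is a no-op, as is kfree_skb(NULL) */
  }

  int main(void)
  {
      struct queue q = { 0 };
      struct buf *b1 = calloc(1, sizeof(*b1));
      struct buf *b2 = calloc(1, sizeof(*b2));

      event_input(&q, b1, 1);   /* queued */
      event_input(&q, b2, 1);   /* queue full: freed inside */
      free(q.slot);
      return 0;
  }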
Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- net/ipv6/mcast.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8861db52c18..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) -- cgit v1.2.3-58-ga151