From 1ba5bf993c6a3142e18e68ea6452b347f9cb5635 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@oracle.com> Date: Tue, 5 Jul 2016 10:18:08 +0200 Subject: xfrm: fix crash in XFRM_MSG_GETSA netlink handler If we hit any of the error conditions inside xfrm_dump_sa(), then xfrm_state_walk_init() never gets called. However, we still call xfrm_state_walk_done() from xfrm_dump_sa_done(), which will crash because the state walk was never initialized properly. We can fix this by setting cb->args[0] only after we've processed the first element and checking this before calling xfrm_state_walk_done(). Fixes: d3623099d3 ("ipsec: add support of limited SA dump") Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com> Cc: Steffen Klassert <steffen.klassert@secunet.com> Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d516845e16e3..4fb04ced5867 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -896,7 +896,8 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb) struct sock *sk = cb->skb->sk; struct net *net = sock_net(sk); - xfrm_state_walk_done(walk, net); + if (cb->args[0]) + xfrm_state_walk_done(walk, net); return 0; } @@ -921,8 +922,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) u8 proto = 0; int err; - cb->args[0] = 1; - err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy); if (err < 0) @@ -939,6 +938,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) proto = nla_get_u8(attrs[XFRMA_PROTO]); xfrm_state_walk_init(walk, proto, filter); + cb->args[0] = 1; } (void) xfrm_state_walk(net, walk, dump_one_state, &info); -- cgit v1.2.3-58-ga151 From 73efc3245fd3edb3632d82a3a9c5d5d975a02efc Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@oracle.com> Date: Wed, 27 Jul 2016 08:03:18 +0200 Subject: xfrm: get rid of incorrect WARN AFAICT this message is just printed whenever input validation fails. This is a normal failure and we shouldn't be dumping the stack over it. Looks like it was originally a printk that was maybe incorrectly upgraded to a WARN: commit 62db5cfd70b1ef53aa21f144a806fe3b78c84fab Author: stephen hemminger <shemminger@vyatta.com> Date: Wed May 12 06:37:06 2010 +0000 xfrm: add severity to printk Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steffen Klassert <steffen.klassert@secunet.com> Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_user.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4fb04ced5867..1a4f142dd50a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2117,7 +2117,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, err = verify_newpolicy_info(&ua->policy); if (err) - goto bad_policy; + goto free_state; /* build an XP */ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err); @@ -2149,8 +2149,6 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; -bad_policy: - WARN(1, "BAD policy passed\n"); free_state: kfree(x); nomem: -- cgit v1.2.3-58-ga151 From 7677c7560c3e80fde08a7e710d378dedabf950c3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@oracle.com> Date: Wed, 27 Jul 2016 08:44:15 +0200 Subject: xfrm: get rid of another incorrect WARN During fuzzing I regularly run into this WARN(). According to Herbert Xu, this "certainly shouldn't be a WARN, it probably shouldn't print anything either". Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steffen Klassert <steffen.klassert@secunet.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_user.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1a4f142dd50a..cb65d916a345 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2051,9 +2051,6 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (up->hard) { xfrm_policy_delete(xp, p->dir); xfrm_audit_policy_delete(xp, 1, true); - } else { - // reset the timers here? - WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); -- cgit v1.2.3-58-ga151 From 6916fb3b10b3cbe3b1f9f5b680675f53e4e299eb Mon Sep 17 00:00:00 2001 From: Tobias Brunner <tobias@strongswan.org> Date: Fri, 29 Jul 2016 09:57:32 +0200 Subject: xfrm: Ignore socket policies when rebuilding hash tables Whenever thresholds are changed the hash tables are rebuilt. This is done by enumerating all policies and hashing and inserting them into the right table according to the thresholds and direction. Because socket policies are also contained in net->xfrm.policy_all but no hash tables are defined for their direction (dir + XFRM_POLICY_MAX) this causes a NULL or invalid pointer dereference after returning from policy_hash_bysel() if the rebuild is done while any socket policies are installed. Since the rebuild after changing thresholds is scheduled this crash could even occur if the userland sets thresholds seemingly before installing any socket policies. Fixes: 53c2e285f970 ("xfrm: Do not hash socket policies") Signed-off-by: Tobias Brunner <tobias@strongswan.org> Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_policy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..45f9cf97ea25 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -626,6 +626,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + /* skip socket policies */ + continue; + } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, -- cgit v1.2.3-58-ga151 From 4d0bd46a4d55383f7b925e6cf7865a77e0f0e020 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Mon, 8 Aug 2016 08:45:33 +0200 Subject: Revert "wext: Fix 32 bit iwpriv compatibility issue with 64 bit Kernel" This reverts commit 3d5fdff46c4b2b9534fa2f9fc78e90a48e0ff724. Ben Hutchings pointed out that the commit isn't safe since it assumes that the structure used by the driver is iw_point, when in fact there's no way to know about that. Fortunately, the only driver in the tree that ever runs this code path is the wilc1000 staging driver, so it doesn't really matter. Clearly I should have investigated this better before applying, sorry. Reported-by: Ben Hutchings <ben@decadent.org.uk> Cc: stable@vger.kernel.org [though I guess it doesn't matter much] Fixes: 3d5fdff46c4b ("wext: Fix 32 bit iwpriv compatibility issue with 64 bit Kernel") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/wext-core.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index dbb2738e356a..6250b1cfcde5 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -958,29 +958,8 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr, return private(dev, iwr, cmd, info, handler); } /* Old driver API : call driver ioctl handler */ - if (dev->netdev_ops->ndo_do_ioctl) { -#ifdef CONFIG_COMPAT - if (info->flags & IW_REQUEST_FLAG_COMPAT) { - int ret = 0; - struct iwreq iwr_lcl; - struct compat_iw_point *iwp_compat = (void *) &iwr->u.data; - - memcpy(&iwr_lcl, iwr, sizeof(struct iwreq)); - iwr_lcl.u.data.pointer = compat_ptr(iwp_compat->pointer); - iwr_lcl.u.data.length = iwp_compat->length; - iwr_lcl.u.data.flags = iwp_compat->flags; - - ret = dev->netdev_ops->ndo_do_ioctl(dev, (void *) &iwr_lcl, cmd); - - iwp_compat->pointer = ptr_to_compat(iwr_lcl.u.data.pointer); - iwp_compat->length = iwr_lcl.u.data.length; - iwp_compat->flags = iwr_lcl.u.data.flags; - - return ret; - } else -#endif - return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); - } + if (dev->netdev_ops->ndo_do_ioctl) + return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); return -EOPNOTSUPP; } -- cgit v1.2.3-58-ga151 From 1625f4529957738be7d87cf157e107b8fb9d23b9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev <alexey.kodanev@oracle.com> Date: Wed, 10 Aug 2016 13:54:57 +0300 Subject: net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key Running LTP 'icmp-uni-basic.sh -6 -p ipcomp -m tunnel' test over openvswitch + veth can trigger kernel panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0 IP: [<ffffffff8169d1d2>] xfrm_input+0x82/0x750 ... [<ffffffff816d472e>] xfrm6_rcv_spi+0x1e/0x20 [<ffffffffa082c3c2>] xfrm6_tunnel_rcv+0x42/0x50 [xfrm6_tunnel] [<ffffffffa082727e>] tunnel6_rcv+0x3e/0x8c [tunnel6] [<ffffffff8169f365>] ip6_input_finish+0xd5/0x430 [<ffffffff8169fc53>] ip6_input+0x33/0x90 [<ffffffff8169f1d5>] ip6_rcv_finish+0xa5/0xb0 ... It seems that tunnel.ip6 can have garbage values and also dereferenced without a proper check, only tunnel.ip4 is being verified. Fix it by adding one more if block for AF_INET6 and initialize tunnel.ip6 with NULL inside xfrm6_rcv_spi() (which is similar to xfrm4_rcv_spi()). Fixes: 049f8e2 ("xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input") Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/ipv6/xfrm6_input.c | 1 + net/xfrm/xfrm_input.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 0eaab1fa6be5..00a2d40677d6 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -23,6 +23,7 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1c4ad477ce93..6e3f0254d8a1 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ - if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { - switch (family) { - case AF_INET: + switch (family) { + case AF_INET: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); - break; - case AF_INET6: + break; + case AF_INET6: + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); - break; - } + break; } /* Allocate new secpath or COW existing one. */ -- cgit v1.2.3-58-ga151 From 11d7a0bb95eaaba1741bb24a7c3c169c82f09c7b Mon Sep 17 00:00:00 2001 From: David Ahern <dsa@cumulusnetworks.com> Date: Sun, 14 Aug 2016 19:52:56 -0700 Subject: xfrm: Only add l3mdev oif to dst lookups Subash reported that commit 42a7b32b73d6 ("xfrm: Add oif to dst lookups") broke a wifi use case that uses fib rules and xfrms. The intent of 42a7b32b73d6 was driven by VRFs with IPsec. As a compromise relax the use of oif in xfrm lookups to L3 master devices only (ie., oif is either an L3 master device or is enslaved to a master device). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Reported-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a115..e07ed8b1deb3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; - fl4->flowi4_oif = oif; + fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); if (saddr) fl4->saddr = saddr->a4; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c074771a10f7..dd84ecd1221b 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; + fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) -- cgit v1.2.3-58-ga151 From 89e1f6d2b956649fbe0704d543a90b8e0cf872b0 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Mon, 22 Aug 2016 01:02:18 +0800 Subject: netfilter: nft_reject: restrict to INPUT/FORWARD/OUTPUT After I add the nft rule "nft add rule filter prerouting reject with tcp reset", kernel panic happened on my system: NULL pointer dereference at ... IP: [<ffffffff81b9db2f>] nf_send_reset+0xaf/0x400 Call Trace: [<ffffffff81b9da80>] ? nf_reject_ip_tcphdr_get+0x160/0x160 [<ffffffffa0928061>] nft_reject_ipv4_eval+0x61/0xb0 [nft_reject_ipv4] [<ffffffffa08e836a>] nft_do_chain+0x1fa/0x890 [nf_tables] [<ffffffffa08e8170>] ? __nft_trace_packet+0x170/0x170 [nf_tables] [<ffffffffa06e0900>] ? nf_ct_invert_tuple+0xb0/0xc0 [nf_conntrack] [<ffffffffa07224d4>] ? nf_nat_setup_info+0x5d4/0x650 [nf_nat] [...] Because in the PREROUTING chain, routing information is not exist, then we will dereference the NULL pointer and oops happen. So we restrict reject expression to INPUT, FORWARD and OUTPUT chain. This is consistent with iptables REJECT target. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nft_reject.h | 4 ++++ net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/netfilter/nft_reject.c | 16 ++++++++++++++++ net/netfilter/nft_reject_inet.c | 7 ++++++- 5 files changed, 28 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 60fa1530006b..02e28c529b29 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -8,6 +8,10 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index c24f41c816b3..2c2553b9026c 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 533cd5719c59..92bda9908bb9 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .eval = nft_reject_ipv6_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 0522fc9bfb0a..c64de3f7379d 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_reject_policy); +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_reject_validate); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 759ca5248a3d..e79d9ca2ffee 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; + int icmp_code, err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .eval = nft_reject_inet_eval, .init = nft_reject_inet_init, .dump = nft_reject_inet_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3-58-ga151 From 93fac10b99d78eb2c50a739cba2e590c7332d539 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Mon, 22 Aug 2016 21:58:16 +0800 Subject: netfilter: nfnetlink: use list_for_each_entry_safe to delete all objects cttimeout and acct objects are deleted from the list while traversing it, so use list_for_each_entry is unsafe here. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_acct.c | 6 +++--- net/netfilter/nfnetlink_cttimeout.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 70eb2f6a3b01..d44d89b56127 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -343,12 +343,12 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { - char *acct_name; - struct nf_acct *cur; + struct nf_acct *cur, *tmp; int ret = -ENOENT; + char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 68216cdc7083..f74fee1e2d0a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -350,12 +350,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, const struct nlmsghdr *nlh, const struct nlattr * const cda[]) { - struct ctnl_timeout *cur; + struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry(cur, &net->nfct_timeout_list, head) + list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, + head) ctnl_timeout_try_del(net, cur); return 0; -- cgit v1.2.3-58-ga151 From 23aaba5ad55547db62bada5066c8fb6412d5b1c2 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Mon, 22 Aug 2016 21:58:17 +0800 Subject: netfilter: cttimeout: put back l4proto when replacing timeout policy We forget to call nf_ct_l4proto_put when replacing the existing timeout policy. Acctually, there's no need to get ct l4proto before doing replace, so we can move it to a later position. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_cttimeout.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index f74fee1e2d0a..6844c7af0b8f 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -98,31 +98,28 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, break; } - l4proto = nf_ct_l4proto_find_get(l3num, l4num); - - /* This protocol is not supportted, skip. */ - if (l4proto->l4proto != l4num) { - ret = -EOPNOTSUPP; - goto err_proto_put; - } - if (matching) { if (nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. */ if (matching->l3num != l3num || - matching->l4proto->l4proto != l4num) { - ret = -EINVAL; - goto err_proto_put; - } - - ret = ctnl_timeout_parse_policy(&matching->data, - l4proto, net, - cda[CTA_TIMEOUT_DATA]); - return ret; + matching->l4proto->l4proto != l4num) + return -EINVAL; + + return ctnl_timeout_parse_policy(&matching->data, + matching->l4proto, net, + cda[CTA_TIMEOUT_DATA]); } - ret = -EBUSY; + + return -EBUSY; + } + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + /* This protocol is not supportted, skip. */ + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; goto err_proto_put; } -- cgit v1.2.3-58-ga151 From 533e33009897c7dd1b0424c0d4b3331b222d5681 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Mon, 22 Aug 2016 21:58:18 +0800 Subject: netfilter: cttimeout: unlink timeout objs in the unconfirmed ct lists KASAN reported this bug: BUG: KASAN: use-after-free in icmp_packet+0x25/0x50 [nf_conntrack_ipv4] at addr ffff880002db08c8 Read of size 4 by task lt-nf-queue/19041 Call Trace: <IRQ> [<ffffffff815eeebb>] dump_stack+0x63/0x88 [<ffffffff813386f8>] kasan_report_error+0x528/0x560 [<ffffffff81338cc8>] kasan_report+0x58/0x60 [<ffffffffa07393f5>] ? icmp_packet+0x25/0x50 [nf_conntrack_ipv4] [<ffffffff81337551>] __asan_load4+0x61/0x80 [<ffffffffa07393f5>] icmp_packet+0x25/0x50 [nf_conntrack_ipv4] [<ffffffffa06ecaa0>] nf_conntrack_in+0x550/0x980 [nf_conntrack] [<ffffffffa06ec550>] ? __nf_conntrack_confirm+0xb10/0xb10 [nf_conntrack] [ ... ] The main reason is that we missed to unlink the timeout objects in the unconfirmed ct lists, so we will access the timeout objects that have already been freed. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_cttimeout.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 6844c7af0b8f..139e0867e56e 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -302,7 +302,16 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) const struct hlist_nulls_node *nn; unsigned int last_hsize; spinlock_t *lock; - int i; + int i, cpu; + + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) + untimeout(h, timeout); + spin_unlock_bh(&pcpu->lock); + } local_bh_disable(); restart: -- cgit v1.2.3-58-ga151 From 960fa72f67f1be6891d63a5518860d1ae4e14b88 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Mon, 22 Aug 2016 22:57:56 +0800 Subject: netfilter: nft_meta: improve the validity check of pkttype set expr "meta pkttype set" is only supported on prerouting chain with bridge family and ingress chain with netdev family. But the validate check is incomplete, and the user can add the nft rules on input chain with bridge family, for example: # nft add table bridge filter # nft add chain bridge filter input {type filter hook input \ priority 0 \;} # nft add chain bridge filter test # nft add rule bridge filter test meta pkttype set unicast # nft add rule bridge filter input jump test This patch fixes the problem. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nft_meta.h | 4 ++++ net/bridge/netfilter/nft_meta_bridge.c | 1 + net/netfilter/nft_meta.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d27588c8dbd9..1139cde0fdc5 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -36,4 +36,8 @@ void nft_meta_set_eval(const struct nft_expr *expr, void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + #endif diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4b901d9f2e7c..ad47a921b701 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..8a6bc7630912 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -291,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { + struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; + if (priv->key != NFT_META_PKTTYPE) + return 0; + switch (ctx->afi->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; @@ -308,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -327,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx, len = sizeof(u8); break; case NFT_META_PKTTYPE: - err = nft_meta_set_init_pkttype(ctx); - if (err) - return err; len = sizeof(u8); break; default: return -EOPNOTSUPP; } + err = nft_meta_set_validate(ctx, expr, NULL); + if (err < 0) + return err; + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -407,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * -- cgit v1.2.3-58-ga151 From 4249fc1f023a2106170bbf715e2e1a0ebc2d5b1f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca <sd@queasysnail.net> Date: Tue, 23 Aug 2016 10:20:31 +0200 Subject: netfilter: ebtables: put module reference when an incorrect extension is found commit bcf493428840 ("netfilter: ebtables: Fix extension lookup with identical name") added a second lookup in case the extension that was found during the first lookup matched another extension with the same name, but didn't release the reference on the incorrect module. Fixes: bcf493428840 ("netfilter: ebtables: Fix extension lookup with identical name") Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Acked-by: Phil Sutter <phil@nwl.cc> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/bridge/netfilter/ebtables.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index cceac5bb658f..0833c251aef7 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -368,6 +368,8 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { + if (!IS_ERR(match)) + module_put(match->me); request_module("ebt_%s", m->u.name); match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); } -- cgit v1.2.3-58-ga151 From 936523441bb64cdc9a5b263e8fd2782e70313a57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann <sven@narfation.org> Date: Sat, 6 Aug 2016 15:50:52 +0200 Subject: batman-adv: Add missing refcnt for last_candidate batadv_find_router dereferences last_bonding_candidate from orig_node without making sure that it has a valid reference. This reference has to be retrieved by increasing the reference counter while holding neigh_list_lock. The lock is required to avoid that batadv_last_bonding_replace removes the current last_bonding_candidate, reduces the reference counter and maybe destroys the object in this process. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann <sven@narfation.org> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de> --- net/batman-adv/routing.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7602c001e92b..3d199478c405 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -469,6 +469,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + kref_get(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + /** * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced @@ -539,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. */ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -631,6 +654,9 @@ next: batadv_orig_ifinfo_put(next_candidate); } + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + return router; } -- cgit v1.2.3-58-ga151 From 1e5d343b8f23770e8ac5d31f5c439826bdb35148 Mon Sep 17 00:00:00 2001 From: Linus Lüssing <linus.luessing@c0d3.blue> Date: Tue, 23 Aug 2016 03:13:03 +0200 Subject: batman-adv: fix elp packet data reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skb_reserve() call only reserved headroom for the mac header, but not the elp packet header itself. Fixing this by using skb_put()'ing towards the skb tail instead of skb_push()'ing towards the skb head. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> Signed-off-by: Sven Eckelmann <sven@narfation.org> Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de> --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 7d170010beb9..ee08540ce503 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); elp_packet = (struct batadv_elp_packet *)elp_buff; memset(elp_packet, 0, BATADV_ELP_HLEN); -- cgit v1.2.3-58-ga151 From 554d072e7bc3e56de5893c8181110a547b2062c9 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov <arik@wizery.com> Date: Mon, 29 Aug 2016 12:37:35 +0300 Subject: mac80211: TDLS: don't require beaconing for AP BW Stop downgrading TDLS chandef when reaching the AP BW. The AP provides the necessary regulatory protection in this case. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=153961, which reported an infinite loop here. Reported-by: Kamil Toman <kamil.toman@gmail.com> Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/tdls.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index b5d28f14b9cf..afca7d103684 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -333,10 +333,11 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, if (!uc.center_freq1) return; - /* proceed to downgrade the chandef until usable or the same */ + /* proceed to downgrade the chandef until usable or the same as AP BW */ while (uc.width > max_width || - !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, - sdata->wdev.iftype)) + (uc.width > sta->tdls_chandef.width && + !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, + sdata->wdev.iftype))) ieee80211_chandef_downgrade(&uc); if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { -- cgit v1.2.3-58-ga151 From c73c2484901139c28383b58eabcbf4d613e91518 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Sun, 28 Aug 2016 16:59:52 +0800 Subject: netfilter: nf_tables_netdev: remove redundant ip_hdr assignment We have already use skb_header_pointer to get the ip header pointer, so there's no need to use ip_hdr again. Moreover, in NETDEV INGRESS hook, ip header maybe not linear, so use ip_hdr is not appropriate, remove it. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_netdev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 5eefe4a355c6..75d696f11045 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -30,7 +30,6 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, if (!iph) return; - iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) return; -- cgit v1.2.3-58-ga151 From 9264251ee2a55bce8fb93826b3f581fb9eb7e2c2 Mon Sep 17 00:00:00 2001 From: Davide Caratti <dcaratti@redhat.com> Date: Wed, 31 Aug 2016 14:16:44 +0200 Subject: bridge: re-introduce 'fix parsing of MLDv2 reports' commit bc8c20acaea1 ("bridge: multicast: treat igmpv3 report with INCLUDE and no sources as a leave") seems to have accidentally reverted commit 47cc84ce0c2f ("bridge: fix parsing of MLDv2 reports"). This commit brings back a change to br_ip6_multicast_mld2_report() where parsing of MLDv2 reports stops when the first group is successfully added to the MDB cache. Fixes: bc8c20acaea1 ("bridge: multicast: treat igmpv3 report with INCLUDE and no sources as a leave") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Acked-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a5423a1eec05..c5fea9393946 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1138,7 +1138,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, } else { err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, vid); - if (!err) + if (err) break; } } -- cgit v1.2.3-58-ga151 From c0338aff2260ea6c092806312dbb154cec07a242 Mon Sep 17 00:00:00 2001 From: WANG Cong <xiyou.wangcong@gmail.com> Date: Sun, 28 Aug 2016 21:28:26 -0700 Subject: kcm: fix a socket double free Dmitry reported a double free on kcm socket, which could be easily reproduced by: #include <unistd.h> #include <sys/syscall.h> int main() { int fd = syscall(SYS_socket, 0x29ul, 0x5ul, 0x0ul, 0, 0, 0); syscall(SYS_ioctl, fd, 0x89e2ul, 0x20a98000ul, 0, 0, 0); return 0; } This is because on the error path, after we install the new socket file, we call sock_release() to clean up the socket, which leaves the fd pointing to a freed socket. Fix this by calling sys_close() on that fd directly. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: Dmitry Vyukov <dvyukov@google.com> Cc: Tom Herbert <tom@herbertland.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/kcm/kcmsock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index cb39e05b166c..411693288648 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -13,6 +13,7 @@ #include <linux/socket.h> #include <linux/uaccess.h> #include <linux/workqueue.h> +#include <linux/syscalls.h> #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -2029,7 +2030,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) if (copy_to_user((void __user *)arg, &info, sizeof(info))) { err = -EFAULT; - sock_release(newsock); + sys_close(info.fd); } } -- cgit v1.2.3-58-ga151 From d2f394dc4816b7bd1b44981d83509f18f19c53f0 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com> Date: Thu, 1 Sep 2016 16:22:16 +0200 Subject: tipc: fix random link resets while adding a second bearer In a dual bearer configuration, if the second tipc link becomes active while the first link still has pending nametable "bulk" updates, it randomly leads to reset of the second link. When a link is established, the function named_distribute(), fills the skb based on node mtu (allows room for TUNNEL_PROTOCOL) with NAME_DISTRIBUTOR message for each PUBLICATION. However, the function named_distribute() allocates the buffer by increasing the node mtu by INT_H_SIZE (to insert NAME_DISTRIBUTOR). This consumes the space allocated for TUNNEL_PROTOCOL. When establishing the second link, the link shall tunnel all the messages in the first link queue including the "bulk" update. As size of the NAME_DISTRIBUTOR messages while tunnelling, exceeds the link mtu the transmission fails (-EMSGSIZE). Thus, the synch point based on the message count of the tunnel packets is never reached leading to link timeout. In this commit, we adjust the size of name distributor message so that they can be tunnelled. Reviewed-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/name_distr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 6b626a64b517..a04fe9be1c60 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p) /** * named_prepare_buf - allocate & initialize a publication message + * + * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) @@ -141,9 +143,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) * - ITEM_SIZE; - uint msg_rem = msg_dsz; + u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / + ITEM_SIZE) * ITEM_SIZE; + u32 msg_rem = msg_dsz; list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ -- cgit v1.2.3-58-ga151 From d26c638c16cb54f6fb1507e27df93ede692db572 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel <nicolas.dichtel@6wind.com> Date: Tue, 30 Aug 2016 10:09:21 +0200 Subject: ipv6: add missing netconf notif when 'all' is updated The 'default' value was not advertised. Fixes: f3a1bfb11ccb ("rtnl/ipv6: use netconf msg to advertise forwarding status") Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/addrconf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f418d2eaeddd..2a688171a188 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -778,7 +778,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) } if (p == &net->ipv6.devconf_all->forwarding) { + int old_dflt = net->ipv6.devconf_dflt->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + if ((!newf) ^ (!old_dflt)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + addrconf_forward_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, -- cgit v1.2.3-58-ga151 From 29c994e361009142ec0bca6493cc8f7b0d3c561a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel <nicolas.dichtel@6wind.com> Date: Tue, 30 Aug 2016 10:09:22 +0200 Subject: netconf: add a notif when settings are created All changes are notified, but the initial state was missing. Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/devinet.c | 11 +++++++---- net/ipv6/addrconf.c | 9 ++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 415e117967c7..062a67ca9a21 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table { }; static int __devinet_sysctl_register(struct net *net, char *dev_name, - struct ipv4_devconf *p) + int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; @@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, goto free; p->sysctl = t; + + inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: @@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev) if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, - &idev->cnf); + idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; @@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net) } #ifdef CONFIG_SYSCTL - err = __devinet_sysctl_register(net, "all", all); + err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; - err = __devinet_sysctl_register(net, "default", dflt); + err = __devinet_sysctl_register(net, "default", + NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2a688171a188..bdf368eff5ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6032,7 +6032,7 @@ static const struct ctl_table addrconf_sysctl[] = { static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { - int i; + int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -6052,6 +6052,13 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!p->sysctl_header) goto free; + if (!strcmp(dev_name, "all")) + ifindex = NETCONFA_IFINDEX_ALL; + else if (!strcmp(dev_name, "default")) + ifindex = NETCONFA_IFINDEX_DEFAULT; + else + ifindex = idev->dev->ifindex; + inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); return 0; free: -- cgit v1.2.3-58-ga151 From 85a3d4a9356b595d5440c3f1bf07ee7cecca1567 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Date: Tue, 30 Aug 2016 17:44:29 +0200 Subject: net: bridge: don't increment tx_dropped in br_do_proxy_arp pskb_may_pull may fail due to various reasons (e.g. alloc failure), but the skb isn't changed/dropped and processing continues so we shouldn't increment tx_dropped. CC: Kyeyoon Park <kyeyoonp@codeaurora.org> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Stephen Hemminger <stephen@networkplumber.org> CC: bridge@lists.linux-foundation.org Fixes: 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP") Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_input.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8e486203d133..abe11f085479 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -80,13 +80,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - if (dev->flags & IFF_NOARP) + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) return; - if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; - return; - } parp = arp_hdr(skb); if (parp->ar_pro != htons(ETH_P_IP) || -- cgit v1.2.3-58-ga151 From 28b346cbc0715ae45b2814d857f1d8a7e6817ed8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Tue, 30 Aug 2016 11:55:23 -0400 Subject: tcp: fastopen: fix rcv_wup initialization for TFO server on SYN/data Yuchung noticed that on the first TFO server data packet sent after the (TFO) handshake, the server echoed the TCP timestamp value in the SYN/data instead of the timestamp value in the final ACK of the handshake. This problem did not happen on regular opens. The tcp_replace_ts_recent() logic that decides whether to remember an incoming TS value needs tp->rcv_wup to hold the latest receive sequence number that we have ACKed (latest tp->rcv_nxt we have ACKed). This commit fixes this issue by ensuring that a TFO server properly updates tp->rcv_wup to match tp->rcv_nxt at the time it sends a SYN/ACK for the SYN/data. Reported-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_fastopen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 54d9f9b0120f..62a5751d4fe1 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -226,6 +226,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; + tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ -- cgit v1.2.3-58-ga151 From 635c223cfa05af9523146b2f37e119d945f449ae Mon Sep 17 00:00:00 2001 From: Gao Feng <fgao@ikuai8.com> Date: Wed, 31 Aug 2016 14:15:05 +0800 Subject: rps: flow_dissector: Fix uninitialized flow_keys used in __skb_get_hash possibly The original codes depend on that the function parameters are evaluated from left to right. But the parameter's evaluation order is not defined in C standard actually. When flow_keys_have_l4(&keys) is invoked before ___skb_get_hash(skb, &keys, hashrnd) with some compilers or environment, the keys passed to flow_keys_have_l4 is not initialized. Fixes: 6db61d79c1e1 ("flow_dissector: Ignore flow dissector return value from ___skb_get_hash") Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Gao Feng <fgao@ikuai8.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 61ad43f61c5e..52742a02814f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -680,11 +680,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; __flow_hash_secret_init(); - __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), - flow_keys_have_l4(&keys)); + hash = ___skb_get_hash(skb, &keys, hashrnd); + + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -- cgit v1.2.3-58-ga151 From ab34380162cbc9b5172afdadf5136643c687bb73 Mon Sep 17 00:00:00 2001 From: Eli Cooper <elicooper@gmx.com> Date: Fri, 26 Aug 2016 23:52:29 +0800 Subject: ipv6: Don't unset flowi6_proto in ipxip6_tnl_xmit() Commit 8eb30be0352d0916 ("ipv6: Create ip6_tnl_xmit") unsets flowi6_proto in ip4ip6_tnl_xmit() and ip6ip6_tnl_xmit(). Since xfrm_selector_match() relies on this info, IPv6 packets sent by an ip6tunnel cannot be properly selected by their protocols after removing it. This patch puts flowi6_proto back. Cc: stable@vger.kernel.org Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Signed-off-by: Eli Cooper <elicooper@gmx.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/ip6_tunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7b0481e3738f..888543debe4e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1174,6 +1174,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1233,6 +1234,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) -- cgit v1.2.3-58-ga151 From 2f86953e7436c9b9a4690909c5e2db24799e173b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca <sd@queasysnail.net> Date: Fri, 2 Sep 2016 10:22:54 +0200 Subject: l2tp: fix use-after-free during module unload Tunnel deletion is delayed by both a workqueue (l2tp_tunnel_delete -> wq -> l2tp_tunnel_del_work) and RCU (sk_destruct -> RCU -> l2tp_tunnel_destruct). By the time l2tp_tunnel_destruct() runs to destroy the tunnel and finish destroying the socket, the private data reserved via the net_generic mechanism has already been freed, but l2tp_tunnel_destruct() actually uses this data. Make sure tunnel deletion for the netns has completed before returning from l2tp_exit_net() by first flushing the tunnel removal workqueue, and then waiting for RCU callbacks to complete. Fixes: 167eb17e0b17 ("l2tp: create tunnel sockets in the right namespace") Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/l2tp/l2tp_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1e40dacaa137..a2ed3bda4ddc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1855,6 +1855,9 @@ static __net_exit void l2tp_exit_net(struct net *net) (void)l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); + + flush_workqueue(l2tp_wq); + rcu_barrier(); } static struct pernet_operations l2tp_net_ops = { -- cgit v1.2.3-58-ga151 From a41bd25ae67d3e4052c7f00ee9f2b4ba9219309e Mon Sep 17 00:00:00 2001 From: Paolo Abeni <pabeni@redhat.com> Date: Thu, 25 Aug 2016 18:42:35 +0200 Subject: sunrpc: fix UDP memory accounting The commit f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context"), as a side effect, moved the skb_free_datagram() call outside the scope of the related socket lock, but UDP sockets require such lock to be held for proper memory accounting. Fix it by replacing skb_free_datagram() with skb_free_datagram_locked(). Fixes: f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context") Reported-and-tested-by: Jan Stancek <jstancek@redhat.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Cc: stable@vger.kernel.org # 4.4+ Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8ede3bc52481..bf168838a029 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) skb = skb_recv_datagram(sk, 0, 1, &err); if (skb != NULL) { xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); + skb_free_datagram_locked(sk, skb); continue; } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) -- cgit v1.2.3-58-ga151 From 24b27fc4cdf9e10c5e79e5923b6b7c2c5c95096c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar <maheshb@google.com> Date: Thu, 1 Sep 2016 22:18:34 -0700 Subject: bonding: Fix bonding crash Following few steps will crash kernel - (a) Create bonding master > modprobe bonding miimon=50 (b) Create macvlan bridge on eth2 > ip link add link eth2 dev mvl0 address aa:0:0:0:0:01 \ type macvlan (c) Now try adding eth2 into the bond > echo +eth2 > /sys/class/net/bond0/bonding/slaves <crash> Bonding does lots of things before checking if the device enslaved is busy or not. In this case when the notifier call-chain sends notifications, the bond_netdev_event() assumes that the rx_handler /rx_handler_data is registered while the bond_enslave() hasn't progressed far enough to register rx_handler for the new slave. This patch adds a rx_handler check that can be performed right at the beginning of the enslave code to avoid getting into this situation. Signed-off-by: Mahesh Bandewar <maheshb@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/bonding/bond_main.c | 7 ++++--- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 217e8da0628c..9599ed6f1213 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1341,9 +1341,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) slave_dev->name); } - /* already enslaved */ - if (slave_dev->flags & IFF_SLAVE) { - netdev_dbg(bond_dev, "Error: Device was already enslaved\n"); + /* already in-use? */ + if (netdev_is_rx_handler_busy(slave_dev)) { + netdev_err(bond_dev, + "Error: Device is in use and cannot be enslaved\n"); return -EBUSY; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a788bf0affd..e8d79d4ebcfe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3267,6 +3267,7 @@ static inline void napi_free_frags(struct napi_struct *napi) napi->skb = NULL; } +bool netdev_is_rx_handler_busy(struct net_device *dev); int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data); diff --git a/net/core/dev.c b/net/core/dev.c index dd6ce598de89..ea6312057a71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3974,6 +3974,22 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; } +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check + * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. + */ +bool netdev_is_rx_handler_busy(struct net_device *dev) +{ + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); +} +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); + /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for -- cgit v1.2.3-58-ga151 From 38f7bd94a97b542de86a2be9229289717e33a7a4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Thu, 1 Sep 2016 14:56:49 -0700 Subject: Revert "af_unix: Fix splice-bind deadlock" This reverts commit c845acb324aa85a39650a14e7696982ceea75dc1. It turns out that it just replaces one deadlock with another one: we can still get the wrong lock ordering with the readlock due to overlayfs calling back into the filesystem layer and still taking the vfs locks after the readlock. The proper solution ends up being to just split the readlock into two pieces: the bind lock (taken *outside* the vfs locks) and the IO lock (taken *inside* the filesystem locks). The two locks are independent anyway. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Reviewed-by: Shmulik Ladkani <shmulik.ladkani@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/unix/af_unix.c | 66 +++++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f1dffe84f0d5..433ae1bbef97 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -954,20 +954,32 @@ fail: return NULL; } -static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode, - struct path *res) +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) { - int err; + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; - err = security_path_mknod(path, dentry, mode, 0); + /* + * All right, let's create it. + */ + err = security_path_mknod(&path, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path->mnt); + res->mnt = mntget(path.mnt); res->dentry = dget(dentry); } } - + done_path_create(&path, dentry); return err; } @@ -978,12 +990,10 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err, name_err; + int err; unsigned int hash; struct unix_address *addr; struct hlist_head *list; - struct path path; - struct dentry *dentry; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -999,34 +1009,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - name_err = 0; - dentry = NULL; - if (sun_path[0]) { - /* Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - - if (IS_ERR(dentry)) { - /* delay report until after 'already bound' check */ - name_err = PTR_ERR(dentry); - dentry = NULL; - } - } - err = mutex_lock_interruptible(&u->readlock); if (err) - goto out_path; + goto out; err = -EINVAL; if (u->addr) goto out_up; - if (name_err) { - err = name_err == -EEXIST ? -EADDRINUSE : name_err; - goto out_up; - } - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) @@ -1037,11 +1027,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = hash ^ sk->sk_type; atomic_set(&addr->refcnt, 1); - if (dentry) { - struct path u_path; + if (sun_path[0]) { + struct path path; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(dentry, &path, mode, &u_path); + err = unix_mknod(sun_path, mode, &path); if (err) { if (err == -EEXIST) err = -EADDRINUSE; @@ -1049,9 +1039,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = u_path; + u->path = path; list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); @@ -1074,10 +1064,6 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->readlock); -out_path: - if (dentry) - done_path_create(&path, dentry); - out: return err; } -- cgit v1.2.3-58-ga151 From 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Thu, 1 Sep 2016 14:43:53 -0700 Subject: af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Right now we use the 'readlock' both for protecting some of the af_unix IO path and for making the bind be single-threaded. The two are independent, but using the same lock makes for a nasty deadlock due to ordering with regards to filesystem locking. The bind locking would want to nest outside the VSF pathname locking, but the IO locking wants to nest inside some of those same locks. We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix splice-bind deadlock") which moved the readlock inside the vfs locks, but that caused problems with overlayfs that will then call back into filesystem routines that take the lock in the wrong order anyway. Splitting the locks means that we can go back to having the bind lock be the outermost lock, and we don't have any deadlocks with lock ordering. Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com> Acked-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9b4c418bebd8..fd60eccb59a6 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -52,7 +52,7 @@ struct unix_sock { struct sock sk; struct unix_address *addr; struct path path; - struct mutex readlock; + struct mutex iolock, bindlock; struct sock *peer; struct list_head link; atomic_long_t inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 433ae1bbef97..8309687a56b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - if (mutex_lock_interruptible(&u->readlock)) + if (mutex_lock_interruptible(&u->iolock)) return -EINTR; sk->sk_peek_off = val; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); return 0; } @@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); - mutex_init(&u->readlock); /* single task reading lock */ + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); @@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) return err; @@ -895,7 +896,7 @@ retry: spin_unlock(&unix_table_lock); err = 0; -out: mutex_unlock(&u->readlock); +out: mutex_unlock(&u->bindlock); return err; } @@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; @@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out_unlock: spin_unlock(&unix_table_lock); out_up: - mutex_unlock(&u->readlock); + mutex_unlock(&u->bindlock); out: return err; } @@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, if (false) { alloc_skb: unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) goto err; } - /* we must acquire readlock as we modify already present + /* we must acquire iolock as we modify already present * skbs in the sk_receive_queue and mess with skb->len */ - err = mutex_lock_interruptible(&unix_sk(other)->readlock); + err = mutex_lock_interruptible(&unix_sk(other)->iolock); if (err) { err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; goto err; @@ -2032,7 +2033,7 @@ alloc_skb: } unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); other->sk_data_ready(other); scm_destroy(&scm); @@ -2041,7 +2042,7 @@ alloc_skb: err_state_unlock: unix_state_unlock(other); err_unlock: - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) @@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, @@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (skb) break; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &err, &timeo, last)); - if (!skb) { /* implies readlock unlocked */ + if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && @@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); out: return err; } @@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); if (flags & MSG_PEEK) skip = sk_peek_offset(sk, flags); @@ -2326,7 +2327,7 @@ again: break; } - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len); @@ -2337,7 +2338,7 @@ again: goto out; } - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -2440,7 +2441,7 @@ unlock: } } while (size); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else @@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk, int ret; struct unix_sock *u = unix_sk(sk); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); return ret; } -- cgit v1.2.3-58-ga151 From 5210d393ef84e5d2a4854671a9af2d97fd1b8dd4 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Fri, 2 Sep 2016 20:49:12 +0800 Subject: netfilter: nf_tables_trace: fix endiness when dump chain policy NFTA_TRACE_POLICY attribute is big endian, but we forget to call htonl to convert it. Fortunately, this attribute is parsed as big endian in libnftnl. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62e91..fa24a5b398b1 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info) break; case NFT_TRACETYPE_POLICY: if (nla_put_be32(skb, NFTA_TRACE_POLICY, - info->basechain->policy)) + htonl(info->basechain->policy))) goto nla_put_failure; break; } -- cgit v1.2.3-58-ga151 From d1a6cba576fc7c43e476538fe5aa72fe04bd80e1 Mon Sep 17 00:00:00 2001 From: Liping Zhang <liping.zhang@spreadtrum.com> Date: Tue, 6 Sep 2016 22:31:02 +0800 Subject: netfilter: nft_chain_route: re-route before skb is queued to userspace Imagine such situation, user add the following nft rules, and queue the packets to userspace for further check: # ip rule add fwmark 0x0/0x1 lookup eth0 # ip rule add fwmark 0x1/0x1 lookup eth1 # nft add table filter # nft add chain filter output {type route hook output priority 0 \;} # nft add rule filter output mark set 0x1 # nft add rule filter output queue num 0 But after we reinject the skbuff, the packet will be sent via the wrong route, i.e. in this case, the packet will be routed via eth0 table, not eth1 table. Because we skip to do re-route when verdict is NF_QUEUE, even if the mark was changed. Acctually, we should not touch sk_buff if verdict is NF_DROP or NF_STOLEN, and when re-route fails, return NF_DROP with error code. This is consistent with the mangle table in iptables. Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++++++---- net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2375b0a8be46..30493beb611a 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, __be32 saddr, daddr; u_int8_t tos; const struct iphdr *iph; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv, tos = iph->tos; ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..2535223ba956 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, struct in6_addr saddr, daddr; u_int8_t hop_limit; u32 mark, flowlabel; + int err; /* malformed packet, drop it */ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) @@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv, flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE && + if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } -- cgit v1.2.3-58-ga151 From 03c2778a938aaba0893f6d6cdc29511d91a79848 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@codemonkey.org.uk> Date: Fri, 2 Sep 2016 14:39:50 -0400 Subject: ipv6: release dst in ping_v6_sendmsg Neither the failure or success paths of ping_v6_sendmsg release the dst it acquires. This leads to a flood of warnings from "net/core/dst.c:288 dst_release" on older kernels that don't have 8bf4ada2e21378816b28205427ee6b0e1ca4c5f1 backported. That patch optimistically hoped this had been fixed post 3.10, but it seems at least one case wasn't, where I've seen this triggered a lot from machines doing unprivileged icmp sockets. Cc: Martin Lau <kafai@fb.com> Signed-off-by: Dave Jones <davej@codemonkey.org.uk> Acked-by: Martin KaFai Lau <kafai@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/ping.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 0900352c924c..0e983b694ee8 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -126,8 +126,10 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rt = (struct rt6_info *) dst; np = inet6_sk(sk); - if (!np) - return -EBADF; + if (!np) { + err = -EBADF; + goto dst_err_out; + } if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = np->mcast_oif; @@ -163,6 +165,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } release_sock(sk); +dst_err_out: + dst_release(dst); + if (err) return err; -- cgit v1.2.3-58-ga151 From 78d506e1b7071b24850fd5ac22b896c459b0a04c Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Tue, 6 Sep 2016 11:22:49 -0400 Subject: xprtrdma: Revert 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion...") Receive buffer exhaustion, if it were to actually occur, would be catastrophic. However, when there are no reply buffers to post, that means all of them have already been posted and are waiting for incoming replies. By design, there can never be more RPCs in flight than there are available receive buffers. A receive buffer can be left posted after an RPC exits without a received reply; say, due to a credential problem or a soft timeout. This does not result in fewer posted receive buffers than there are pending RPCs, and there is already logic in xprtrdma to deal appropriately with this case. It also looks like the "+ 2" that was removed was accidentally accommodating the number of extra receive buffers needed for receiving backchannel requests. That will need to be addressed by another patch. Fixes: 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion can be...") Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Reviewed-by: Anna Schumaker <Anna.Schumaker@Netapp.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/xprtrdma/verbs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 536d0be3f61b..fefcba982415 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -923,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests; i++) { + for (i = 0; i < buf->rb_max_requests + 2; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1076,6 +1076,8 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) /* * Get a set of request/reply buffers. + * + * Reply buffer (if available) is attached to send buffer upon return. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -1094,13 +1096,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); + pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; out_repbuf: - list_add(&req->rl_free, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); - return NULL; + pr_warn("RPC: %s: out of reply buffers\n", __func__); + req->rl_reply = NULL; + return req; } /* -- cgit v1.2.3-58-ga151 From 05c974669ecec510a85d8534099bb75404e82c41 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Tue, 6 Sep 2016 11:22:58 -0400 Subject: xprtrdma: Fix receive buffer accounting An RPC can terminate before its reply arrives, if a credential problem or a soft timeout occurs. After this happens, xprtrdma reports it is out of Receive buffers. A Receive buffer is posted before each RPC is sent, and returned to the buffer pool when a reply is received. If no reply is received for an RPC, that Receive buffer remains posted. But xprtrdma tries to post another when the next RPC is sent. If this happens a few dozen times, there are no receive buffers left to be posted at send time. I don't see a way for a transport connection to recover at that point, and it will spit warnings and unnecessarily delay RPCs on occasion for its remaining lifetime. Commit 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") removed a little bit of logic to detect this case and not provide a Receive buffer so no more buffers are posted, and then transport operation continues correctly. We didn't understand what that logic did, and it wasn't commented, so it was removed as part of the overhaul to support backchannel requests. Restore it, but be wary of the need to keep extra Receives posted to deal with backchannel requests. Fixes: 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Reviewed-by: Anna Schumaker <Anna.Schumaker@Netapp.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/xprtrdma/verbs.c | 41 +++++++++++++++++++++++++++++------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fefcba982415..799cce6cbe45 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include <linux/sunrpc/addr.h> +#include <linux/sunrpc/svc_rdma.h> #include <asm/bitops.h> #include <linux/module.h> /* try_module_get()/module_put() */ @@ -923,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1018,6 +1019,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rep = rpcrdma_buffer_get_rep_locked(buf); rpcrdma_destroy_rep(ia, rep); } + buf->rb_send_count = 0; spin_lock(&buf->rb_reqslock); while (!list_empty(&buf->rb_allreqs)) { @@ -1032,6 +1034,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); + buf->rb_recv_count = 0; rpcrdma_destroy_mrs(buf); } @@ -1074,6 +1077,23 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) spin_unlock(&buf->rb_mwlock); } +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) +{ + /* If an RPC previously completed without a reply (say, a + * credential problem or a soft timeout occurs) then hold off + * on supplying more Receive buffers until the number of new + * pending RPCs catches up to the number of posted Receives. + */ + if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) + return NULL; + + if (unlikely(list_empty(&buffers->rb_recv_bufs))) + return NULL; + buffers->rb_recv_count++; + return rpcrdma_buffer_get_rep_locked(buffers); +} + /* * Get a set of request/reply buffers. * @@ -1087,10 +1107,9 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_send_bufs)) goto out_reqbuf; + buffers->rb_send_count++; req = rpcrdma_buffer_get_req_locked(buffers); - if (list_empty(&buffers->rb_recv_bufs)) - goto out_repbuf; - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); return req; @@ -1098,11 +1117,6 @@ out_reqbuf: spin_unlock(&buffers->rb_lock); pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; -out_repbuf: - spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; } /* @@ -1119,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; spin_lock(&buffers->rb_lock); + buffers->rb_send_count--; list_add_tail(&req->rl_free, &buffers->rb_send_bufs); - if (rep) + if (rep) { + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + } spin_unlock(&buffers->rb_lock); } @@ -1135,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; spin_lock(&buffers->rb_lock); - if (!list_empty(&buffers->rb_recv_bufs)) - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); } @@ -1150,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; spin_lock(&buffers->rb_lock); + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 670fad57153a..a71b0f5897d8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -321,6 +321,7 @@ struct rpcrdma_buffer { char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ + int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; -- cgit v1.2.3-58-ga151 From 5a56a0b3a45dd0cc5b2f7bec6afd053a474ed9f5 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Date: Mon, 5 Sep 2016 10:20:20 +1200 Subject: net: Don't delete routes in different VRFs When deleting an IP address from an interface, there is a clean-up of routes which refer to this local address. However, there was no check to see that the VRF matched. This meant that deletion wasn't confined to the VRF it should have been. To solve this, a new field has been added to fib_info to hold a table id. When removing fib entries corresponding to a local ip address, this table id is also used in the comparison. The table id is populated when the fib_info is created. This was already done in some places, but not in ip_rt_ioctl(). This has now been fixed. Fixes: 021dd3b8a142 ("net: Add routes to the table associated with the device") Acked-by: David Ahern <dsa@cumulusnetworks.com> Tested-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/fib_semantics.c | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 4079fc18ffe4..7d4a72e75f33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -111,6 +111,7 @@ struct fib_info { unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; + u32 fib_tb_id; u32 fib_priority; u32 *fib_metrics; #define fib_mtu fib_metrics[RTAX_MTU-1] @@ -319,7 +320,7 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); -int fib_sync_down_addr(struct net *net, __be32 local); +int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); extern u32 fib_multipath_secret __read_mostly; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..1b25daf8c7f1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1027,7 +1028,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e67d..e9f56225e53f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1337,18 +1338,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + struct net *net = dev_net(dev); + int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; -- cgit v1.2.3-58-ga151 From 751eb6b6042a596b0080967c1a529a9fe98dac1d Mon Sep 17 00:00:00 2001 From: Wei Yongjun <weiyongjun1@huawei.com> Date: Mon, 5 Sep 2016 16:06:31 +0800 Subject: ipv6: addrconf: fix dev refcont leak when DAD failed In general, when DAD detected IPv6 duplicate address, ifp->state will be set to INET6_IFADDR_STATE_ERRDAD and DAD is stopped by a delayed work, the call tree should be like this: ndisc_recv_ns -> addrconf_dad_failure <- missing ifp put -> addrconf_mod_dad_work -> schedule addrconf_dad_work() -> addrconf_dad_stop() <- missing ifp hold before call it addrconf_dad_failure() called with ifp refcont holding but not put. addrconf_dad_work() call addrconf_dad_stop() without extra holding refcount. This will not cause any issue normally. But the race between addrconf_dad_failure() and addrconf_dad_work() may cause ifp refcount leak and netdevice can not be unregister, dmesg show the following messages: IPv6: eth0: IPv6 duplicate address fe80::XX:XXXX:XXXX:XX detected! ... unregister_netdevice: waiting for eth0 to become free. Usage count = 1 Cc: stable@vger.kernel.org Fixes: c15b1ccadb32 ("ipv6: move DAD and addrconf_verify processing to workqueue") Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bdf368eff5ab..2f1f5d439788 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1948,6 +1948,7 @@ errdad: spin_unlock_bh(&ifp->lock); addrconf_mod_dad_work(ifp, 0); + in6_ifa_put(ifp); } /* Join to solicited addr multicast group. @@ -3857,6 +3858,7 @@ static void addrconf_dad_work(struct work_struct *w) addrconf_dad_begin(ifp); goto out; } else if (action == DAD_ABORT) { + in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); if (disable_ipv6) addrconf_ifdown(idev->dev, 0); -- cgit v1.2.3-58-ga151 From 76061f631c2ea4ab9c4d66f3a96ecc5737f5aaf7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Wed, 7 Sep 2016 08:34:11 -0700 Subject: tcp: fastopen: avoid negative sk_forward_alloc When DATA and/or FIN are carried in a SYN/ACK message or SYN message, we append an skb in socket receive queue, but we forget to call sk_forced_mem_schedule(). Effect is that the socket has a negative sk->sk_forward_alloc as long as the message is not read by the application. Josh Hunt fixed a similar issue in commit d22e15371811 ("tcp: fix tcp fin memory accounting") Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Josh Hunt <johunt@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_fastopen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 62a5751d4fe1..4e777a3243f9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -150,6 +150,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); + sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; -- cgit v1.2.3-58-ga151 From db7196a0d0984b933ccf2cd6a60e26abf466e8a3 Mon Sep 17 00:00:00 2001 From: Artem Germanov <agermanov@anchorfree.com> Date: Wed, 7 Sep 2016 10:49:36 -0700 Subject: tcp: cwnd does not increase in TCP YeAH Commit 76174004a0f19785a328f40388e87e982bbf69b9 (tcp: do not slow start when cwnd equals ssthresh ) introduced regression in TCP YeAH. Using 100ms delay 1% loss virtual ethernet link kernel 4.2 shows bandwidth ~500KB/s for single TCP connection and kernel 4.3 and above (including 4.8-rc4) shows bandwidth ~100KB/s. That is caused by stalled cwnd when cwnd equals ssthresh. This patch fixes it by proper increasing cwnd in this case. Signed-off-by: Artem Germanov <agermanov@anchorfree.com> Acked-by: Dmitry Adamushko <d.adamushko@anchorfree.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_yeah.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 028eb046ea40..9c5fc973267f 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -76,7 +76,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else if (!yeah->doing_reno_now) { -- cgit v1.2.3-58-ga151 From 2f30ea5090cbc57ea573cdc66421264b3de3fb0a Mon Sep 17 00:00:00 2001 From: Mathias Krause <minipli@googlemail.com> Date: Thu, 8 Sep 2016 18:09:57 +0200 Subject: xfrm_user: propagate sec ctx allocation errors When we fail to attach the security context in xfrm_state_construct() we'll return 0 as error value which, in turn, will wrongly claim success to userland when, in fact, we won't be adding / updating the XFRM state. This is a regression introduced by commit fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl(), attach_sec_ctx(), and attach_one_addr()"). Fix it by propagating the error returned by security_xfrm_state_alloc() in this case. Fixes: fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl()...") Signed-off-by: Mathias Krause <minipli@googlemail.com> Cc: Thomas Graf <tgraf@suug.ch> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_user.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index cb65d916a345..08892091cfe3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) -- cgit v1.2.3-58-ga151 From 1fb81e09d487656aa23f2acb1232c7f56b4c2367 Mon Sep 17 00:00:00 2001 From: "thomas.zeitlhofer+lkml@ze-it.at" <thomas.zeitlhofer+lkml@ze-it.at> Date: Wed, 7 Sep 2016 20:40:38 +0200 Subject: vti: use right inner_mode for inbound inter address family policy checks In case of inter address family tunneling (IPv6 over vti4 or IPv4 over vti6), the inbound policy checks in vti_rcv_cb() and vti6_rcv_cb() are using the wrong address family. As a result, all inbound inter address family traffic is dropped. Use the xfrm_ip2inner_mode() helper, as done in xfrm_input() (i.e., also increment LINUX_MIB_XFRMINSTATEMODEERROR in case of error), to select the inner_mode that contains the right address family for the inbound policy checks. Signed-off-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/ipv4/ip_vti.c | 15 ++++++++++++++- net/ipv6/ip6_vti.c | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cc701fa70b12..5d7944f394d9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14040..52a2f735881f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -340,6 +340,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -357,7 +358,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); -- cgit v1.2.3-58-ga151 From 7303a1475008bee5c3e82a06a282568415690d72 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Date: Thu, 8 Sep 2016 17:54:11 +0800 Subject: sctp: identify chunks that need to be fragmented at IP level Previously, without GSO, it was easy to identify it: if the chunk didn't fit and there was no data chunk in the packet yet, we could fragment at IP level. So if there was an auth chunk and we were bundling a big data chunk, it would fragment regardless of the size of the auth chunk. This also works for the context of PMTU reductions. But with GSO, we cannot distinguish such PMTU events anymore, as the packet is allowed to exceed PMTU. So we need another check: to ensure that the chunk that we are adding, actually fits the current PMTU. If it doesn't, trigger a flush and let it be fragmented at IP level in the next round. Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/output.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 1f1682b9a6a8..31b7bc35895d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -878,7 +878,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize, pmtu; + size_t psize, pmtu, maxsize; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; @@ -906,6 +906,17 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, goto out; } + /* Similarly, if this chunk was built before a PMTU + * reduction, we have to fragment it at IP level now. So + * if the packet already contains something, we need to + * flush. + */ + maxsize = pmtu - packet->overhead; + if (packet->auth) + maxsize -= WORD_ROUND(packet->auth->skb->len); + if (chunk_len > maxsize) + retval = SCTP_XMIT_PMTU_FULL; + /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of -- cgit v1.2.3-58-ga151 From 83843c80dcf11a78995d167255b03072a1e49c2c Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Sun, 28 Aug 2016 13:10:37 +0200 Subject: mac80211: fix tim recalculation after PS response Handle the case where the mac80211 intermediate queues are empty and the driver has buffered frames Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 76b737dcc36f..aa58df80ede0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { - unsigned long tids = sta->txq_buffered_tids & driver_release_tids; int tid; /* @@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) + if (!(driver_release_tids & BIT(tid)) || + txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); -- cgit v1.2.3-58-ga151 From df6ef5d8a87ace995d5c10a7bd684be05911a321 Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Sun, 4 Sep 2016 18:00:59 +0200 Subject: mac80211: fix sequence number assignment for PS response frames When using intermediate queues, sequence number allocation is deferred until dequeue. This doesn't work for PS response frames, which bypass those queues. Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/tx.c | 65 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 502396694f47..cc8e95554b48 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -796,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -853,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) + if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, + tx->skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; @@ -1243,36 +1274,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_txq *txq = NULL; - - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - return NULL; - - if (!ieee80211_is_data(hdr->frame_control)) - return NULL; - - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; - } - - if (!txq) - return NULL; - - return to_txq_info(txq); -} - static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); @@ -3264,7 +3265,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) + if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; -- cgit v1.2.3-58-ga151 From 5df20f2141eadb5430caaad20eceac61cfe0f139 Mon Sep 17 00:00:00 2001 From: "Pedersen, Thomas" <twp@qca.qualcomm.com> Date: Tue, 6 Sep 2016 11:59:00 -0700 Subject: mac80211: make mpath path fixing more robust A fixed mpath was not quite being treated as such: 1) if a PERR frame was received, a fixed mpath was deactivated. 2) queued path discovery for fixed mpath was potentially being considered, changing mpath state. 3) other mpath flags were potentially being inherited when fixing the mpath. Just assign PATH_FIXED and SN_VALID. This solves several issues when fixing a mesh path in one direction. The reverse direction mpath should probably also be fixed, or root announcements at least be enabled. Signed-off-by: Thomas Pedersen <twp@qca.qualcomm.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/mesh_hwmp.c | 3 ++- net/mac80211/mesh_pathtbl.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde835f..faccef977670 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, sta = next_hop_deref_protected(mpath); if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && + !(mpath->flags & MESH_PATH_FIXED) && (!(mpath->flags & MESH_PATH_SN_VALID) || SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; @@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { + if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) { spin_unlock_bh(&mpath->state_lock); goto enddiscovery; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 6db2ddfa0695..f0e6175a9821 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->metric = 0; mpath->hop_count = 0; mpath->exp_time = 0; - mpath->flags |= MESH_PATH_FIXED; + mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); mesh_path_tx_pending(mpath); -- cgit v1.2.3-58-ga151 From ecfcdfec7e0cc64215a194044305f02a5a836e6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Fri, 9 Sep 2016 15:38:12 +0200 Subject: netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup() nf_nat_setup_info() returns NF_* verdicts, so convert them to error codes that is what ctnelink expects. This has passed overlook without having any impact since this nf_nat_setup_info() has always returned NF_ACCEPT so far. Since 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable"), this is problem. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..19c081e1b328 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -807,7 +807,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, if (err < 0) return err; - return nf_nat_setup_info(ct, &range, manip); + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int -- cgit v1.2.3-58-ga151 From bf2c4b6f9b74c2ee1dd3c050b181e9b9c86fbcdb Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Thu, 1 Sep 2016 10:50:38 -0400 Subject: svcauth_gss: Revert 64c59a3726f2 ("Remove unnecessary allocation") rsc_lookup steals the passed-in memory to avoid doing an allocation of its own, so we can't just pass in a pointer to memory that someone else is using. If we really want to avoid allocation there then maybe we should preallocate somwhere, or reference count these handles. For now we should revert. On occasion I see this on my server: kernel: kernel BUG at /home/cel/src/linux/linux-2.6/mm/slub.c:3851! kernel: invalid opcode: 0000 [#1] SMP kernel: Modules linked in: cts rpcsec_gss_krb5 sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd btrfs xor iTCO_wdt iTCO_vendor_support raid6_pq pcspkr i2c_i801 i2c_smbus lpc_ich mfd_core mei_me sg mei shpchp wmi ioatdma ipmi_si ipmi_msghandler acpi_pad acpi_power_meter rpcrdma ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm nfsd nfs_acl lockd grace auth_rpcgss sunrpc ip_tables xfs libcrc32c mlx4_ib mlx4_en ib_core sr_mod cdrom sd_mod ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel igb mlx4_core ahci libahci libata ptp pps_core dca i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod kernel: CPU: 7 PID: 145 Comm: kworker/7:2 Not tainted 4.8.0-rc4-00006-g9d06b0b #15 kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 kernel: Workqueue: events do_cache_clean [sunrpc] kernel: task: ffff8808541d8000 task.stack: ffff880854344000 kernel: RIP: 0010:[<ffffffff811e7075>] [<ffffffff811e7075>] kfree+0x155/0x180 kernel: RSP: 0018:ffff880854347d70 EFLAGS: 00010246 kernel: RAX: ffffea0020fe7660 RBX: ffff88083f9db064 RCX: 146ff0f9d5ec5600 kernel: RDX: 000077ff80000000 RSI: ffff880853f01500 RDI: ffff88083f9db064 kernel: RBP: ffff880854347d88 R08: ffff8808594ee000 R09: ffff88087fdd8780 kernel: R10: 0000000000000000 R11: ffffea0020fe76c0 R12: ffff880853f01500 kernel: R13: ffffffffa013cf76 R14: ffffffffa013cff0 R15: ffffffffa04253a0 kernel: FS: 0000000000000000(0000) GS:ffff88087fdc0000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007fed60b020c3 CR3: 0000000001c06000 CR4: 00000000001406e0 kernel: Stack: kernel: ffff8808589f2f00 ffff880853f01500 0000000000000001 ffff880854347da0 kernel: ffffffffa013cf76 ffff8808589f2f00 ffff880854347db8 ffffffffa013d006 kernel: ffff8808589f2f20 ffff880854347e00 ffffffffa0406f60 0000000057c7044f kernel: Call Trace: kernel: [<ffffffffa013cf76>] rsc_free+0x16/0x90 [auth_rpcgss] kernel: [<ffffffffa013d006>] rsc_put+0x16/0x30 [auth_rpcgss] kernel: [<ffffffffa0406f60>] cache_clean+0x2e0/0x300 [sunrpc] kernel: [<ffffffffa04073ee>] do_cache_clean+0xe/0x70 [sunrpc] kernel: [<ffffffff8109a70f>] process_one_work+0x1ff/0x3b0 kernel: [<ffffffff8109b15c>] worker_thread+0x2bc/0x4a0 kernel: [<ffffffff8109aea0>] ? rescuer_thread+0x3a0/0x3a0 kernel: [<ffffffff810a0ba4>] kthread+0xe4/0xf0 kernel: [<ffffffff8169c47f>] ret_from_fork+0x1f/0x40 kernel: [<ffffffff810a0ac0>] ? kthread_stop+0x110/0x110 kernel: Code: f7 ff ff eb 3b 65 8b 05 da 30 e2 7e 89 c0 48 0f a3 05 a0 38 b8 00 0f 92 c0 84 c0 0f 85 d1 fe ff ff 0f 1f 44 00 00 e9 f5 fe ff ff <0f> 0b 49 8b 03 31 f6 f6 c4 40 0f 85 62 ff ff ff e9 61 ff ff ff kernel: RIP [<ffffffff811e7075>] kfree+0x155/0x180 kernel: RSP <ffff880854347d70> kernel: ---[ end trace 3fdec044969def26 ]--- It seems to be most common after a server reboot where a client has been using a Kerberos mount, and reconnects to continue its workload. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields <bfields@redhat.com> --- net/sunrpc/auth_gss/svcauth_gss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1d281816f2bf..d8582028b346 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -569,9 +569,10 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) struct rsc *found; memset(&rsci, 0, sizeof(rsci)); - rsci.handle.data = handle->data; - rsci.handle.len = handle->len; + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; found = rsc_lookup(cd, &rsci); + rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) -- cgit v1.2.3-58-ga151 From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng <fgao@ikuai8.com> Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check if get valid seqadj pointer by the nfct_seqadj. Now drop the packet directly when fail to add seqadj extension to avoid dereference NULL pointer in nf_ct_seqadj_init from init_conntrack(). Signed-off-by: Gao Feng <fgao@ikuai8.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++ net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 6793614e6502..e6937318546c 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) #endif } +static inline bool nf_ct_add_synproxy(struct nf_conn *ct, + const struct nf_conn *tmpl) +{ + if (tmpl && nfct_synproxy(tmpl)) { + if (!nfct_seqadj_ext_add(ct)) + return false; + + if (!nfct_synproxy_ext_add(ct)) + return false; + } + + return true; +} + struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { -- cgit v1.2.3-58-ga151 From 715f5552b1e90ba3eecf6d1a6d044d0d5226663f Mon Sep 17 00:00:00 2001 From: Xin Long <lucien.xin@gmail.com> Date: Sat, 10 Sep 2016 23:11:23 +0800 Subject: sctp: hold the transport before using it in sctp_hash_cmp Since commit 4f0087812648 ("sctp: apply rhashtable api to send/recv path"), sctp uses transport rhashtable with .obj_cmpfn sctp_hash_cmp, in which it compares the members of the transport with the rhashtable args to check if it's the right transport. But sctp uses the transport without holding it in sctp_hash_cmp, it can cause a use-after-free panic. As after it gets transport from hashtable, another CPU may close the sk and free the asoc. In sctp_association_free, it frees all the transports, meanwhile, the assoc's refcnt may be reduced to 0, assoc can be destroyed by sctp_association_destroy. So after that, transport->assoc is actually an unavailable memory address in sctp_hash_cmp. Although sctp_hash_cmp is under rcu_read_lock, it still can not avoid this, as assoc is not freed by RCU. This patch is to hold the transport before checking it's members with sctp_transport_hold, in which it checks the refcnt first, holds it if it's not 0. Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path") Signed-off-by: Xin Long <lucien.xin@gmail.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/input.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..1555fb8c68e0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg { static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { + struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; - const struct sctp_transport *t = ptr; - struct sctp_association *asoc = t->asoc; - const struct net *net = x->net; + struct sctp_association *asoc; + int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) - return 1; - if (!net_eq(sock_net(asoc->base.sk), net)) - return 1; + return err; + if (!sctp_transport_hold(t)) + return err; + + asoc = t->asoc; + if (!net_eq(sock_net(asoc->base.sk), x->net)) + goto out; if (x->ep) { if (x->ep != asoc->ep) - return 1; + goto out; } else { if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; + goto out; if (!sctp_bind_addr_match(&asoc->base.bind_addr, x->laddr, sctp_sk(asoc->base.sk))) - return 1; + goto out; } - return 0; + err = 0; +out: + sctp_transport_put(t); + return err; } static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) -- cgit v1.2.3-58-ga151 From ad5987b47e96a0fb6d13fea250e936aed000093c Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Tue, 13 Sep 2016 15:53:55 +0200 Subject: nl80211: validate number of probe response CSA counters Due to an apparent copy/paste bug, the number of counters for the beacon configuration were checked twice, instead of checking the number of probe response counters. Fix this to check the number of probe response counters before parsing those. Cc: stable@vger.kernel.org Fixes: 9a774c78e211 ("cfg80211: Support multiple CSA counters") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f02653a08993..4809f4d2cdcc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6978,7 +6978,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_beacon > + (params.n_counter_offsets_presp > rdev->wiphy.max_num_csa_counters)) return -EINVAL; -- cgit v1.2.3-58-ga151 From 0b97a484e52cb423662eb98904aad82dafcc1f10 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 14 Sep 2016 09:41:34 +0200 Subject: mac80211: check skb_linearize() return value The A-MSDU TX code (within TXQs) didn't always check the return value of skb_linearize() properly, resulting in potentially passing a frag- list SKB down to the driver even when it said it can't handle it. Fix that. Fixes: 6e0456b545456 ("mac80211: add A-MSDU tx support") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/tx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cc8e95554b48..18b285e06bc8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1515,8 +1515,12 @@ out: spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) - skb_linearize(skb); + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + return NULL; + } + } return skb; } -- cgit v1.2.3-58-ga151 From 85d5313ed717ad60769491c7c072d23bc0a68e7a Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 14 Sep 2016 11:38:31 +0200 Subject: mac80211: reject TSPEC TIDs (TSIDs) for aggregation Since mac80211 doesn't currently support TSIDs 8-15 which can only be used after QoS TSPEC negotiation (and not even after WMM negotiation), reject attempts to set up aggregation sessions for them, which might confuse drivers. In mac80211 we do correctly handle that, but the TSIDs should never get used anyway, and drivers might not be able to handle it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/agg-rx.c | 8 +++++++- net/mac80211/agg-tx.c | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..afa94687d5e1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, .timeout = timeout, .ssn = start_seq_num, }; - int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + if (tid >= IEEE80211_FIRST_TSPEC_TSID) { + ht_dbg(sta->sdata, + "STA %pM requests BA session on unsupported tid %d\n", + sta->sta.addr, tid); + goto end_no_lock; + } + if (!sta->sta.ht_cap.ht_supported) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 5650c46bf91a..45319cc01121 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; + if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID)) + return -EINVAL; + ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", pubsta->addr, tid); -- cgit v1.2.3-58-ga151 From d6f64d725bac20df66b2eacd847fc41d7a1905e0 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Date: Thu, 15 Sep 2016 11:40:05 +1200 Subject: net: VRF: Pass original iif to ip_route_input() The function ip_rcv_finish() calls l3mdev_ip_rcv(). On any VRF except the global VRF, this replaces skb->dev with the VRF master interface. When calling ip_route_input_noref() from here, the checks for forwarding look at this master device instead of the initial ingress interface. This will allow packets to be routed which normally would be dropped. For example, an interface that is not assigned an IP address should drop packets, but because the checking is against the master device, the packet will be forwarded. The fix here is to still call l3mdev_ip_rcv(), but remember the initial net_device. This is passed to the other functions within ip_rcv_finish, so they still see the original interface. Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz> Acked-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ip_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4b351af3e67b..d6feabb03516 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + struct net_device *dev = skb->dev; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, dev); if (unlikely(err)) { if (err == -EXDEV) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); @@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * -- cgit v1.2.3-58-ga151 From ffb4d6c8508657824bcef68a36b2a0f9d8c09d10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 15 Sep 2016 08:12:33 -0700 Subject: tcp: fix overflow in __tcp_retransmit_skb() If a TCP socket gets a large write queue, an overflow can happen in a test in __tcp_retransmit_skb() preventing all retransmits. The flow then stalls and resets after timeouts. Tested: sysctl -w net.core.wmem_max=1000000000 netperf -H dest -- -s 1000000000 Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdaef7fd6e47..f53d0cca5fa4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2605,7 +2605,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > - min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), + sk->sk_sndbuf)) return -EAGAIN; if (skb_still_in_host_queue(sk, skb)) -- cgit v1.2.3-58-ga151 From 8ab86c00e349cef9fb14719093a7f198bcc72629 Mon Sep 17 00:00:00 2001 From: "phil.turnbull@oracle.com" <phil.turnbull@oracle.com> Date: Thu, 15 Sep 2016 12:41:44 -0400 Subject: irda: Free skb on irda_accept error path. skb is not freed if newsk is NULL. Rework the error path so free_skb is unconditionally called on function exit. Fixes: c3ea9fa27413 ("[IrDA] af_irda: IRDA_ASSERT cleanups") Signed-off-by: Phil Turnbull <phil.turnbull@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/irda/af_irda.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..ccc244406fb9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } -- cgit v1.2.3-58-ga151 From b588479358ce26f32138e0f0a7ab0678f8e3e601 Mon Sep 17 00:00:00 2001 From: Ilan Tayari <ilant@mellanox.com> Date: Sun, 18 Sep 2016 07:42:53 +0000 Subject: xfrm: Fix memory leak of aead algorithm name commit 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") introduced aead. The function attach_aead kmemdup()s the algorithm name during xfrm_state_construct(). However this memory is never freed. Implementation has since been slightly modified in commit ee5c23176fcc ("xfrm: Clone states properly on migration") without resolving this leak. This patch adds a kfree() call for the aead algorithm name. Fixes: 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") Signed-off-by: Ilan Tayari <ilant@mellanox.com> Acked-by: Rami Rosen <roszenrami@gmail.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..a30f898dc1c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); -- cgit v1.2.3-58-ga151 From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner <jweiner@fb.com> Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Tejun Heo <tj@kernel.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Vladimir Davydov <vdavydov@virtuozzo.com> Cc: <stable@vger.kernel.org> [4.5+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/cgroup.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.2.3-58-ga151 From a435a07f9164dda7c0c26e8ad758881f4bafc127 Mon Sep 17 00:00:00 2001 From: Vincent Bernat <vincent@bernat.im> Date: Sun, 18 Sep 2016 17:46:07 +0200 Subject: net: ipv6: fallback to full lookup if table lookup is unsuitable Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: Vincent Bernat <vincent@bernat.im> Acked-by: David Ahern <dsa@cumulusnetworks.com> Tested-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/route.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49817555449e..e3a224b97905 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1986,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); -- cgit v1.2.3-58-ga151 From b5036cd4ed3173ab8cdbc85e2ba74acf46bafb51 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Date: Tue, 20 Sep 2016 16:17:22 +0200 Subject: ipmr, ip6mr: return lastuse relative to now When I introduced the lastuse member I made a subtle error because it was returned as an absolute value but that is meaningless to user-space as it doesn't allow to see how old exactly an entry is. Let's make it similar to how the bridge returns such values and make it relative to "now" (jiffies). This allows us to show the actual age of the entries and is much more useful (e.g. user-space daemons can age out entries, iproute2 can display the lastuse properly). Fixes: 43b9e1274060 ("net: ipmr/ip6mr: add support for keeping an entry age") Reported-by: Satish Ashok <sashok@cumulusnetworks.com> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ipmr.c | 7 +++++-- net/ipv6/ip6mr.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 26253328d227..a87bcd2d4a94 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6122f9c5cc49..fccb5dd91902 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; -- cgit v1.2.3-58-ga151 From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel <nicolas.dichtel@6wind.com> Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: vti6: fix input path Since commit 1625f4529957, vti6 is broken, all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in xfrm6_rcv_spi(). A new function xfrm6_rcv_tnl() that enables to pass a value to xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function is used in several handlers). CC: Alexey Kodanev <alexey.kodanev@oracle.com> Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- include/net/xfrm.h | 4 +++- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..17934312eecb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3-58-ga151 From adb03115f4590baa280ddc440a8eff08a6be0cb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 20 Sep 2016 18:06:17 -0700 Subject: net: get rid of an signed integer overflow in ip_idents_reserve() Jiri Pirko reported an UBSAN warning happening in ip_idents_reserve() [] UBSAN: Undefined behaviour in ./arch/x86/include/asm/atomic.h:156:11 [] signed integer overflow: [] -2117905507 + -695755206 cannot be represented in type 'int' Since we do not have uatomic_add_return() yet, use atomic_cmpxchg() so that the arithmetics can be done using unsigned int. Fixes: 04ca6973f7c1 ("ip: make IP identifiers less predictable") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Jiri Pirko <jiri@resnulli.us> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/route.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8110..b5b47a26d4ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 delta = 0; + u32 new, delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, p_id) - segs; + /* Do not use atomic_add_return() as it makes UBSAN unhappy */ + do { + old = (u32)atomic_read(p_id); + new = old + delta + segs; + } while (atomic_cmpxchg(p_id, old, new) != old); + + return new - segs; } EXPORT_SYMBOL(ip_idents_reserve); -- cgit v1.2.3-58-ga151 From de1d657816c6fbb70f07b01d50ec669dff0d4e60 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng <ycheng@google.com> Date: Wed, 21 Sep 2016 16:16:14 -0700 Subject: tcp: fix under-accounting retransmit SNMP counters This patch fixes these under-accounting SNMP rtx stats LINUX_MIB_TCPFORWARDRETRANS LINUX_MIB_TCPFASTRETRANS LINUX_MIB_TCPSLOWSTARTRETRANS when retransmitting TSO packets Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f53d0cca5fa4..e15ec82a6319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,7 +2831,7 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS(sock_net(sk), mib_idx); + NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); -- cgit v1.2.3-58-ga151 From 7e32b44361abc77fbc01f2b97b045c405b2583e5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng <ycheng@google.com> Date: Wed, 21 Sep 2016 16:16:15 -0700 Subject: tcp: properly account Fast Open SYN-ACK retrans Since the TFO socket is accepted right off SYN-data, the socket owner can call getsockopt(TCP_INFO) to collect ongoing SYN-ACK retransmission or timeout stats (i.e., tcpi_total_retrans, tcpi_retransmits). Currently those stats are only updated upon handshake completes. This patch fixes it. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 ++ net/ipv4/tcp_timer.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc3..08323bd95f2a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,7 +5885,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { - tp->total_retrans = req->num_retrans; + inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e15ec82a6319..5288cec4a2b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3568,6 +3568,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + if (unlikely(tcp_passive_fastopen(sk))) + tcp_sk(sk)->total_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d84930b2dd95..f712b411f6ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; + icsk->icsk_retransmits++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } -- cgit v1.2.3-58-ga151