From f230d1e891ba1da5953460516960894154f265db Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:06 -0700 Subject: ipv6: Rename the dst_cache helper functions in ip6_tunnel It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch rename: 1. ip6_tnl_dst_check() to ip6_tnl_dst_get() to better reflect that it will take a dst refcnt in the next patch. 2. ip6_tnl_dst_store() to ip6_tnl_dst_set() to have a more conventional name matching with ip6_tnl_dst_get(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv6/ip6_tunnel.c') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420612bc..599b0b419fbc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,7 +126,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct dst_entry *dst = t->dst_cache; @@ -139,7 +139,7 @@ struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { @@ -148,14 +148,14 @@ void ip6_tnl_dst_reset(struct ip6_tnl *t) } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -1010,7 +1010,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1102,7 +1102,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); if (ndst) - ip6_tnl_dst_store(t, ndst); + ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; -- cgit v1.2.3-58-ga151 From cdf3464e6c6bd764277cbbe992cd12da735b92fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:07 -0700 Subject: ipv6: Fix dst_entry refcnt bugs in ip6_tunnel Problems in the current dst_entry cache in the ip6_tunnel: 1. ip6_tnl_dst_set is racy. There is no lock to protect it: - One major problem is that the dst refcnt gets messed up. F.e. the same dst_cache can be released multiple times and then triggering the infamous dst refcnt < 0 warning message. - Another issue is the inconsistency between dst_cache and dst_cookie. It can be reproduced by adding and removing the ip6gre tunnel while running a super_netperf TCP_CRR test. 2. ip6_tnl_dst_get does not take the dst refcnt before returning the dst. This patch: 1. Create a percpu dst_entry cache in ip6_tnl 2. Use a spinlock to protect the dst_cache operations 3. ip6_tnl_dst_get always takes the dst refcnt before returning Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 11 ++++- net/ipv6/ip6_gre.c | 38 ++++++++------- net/ipv6/ip6_tunnel.c | 122 +++++++++++++++++++++++++++++++++++------------ 3 files changed, 123 insertions(+), 48 deletions(-) (limited to 'net/ipv6/ip6_tunnel.c') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 979b081a47e8..60b4f402f78c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -32,6 +32,12 @@ struct __ip6_tnl_parm { __be32 o_key; }; +struct ip6_tnl_dst { + spinlock_t lock; + struct dst_entry *dst; + u32 cookie; +}; + /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -39,8 +45,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct dst_entry *dst_cache; /* cached dst */ - u32 dst_cookie; + struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim { } __packed; struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); +int ip6_tnl_dst_init(struct ip6_tnl *t); +void ip6_tnl_dst_destroy(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 24f5dd8f76a8..646512488c28 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -637,17 +637,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1248,6 +1246,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1259,6 +1258,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 599b0b419fbc..851cf6d1eb45 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,37 +126,90 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - struct dst_entry *dst = t->dst_cache; - - if (dst && dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; - dst_release(dst); - return NULL; + dst_release(idst->dst); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; } + idst->dst = dst; +} + +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) +{ + spin_lock_bh(&idst->lock); + __ip6_tnl_per_cpu_dst_set(idst, dst); + spin_unlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ + struct ip6_tnl_dst *idst; + struct dst_entry *dst; + + idst = raw_cpu_ptr(t->dst_cache); + spin_lock_bh(&idst->lock); + dst = idst->dst; + if (dst) { + if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { + dst_hold(idst->dst); + } else { + __ip6_tnl_per_cpu_dst_set(idst, NULL); + dst = NULL; + } + } + spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt6_get_cookie(rt); - dst_release(t->dst_cache); - t->dst_cache = dst; + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + } EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); +void ip6_tnl_dst_destroy(struct ip6_tnl *t) +{ + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); + /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -271,6 +324,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1016,17 +1072,17 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1128,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1101,14 +1156,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1626,21 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } -- cgit v1.2.3-58-ga151 From 70da5b5c532f0ec8aa76b4f46158da5f010f34b3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:09 -0700 Subject: ipv6: Replace spinlock with seqlock and rcu in ip6_tunnel This patch uses a seqlock to ensure consistency between idst->dst and idst->cookie. It also makes dst freeing from fib tree to undergo a rcu grace period. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 4 ++-- net/ipv6/ip6_fib.c | 9 +++++++-- net/ipv6/ip6_tunnel.c | 51 +++++++++++++++++++++++++----------------------- 3 files changed, 36 insertions(+), 28 deletions(-) (limited to 'net/ipv6/ip6_tunnel.c') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 60b4f402f78c..65c2a9397b3c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,8 +33,8 @@ struct __ip6_tnl_parm { }; struct ip6_tnl_dst { - spinlock_t lock; - struct dst_entry *dst; + seqlock_t lock; + struct dst_entry __rcu *dst; u32 cookie; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e68350bf838b..8a9ec01f4d01 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - dst_free(&pcpu_rt->dst); + rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - dst_free(&rt->dst); + rt6_rcu_free(rt); } } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 851cf6d1eb45..983f0d20f96d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,45 +126,48 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - dst_release(idst->dst); + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); if (dst) { dst_hold(dst); idst->cookie = rt6_get_cookie((struct rt6_info *)dst); } else { idst->cookie = 0; } - idst->dst = dst; -} - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - - spin_lock_bh(&idst->lock); - __ip6_tnl_per_cpu_dst_set(idst, dst); - spin_unlock_bh(&idst->lock); + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); } struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct ip6_tnl_dst *idst; struct dst_entry *dst; + unsigned int seq; + u32 cookie; idst = raw_cpu_ptr(t->dst_cache); - spin_lock_bh(&idst->lock); - dst = idst->dst; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { - dst_hold(idst->dst); - } else { - __ip6_tnl_per_cpu_dst_set(idst, NULL); - dst = NULL; - } + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); + dst_release(dst); + dst = NULL; } - spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); @@ -204,7 +207,7 @@ int ip6_tnl_dst_init(struct ip6_tnl *t) return -ENOMEM; for_each_possible_cpu(i) - spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); return 0; } -- cgit v1.2.3-58-ga151 From 17a10c9215b39a5ea8faeb241759c1aee6553919 Mon Sep 17 00:00:00 2001 From: Matt Bennett Date: Fri, 25 Sep 2015 11:01:47 +1200 Subject: ip6_tunnel: Reduce log level in ip6_tnl_err() to debug Currently error log messages in ip6_tnl_err are printed at 'warn' level. This is different to other tunnel types which don't print any messages. These log messages don't provide any information that couldn't be deduced with networking tools. Also it can be annoying to have one end of the tunnel go down and have the logs fill with pointless messages such as "Path to destination invalid or inactive!". This patch reduces the log level of these messages to 'dbg' level to bring the visible behaviour into line with other tunnel types. Signed-off-by: Matt Bennett Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv6/ip6_tunnel.c') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 983f0d20f96d..eabffbb89795 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -569,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -588,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: -- cgit v1.2.3-58-ga151