From 1536e2857bd38e3bcd19963fd6b3c3287b4747c4 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Thu, 17 Apr 2014 01:25:01 +0900 Subject: tcp: Add a TCP_FASTOPEN socket option to get a max backlog on its listner This patch adds a TCP_FASTOPEN socket option to get a max backlog on its listener to getsockopt(). Signed-off-by: Kenjiro Nakayama Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bd6d52eeffb..eb1dde37e678 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2916,6 +2916,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_USER_TIMEOUT: val = jiffies_to_msecs(icsk->icsk_user_timeout); break; + + case TCP_FASTOPEN: + if (icsk->icsk_accept_queue.fastopenq != NULL) + val = icsk->icsk_accept_queue.fastopenq->max_qlen; + else + val = 0; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; -- cgit v1.2.3-58-ga151 From 86fd14ad1e8c4b8f5e9a7a27b26bdade91dd4bd0 Mon Sep 17 00:00:00 2001 From: Weiping Pan Date: Fri, 18 Apr 2014 12:27:46 +0800 Subject: tcp: make tcp_cwnd_application_limited() static Make tcp_cwnd_application_limited() static and move it from tcp_input.c to tcp_output.c Signed-off-by: Weiping Pan Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 22 ---------------------- net/ipv4/tcp_output.c | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 87d877408188..163d2b467d78 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -558,7 +558,6 @@ void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk); /* tcp_input.c */ -void tcp_cwnd_application_limited(struct sock *sk); void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); void tcp_reset(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6b46eb2f94c..6efed134ab63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4703,28 +4703,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. */ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 025e25093984..29dde97c3c41 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1387,6 +1387,28 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + /* Congestion window validation. (RFC2861) */ static void tcp_cwnd_validate(struct sock *sk) { -- cgit v1.2.3-58-ga151 From 1f3279ae0c13cd742731726b0ed195d5f09b14e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 20 Apr 2014 17:58:17 -0700 Subject: tcp: avoid retransmits of TCP packets hanging in host queues In commit 0e280af026a5 ("tcp: introduce TCPSpuriousRtxHostQueues SNMP counter") we added a logic to detect when a packet was retransmitted while the prior clone was still in a qdisc or driver queue. We are now confident we can do better, and catch the problem before we fragment a TSO packet before retransmit, or in TLP path. This patch fully exploits the logic by simply canceling the spurious retransmit. Original packet is in a queue and will eventually leave the host. This helps to avoid network collapses when some events make the RTO estimations very wrong, particularly when dealing with huge number of sockets with synchronized blast. Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 29dde97c3c41..20847de991ea 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); if (clone_it) { - const struct sk_buff *fclone = skb + 1; - skb_mstamp_get(&skb->skb_mstamp); - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -2061,6 +2054,25 @@ bool tcp_schedule_loss_probe(struct sock *sk) return true; } +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } + return false; +} + /* When probe timeout (PTO) fires, send a new segment if one exists, else * retransmit the last segment. */ @@ -2086,6 +2098,9 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb)) goto rearm_timer; + if (skb_still_in_host_queue(sk, skb)) + goto rearm_timer; + pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; @@ -2407,6 +2422,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; + if (skb_still_in_host_queue(sk, skb)) + return -EBUSY; + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); @@ -2500,7 +2518,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * see tcp_input.c tcp_sacktag_write_queue(). */ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; - } else { + } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; -- cgit v1.2.3-58-ga151 From b57708add31494175be741ed3fd24023b50c3423 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 22 Apr 2014 10:15:23 +0200 Subject: gre: add x-netns support This patch allows to switch the netns when packet is encapsulated or decapsulated. In other word, the encapsulated packet is received in a netns, where the lookup is done to find the tunnel. Once the tunnel is found, the packet is decapsulated and injecting into the corresponding interface which stands to another netns. When one of the two netns is removed, the tunnel is destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 94213c891565..c5a557a06a31 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -410,7 +410,7 @@ static int ipgre_open(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_gre(dev_net(dev), &fl4, + rt = ip_route_output_gre(t->net, &fl4, t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, @@ -434,7 +434,7 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev_net(dev), t->mlink); + in_dev = inetdev_by_index(t->net, t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); } @@ -478,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev) dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; - dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { -- cgit v1.2.3-58-ga151 From 851bdd11ca8b855ee946f50dac0850a4bec875c9 Mon Sep 17 00:00:00 2001 From: xiao jin Date: Fri, 25 Apr 2014 08:53:29 +0800 Subject: inetpeer_gc_worker: trivial cleanup Do not initialize list twice. list_replace_init() already takes care of initializing list. We don't need to initialize it with LIST_HEAD() beforehand. Signed-off-by: xiao jin Reviewed-by: David Cohen Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f424465112..c98cf141f4ed 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -120,7 +120,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min static void inetpeer_gc_worker(struct work_struct *work) { struct inet_peer *p, *n, *c; - LIST_HEAD(list); + struct list_head list; spin_lock_bh(&gc_lock); list_replace_init(&gc_list, &list); -- cgit v1.2.3-58-ga151 From 5a2b646ffe21e6014314b4d1df040e2553e39a3b Mon Sep 17 00:00:00 2001 From: Hisao Tanabe Date: Sun, 27 Apr 2014 19:03:45 +0900 Subject: ipv4: Use predefined value for readability Signed-off-by: Hisao Tanabe Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f4ab72e19af9..5e7aecea05cd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -364,7 +364,7 @@ int ip_options_compile(struct net *net, } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; - if (optptr[2]+3 > optptr[1]) { + if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -376,7 +376,7 @@ int ip_options_compile(struct net *net, optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: - if (optptr[2]+7 > optptr[1]) { + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } @@ -390,7 +390,7 @@ int ip_options_compile(struct net *net, optptr[2] += 8; break; case IPOPT_TS_PRESPEC: - if (optptr[2]+7 > optptr[1]) { + if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } -- cgit v1.2.3-58-ga151 From f768e5bdefe1ec9adbf7a116dfb156b73cacb582 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Apr 2014 21:09:50 +0200 Subject: netfilter: add helper for adding nat extension Reduce copy-past a bit by adding a common helper. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 2 ++ net/ipv4/netfilter/iptable_nat.c | 14 +++----------- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 12 +++--------- net/ipv6/netfilter/ip6table_nat.c | 14 +++----------- net/ipv6/netfilter/nft_chain_nat_ipv6.c | 12 +++--------- net/netfilter/nf_nat_core.c | 24 ++++++++++++++++-------- 6 files changed, 30 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 07eaaf604092..a71dd333ac68 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -48,6 +48,8 @@ unsigned int nf_nat_setup_info(struct nf_conn *ct, extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum); +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct); + /* Is this tuple already taken? (not by us)*/ int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ee2886126e3d..f1787c04a4dd 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -91,17 +91,9 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, if (nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index b5b256d45e67..3964157d826c 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -48,15 +48,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 84c7f33d0cf8..387d8b8fc18d 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -90,17 +90,9 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, if (nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (!nat) { - /* NAT module was loaded late. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 9c3297a768fd..d189fcb437fe 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -47,15 +47,9 @@ static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, if (ct == NULL || nf_ct_is_untracked(ct)) return NF_ACCEPT; - nat = nfct_nat(ct); - if (nat == NULL) { - /* Conntrack module was loaded late, can't add extension. */ - if (nf_ct_is_confirmed(ct)) - return NF_ACCEPT; - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) - return NF_ACCEPT; - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 52ca952b802c..09096a670c45 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -358,6 +358,19 @@ out: rcu_read_unlock(); } +struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + if (nat) + return nat; + + if (!nf_ct_is_confirmed(ct)) + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + + return nat; +} +EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); + unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, @@ -368,14 +381,9 @@ nf_nat_setup_info(struct nf_conn *ct, struct nf_conn_nat *nat; /* nat helper or nfctnetlink also setup binding */ - nat = nfct_nat(ct); - if (!nat) { - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - if (nat == NULL) { - pr_debug("failed to add NAT extension\n"); - return NF_ACCEPT; - } - } + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); -- cgit v1.2.3-58-ga151 From e114a710aa5058c0ba4aa1dfb105132aefeb5e04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Apr 2014 11:58:13 -0700 Subject: tcp: fix cwnd limited checking to improve congestion control Yuchung discovered tcp_is_cwnd_limited() was returning false in slow start phase even if the application filled the socket write queue. All congestion modules take into account tcp_is_cwnd_limited() before increasing cwnd, so this behavior limits slow start from probing the bandwidth at full speed. The problem is that even if write queue is full (aka we are _not_ application limited), cwnd can be under utilized if TSO should auto defer or TCP Small queues decided to hold packets. So the in_flight can be kept to smaller value, and we can get to the point tcp_is_cwnd_limited() returns false. With TCP Small Queues and FQ/pacing, this issue is more visible. We fix this by having tcp_cwnd_validate(), which is supposed to track such things, take into account unsent_segs, the number of segs that we are not sending at the moment due to TSO or TSQ, but intend to send real soon. Then when we are cwnd-limited, remember this fact while we are processing the window of ACKs that comes back. For example, suppose we have a brand new connection with cwnd=10; we are in slow start, and we send a flight of 9 packets. By the time we have received ACKs for all 9 packets we want our cwnd to be 18. We implement this by setting tp->lsnd_pending to 9, and considering ourselves to be cwnd-limited while cwnd is less than twice tp->lsnd_pending (2*9 -> 18). This makes tcp_is_cwnd_limited() more understandable, by removing the GSO/TSO kludge, that tried to work around the issue. Note the in_flight parameter can be removed in a followup cleanup patch. Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 22 +++++++++++++++++++++- net/ipv4/tcp_cong.c | 20 -------------------- net/ipv4/tcp_output.c | 21 ++++++++++++++------- 4 files changed, 36 insertions(+), 28 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 239946868142..4e37c71ecd74 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -230,6 +230,7 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; + u32 lsnd_pending; /* packets inflight or unsent since last xmit */ u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 163d2b467d78..a9fe7bc4f4bb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -974,7 +974,27 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight); + +/* We follow the spirit of RFC2861 to validate cwnd but implement a more + * flexible approach. The RFC suggests cwnd should not be raised unless + * it was fully used previously. But we allow cwnd to grow as long as the + * application has used half the cwnd. + * Example : + * cwnd is 10 (IW10), but application sends 9 frames. + * We allow cwnd to reach 18 when all frames are ACKed. + * This check is safe because it's as aggressive as slow start which already + * risks 100% overshoot. The advantage is that we discourage application to + * either send more filler packets or data to artificially blow up the cwnd + * usage, and allow application-limited process to probe bw more aggressively. + * + * TODO: remove in_flight once we can fix all callers, and their callers... + */ +static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->snd_cwnd < 2 * tp->lsnd_pending; +} static inline void tcp_check_probe_timer(struct sock *sk) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2b9464c93b88..a93b41ba05ff 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 left; - - if (in_flight >= tp->snd_cwnd) - return true; - - left = tp->snd_cwnd - in_flight; - if (sk_can_gso(sk) && - left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left < tp->xmit_size_goal_segs) - return true; - return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20847de991ea..f9181a133462 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out >= tp->snd_cwnd) { + tp->lsnd_pending = tp->packets_out + unsent_segs; + + if (tcp_is_cwnd_limited(sk, 0)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, sent_pkts; + unsigned int tso_segs, sent_pkts, unsent_segs = 0; int cwnd_quota; int result; @@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && tcp_tso_should_defer(sk, skb)) - break; + goto compute_unsent_segs; } /* TCP Small Queues : @@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * there is no smp_mb__after_set_bit() yet */ smp_mb__after_clear_bit(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + u32 unsent_bytes; + +compute_unsent_segs: + unsent_bytes = tp->write_seq - tp->snd_nxt; + unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); break; + } } limit = mss_now; @@ -1990,7 +1997,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk); + tcp_cwnd_validate(sk, unsent_segs); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); -- cgit v1.2.3-58-ga151 From 249015515fe3fc9818d86cb5c83bbc92505ad7dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 May 2014 21:18:05 -0700 Subject: tcp: remove in_flight parameter from cong_avoid() methods Commit e114a710aa505 ("tcp: fix cwnd limited checking to improve congestion control") obsoleted in_flight parameter from tcp_is_cwnd_limited() and its callers. This patch does the removal as promised. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 8 +++----- net/ipv4/tcp_bic.c | 5 ++--- net/ipv4/tcp_cong.c | 4 ++-- net/ipv4/tcp_cubic.c | 5 ++--- net/ipv4/tcp_highspeed.c | 4 ++-- net/ipv4/tcp_htcp.c | 4 ++-- net/ipv4/tcp_hybla.c | 7 +++---- net/ipv4/tcp_illinois.c | 5 ++--- net/ipv4/tcp_input.c | 9 ++++----- net/ipv4/tcp_lp.c | 5 ++--- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_scalable.c | 5 ++--- net/ipv4/tcp_vegas.c | 7 +++---- net/ipv4/tcp_veno.c | 9 ++++----- net/ipv4/tcp_yeah.c | 5 ++--- 15 files changed, 36 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index a9fe7bc4f4bb..3c9418456640 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -796,7 +796,7 @@ struct tcp_congestion_ops { /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked, u32 in_flight); + void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ @@ -828,7 +828,7 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); extern struct tcp_congestion_ops tcp_init_congestion_ops; u32 tcp_reno_ssthresh(struct sock *sk); -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight); +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) @@ -986,10 +986,8 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. - * - * TODO: remove in_flight once we can fix all callers, and their callers... */ -static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 821846fb0a7e..d5de69bc04f5 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a93b41ba05ff..7b09d8b49fa5 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -317,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 8bf224516ba2..ba2a4f3a6a1e 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b9e7bad77c0..1c4908280d92 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a194acfd923..031361311a8b 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index a15a799bf768..d8f8f05a4951 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -87,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -101,11 +100,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, ca->minrtt_us = tp->srtt_us; } - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (!ca->hybla_en) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 863d105e3015..5999b3972e64 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -255,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); @@ -265,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, update_params(sk); /* RFC2861 only increase cwnd if fully utilized */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* In slow start */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6efed134ab63..350b2072f0ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2938,10 +2938,11 @@ static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -3364,7 +3365,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; @@ -3397,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= FLAG_SND_UNA_ADVANCED; prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3452,7 +3451,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked, prior_in_flight); + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index c9aecae31327..1e70fa8fa793 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9181a133462..89277a34f2c9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1408,7 +1408,7 @@ static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) tp->lsnd_pending = tp->packets_out + unsent_segs; - if (tcp_is_cwnd_limited(sk, 0)) { + if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 0ac50836da4d..8250949b8853 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,12 +15,11 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 48539fff6357..9a5e05f27f4f 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) return min(tp->snd_ssthresh, tp->snd_cwnd-1); } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } @@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 1b8e28fcd7e1..27b9825753d1 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ @@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, acked, in_flight); + tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 5ede0e727945..599b79b8eac0 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, - u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) + if (!tcp_is_cwnd_limited(sk)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) -- cgit v1.2.3-58-ga151 From ed70fcfcee953a76028bfc3f963d2167c2990020 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 2 May 2014 16:29:38 -0700 Subject: net: Call skb_checksum_init in IPv4 Call skb_checksum_init instead of private functions. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 6 ++++++ net/ipv4/tcp_ipv4.c | 25 ++----------------------- net/ipv4/udp.c | 19 ++----------------- 3 files changed, 10 insertions(+), 40 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 3ec2b0fb9d83..1988cefdbb70 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -342,6 +342,12 @@ static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *d __ip_select_ident(iph, dst, more); } +static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) +{ + return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + skb->len, proto, 0); +} + /* * Map a multicast IP onto multicast MAC for type ethernet. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 438f3b95143d..ad166dcc278f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1744,28 +1744,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, iph->saddr, - iph->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, IPPROTO_TCP, 0); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - - /* The socket must have it's spinlock held when we get * here. * @@ -1960,7 +1938,8 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ - if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) + + if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; th = tcp_hdr(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4468e1adc094..f2d05d7be743 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1672,7 +1672,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { - const struct iphdr *iph; int err; UDP_SKB_CB(skb)->partial_cov = 0; @@ -1684,22 +1683,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return err; } - iph = ip_hdr(skb); - if (uh->check == 0) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - if (!skb_csum_unnecessary(skb)) - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len, proto, 0); - /* Probably, we should checksum udp header (it should be in cache - * in any case) and data in tiny packets (< rx copybreak). - */ - - return 0; + return skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* -- cgit v1.2.3-58-ga151 From 698365fa1874aa7635d51667a34a2842228e9837 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 5 May 2014 15:55:55 -0700 Subject: net: clean up snmp stats code commit 8f0ea0fe3a036a47767f9c80e (snmp: reduce percpu needs by 50%) reduced snmp array size to 1, so technically it doesn't have to be an array any more. What's more, after the following commit: commit 933393f58fef9963eac61db8093689544e29a600 Date: Thu Dec 22 11:58:51 2011 -0600 percpu: Remove irqsafe_cpu_xxx variants We simply say that regular this_cpu use must be safe regardless of preemption and interrupt state. That has no material change for x86 and s390 implementations of this_cpu operations. However, arches that do not provide their own implementation for this_cpu operations will now get code generated that disables interrupts instead of preemption. probably no arch wants to have SNMP_ARRAY_SZ == 2. At least after almost 3 years, no one complains. So, just convert the array to a single pointer and remove snmp_mib_init() and snmp_mib_free() as well. Cc: Christoph Lameter Cc: Eric Dumazet Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip.h | 18 ++-------- include/net/snmp.h | 32 ++++++++--------- net/dccp/proto.c | 9 ++--- net/ipv4/af_inet.c | 93 ++++++++++++++++-------------------------------- net/ipv4/proc.c | 24 ++++++------- net/ipv6/addrconf.c | 17 ++++----- net/ipv6/addrconf_core.c | 2 +- net/ipv6/af_inet6.c | 42 +++++++++------------- net/ipv6/proc.c | 6 ++-- net/sctp/protocol.c | 9 ++--- net/xfrm/xfrm_policy.c | 10 +++--- net/xfrm/xfrm_proc.c | 3 +- 12 files changed, 103 insertions(+), 162 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 1988cefdbb70..16146b667ddb 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -196,27 +196,15 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd) #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd) -unsigned long snmp_fold_field(void __percpu *mib[], int offt); +unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off); +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else -static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off) +static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); } #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align); - -static inline void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) -{ - int i; - - BUG_ON(ptr == NULL); - for (i = 0; i < SNMP_ARRAY_SZ; i++) { - free_percpu(ptr[i]); - ptr[i] = NULL; - } -} void inet_get_local_port_range(struct net *net, int *low, int *high); diff --git a/include/net/snmp.h b/include/net/snmp.h index 71596261fa99..f1f27fdbb0d5 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -116,51 +116,49 @@ struct linux_xfrm_mib { unsigned long mibs[LINUX_MIB_XFRMMAX]; }; -#define SNMP_ARRAY_SZ 1 - #define DEFINE_SNMP_STAT(type, name) \ - __typeof__(type) __percpu *name[SNMP_ARRAY_SZ] + __typeof__(type) __percpu *name #define DEFINE_SNMP_STAT_ATOMIC(type, name) \ __typeof__(type) *name #define DECLARE_SNMP_STAT(type, name) \ - extern __typeof__(type) __percpu *name[SNMP_ARRAY_SZ] + extern __typeof__(type) __percpu *name #define SNMP_INC_STATS_BH(mib, field) \ - __this_cpu_inc(mib[0]->mibs[field]) + __this_cpu_inc(mib->mibs[field]) #define SNMP_INC_STATS_USER(mib, field) \ - this_cpu_inc(mib[0]->mibs[field]) + this_cpu_inc(mib->mibs[field]) #define SNMP_INC_STATS_ATOMIC_LONG(mib, field) \ atomic_long_inc(&mib->mibs[field]) #define SNMP_INC_STATS(mib, field) \ - this_cpu_inc(mib[0]->mibs[field]) + this_cpu_inc(mib->mibs[field]) #define SNMP_DEC_STATS(mib, field) \ - this_cpu_dec(mib[0]->mibs[field]) + this_cpu_dec(mib->mibs[field]) #define SNMP_ADD_STATS_BH(mib, field, addend) \ - __this_cpu_add(mib[0]->mibs[field], addend) + __this_cpu_add(mib->mibs[field], addend) #define SNMP_ADD_STATS_USER(mib, field, addend) \ - this_cpu_add(mib[0]->mibs[field], addend) + this_cpu_add(mib->mibs[field], addend) #define SNMP_ADD_STATS(mib, field, addend) \ - this_cpu_add(mib[0]->mibs[field], addend) + this_cpu_add(mib->mibs[field], addend) /* - * Use "__typeof__(*mib[0]) *ptr" instead of "__typeof__(mib[0]) ptr" + * Use "__typeof__(*mib) *ptr" instead of "__typeof__(mib) ptr" * to make @ptr a non-percpu pointer. */ #define SNMP_UPD_PO_STATS(mib, basefield, addend) \ do { \ - __typeof__(*mib[0]->mibs) *ptr = mib[0]->mibs; \ + __typeof__(*mib->mibs) *ptr = mib->mibs; \ this_cpu_inc(ptr[basefield##PKTS]); \ this_cpu_add(ptr[basefield##OCTETS], addend); \ } while (0) #define SNMP_UPD_PO_STATS_BH(mib, basefield, addend) \ do { \ - __typeof__(*mib[0]->mibs) *ptr = mib[0]->mibs; \ + __typeof__(*mib->mibs) *ptr = mib->mibs; \ __this_cpu_inc(ptr[basefield##PKTS]); \ __this_cpu_add(ptr[basefield##OCTETS], addend); \ } while (0) @@ -170,7 +168,7 @@ struct linux_xfrm_mib { #define SNMP_ADD_STATS64_BH(mib, field, addend) \ do { \ - __typeof__(*mib[0]) *ptr = __this_cpu_ptr((mib)[0]); \ + __typeof__(*mib) *ptr = __this_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -191,8 +189,8 @@ struct linux_xfrm_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) \ do { \ - __typeof__(*mib[0]) *ptr; \ - ptr = __this_cpu_ptr((mib)[0]); \ + __typeof__(*mib) *ptr; \ + ptr = __this_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index eb892b4f4814..de2c1e719305 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1084,14 +1084,15 @@ EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int dccp_mib_init(void) { - return snmp_mib_init((void __percpu **)dccp_statistics, - sizeof(struct dccp_mib), - __alignof__(struct dccp_mib)); + dccp_statistics = alloc_percpu(struct dccp_mib); + if (!dccp_statistics) + return -ENOMEM; + return 0; } static inline void dccp_mib_exit(void) { - snmp_mib_free((void __percpu **)dccp_statistics); + free_percpu(dccp_statistics); } static int thash_entries; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8c54870db792..6765d91b8e75 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1476,22 +1476,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; - int i, j; + int i; - for_each_possible_cpu(i) { - for (j = 0; j < SNMP_ARRAY_SZ; j++) - res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); - } + for_each_possible_cpu(i) + res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; @@ -1502,7 +1500,7 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) u64 v; unsigned int start; - bhptr = per_cpu_ptr(mib[0], cpu); + bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin_irq(syncp); @@ -1516,25 +1514,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ - BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, align); - if (!ptr[0]) - return -ENOMEM; - -#if SNMP_ARRAY_SZ == 2 - ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) { - free_percpu(ptr[0]); - ptr[0] = NULL; - return -ENOMEM; - } -#endif - return 0; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1570,40 +1549,30 @@ static __net_init int ipv4_mib_init_net(struct net *net) { int i; - if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); + if (!net->mib.tcp_statistics) goto err_tcp_mib; - if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i); + af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i); - u64_stats_init(&af_inet_stats->syncp); -#endif } - if (snmp_mib_init((void __percpu **)net->mib.net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + net->mib.net_statistics = alloc_percpu(struct linux_mib); + if (!net->mib.net_statistics) goto err_net_mib; - if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udp_statistics) goto err_udp_mib; - if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_statistics = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_statistics) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); + if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); @@ -1614,17 +1583,17 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + free_percpu(net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + free_percpu(net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_statistics); + free_percpu(net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void __percpu **)net->mib.net_statistics); + free_percpu(net->mib.net_statistics); err_net_mib: - snmp_mib_free((void __percpu **)net->mib.ip_statistics); + free_percpu(net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } @@ -1632,12 +1601,12 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); - snmp_mib_free((void __percpu **)net->mib.icmp_statistics); - snmp_mib_free((void __percpu **)net->mib.udplite_statistics); - snmp_mib_free((void __percpu **)net->mib.udp_statistics); - snmp_mib_free((void __percpu **)net->mib.net_statistics); - snmp_mib_free((void __percpu **)net->mib.ip_statistics); - snmp_mib_free((void __percpu **)net->mib.tcp_statistics); + free_percpu(net->mib.icmp_statistics); + free_percpu(net->mib.udplite_statistics); + free_percpu(net->mib.udp_statistics); + free_percpu(net->mib.net_statistics); + free_percpu(net->mib.ip_statistics); + free_percpu(net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ad737fad6d8b..ae0af9386f7c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -345,15 +345,15 @@ static void icmp_put(struct seq_file *seq) for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); @@ -379,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); @@ -395,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp_fold_field(net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -410,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udp_statistics, + snmp_fold_field(net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -421,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.udplite_statistics, + snmp_fold_field(net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -458,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.net_statistics, + snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -468,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %llu", - snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1ac13c0300b7..5667b3003af9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -275,19 +275,14 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) { int i; - if (snmp_mib_init((void __percpu **)idev->stats.ipv6, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + if (!idev->stats.ipv6) goto err_ip; for_each_possible_cpu(i) { struct ipstats_mib *addrconf_stats; - addrconf_stats = per_cpu_ptr(idev->stats.ipv6[0], i); + addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i); u64_stats_init(&addrconf_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - addrconf_stats = per_cpu_ptr(idev->stats.ipv6[1], i); - u64_stats_init(&addrconf_stats->syncp); -#endif } @@ -305,7 +300,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) err_icmpmsg: kfree(idev->stats.icmpv6dev); err_icmp: - snmp_mib_free((void __percpu **)idev->stats.ipv6); + free_percpu(idev->stats.ipv6); err_ip: return -ENOMEM; } @@ -4363,7 +4358,7 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, memset(&stats[items], 0, pad); } -static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib, +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, int items, int bytes, size_t syncpoff) { int i; @@ -4383,7 +4378,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, { switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6, + __snmp6_fill_stats64(stats, idev->stats.ipv6, IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 4c11cbcf8308..e6960457f625 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -123,7 +123,7 @@ static void snmp6_free_dev(struct inet6_dev *idev) { kfree(idev->stats.icmpv6msgdev); kfree(idev->stats.icmpv6dev); - snmp_mib_free((void __percpu **)idev->stats.ipv6); + free_percpu(idev->stats.ipv6); } /* Nobody refers to this device, we may destroy it. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d935889f1008..dc47cc757b80 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -715,33 +715,25 @@ static int __net_init ipv6_init_mibs(struct net *net) { int i; - if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udp_stats_in6) return -ENOMEM; - if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_stats_in6) goto err_udplite_mib; - if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ipv6_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet6_stats; - af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[0], i); + af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); u64_stats_init(&af_inet6_stats->syncp); -#if SNMP_ARRAY_SZ == 2 - af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics[1], i); - u64_stats_init(&af_inet6_stats->syncp); -#endif } - if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics, - sizeof(struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) + net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); + if (!net->mib.icmpv6_statistics) goto err_icmp_mib; net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), GFP_KERNEL); @@ -750,22 +742,22 @@ static int __net_init ipv6_init_mibs(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); + free_percpu(net->mib.ipv6_statistics); err_ip_mib: - snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); + free_percpu(net->mib.udplite_stats_in6); err_udplite_mib: - snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } static void ipv6_cleanup_mibs(struct net *net) { - snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); - snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); - snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); - snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + free_percpu(net->mib.udp_stats_in6); + free_percpu(net->mib.udplite_stats_in6); + free_percpu(net->mib.ipv6_statistics); + free_percpu(net->mib.icmpv6_statistics); kfree(net->mib.icmpv6msg_statistics); } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 091d066a57b3..69cb47625df7 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -201,7 +201,7 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib, } } -static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib, +static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { int i; @@ -215,7 +215,7 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; - snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics, + snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); @@ -245,7 +245,7 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); - snmp6_seq_show_item64(seq, (void __percpu **)idev->stats.ipv6, + snmp6_seq_show_item64(seq, idev->stats.ipv6, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c09757fbf803..074b60e2faab 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1100,14 +1100,15 @@ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) static inline int init_sctp_mibs(struct net *net) { - return snmp_mib_init((void __percpu **)net->sctp.sctp_statistics, - sizeof(struct sctp_mib), - __alignof__(struct sctp_mib)); + net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib); + if (!net->sctp.sctp_statistics) + return -ENOMEM; + return 0; } static inline void cleanup_sctp_mibs(struct net *net) { - snmp_mib_free((void __percpu **)net->sctp.sctp_statistics); + free_percpu(net->sctp.sctp_statistics); } static void sctp_v4_pf_init(void) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c08fbd11ceff..e63f242ae03e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2783,21 +2783,19 @@ static struct notifier_block xfrm_dev_notifier = { static int __net_init xfrm_statistics_init(struct net *net) { int rv; - - if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics, - sizeof(struct linux_xfrm_mib), - __alignof__(struct linux_xfrm_mib)) < 0) + net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); + if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) - snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); + free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); - snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); + free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index fc5abd0b456f..9c4fbd8935f4 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -54,8 +54,7 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) int i; for (i = 0; xfrm_mib_list[i].name; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, - snmp_fold_field((void __percpu **) - net->mib.xfrm_statistics, + snmp_fold_field(net->mib.xfrm_statistics, xfrm_mib_list[i].entry)); return 0; } -- cgit v1.2.3-58-ga151 From 32a4be48907b930606f8736caa15c812af802227 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 6 May 2014 11:15:56 -0700 Subject: ipv4: remove inet_addr_hash_lock in devinet.c All the callers hold RTNL lock, so there is no need to use inet_addr_hash_lock to protect the hash list. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index bdbf68bb2e2d..78692e46b64f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -106,7 +106,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static DEFINE_SPINLOCK(inet_addr_hash_lock); static u32 inet_addr_hash(struct net *net, __be32 addr) { @@ -119,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); - spin_unlock(&inet_addr_hash_lock); } static void inet_hash_remove(struct in_ifaddr *ifa) { - spin_lock(&inet_addr_hash_lock); + ASSERT_RTNL(); hlist_del_init_rcu(&ifa->hash); - spin_unlock(&inet_addr_hash_lock); } /** -- cgit v1.2.3-58-ga151 From b1036c6a470ccf5f18490a7ce4c99422d3bf77c4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 7 May 2014 16:51:46 -0700 Subject: gre: Call skb_checksum_simple_validate Use skb_checksum_simple_validate to verify checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 250be7421ab3..fbfd829f4049 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -93,28 +93,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); -static __sum16 check_checksum(struct sk_buff *skb) -{ - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - - if (!csum) - break; - /* Fall through. */ - - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - break; - } - - return csum; -} - static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -141,7 +119,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { - if (check_checksum(skb)) { + if (skb_checksum_simple_validate(skb)) { *csum_err = true; return -EINVAL; } -- cgit v1.2.3-58-ga151 From de08dc1a8e7052f4cbaf920ce0af6bb261595705 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 7 May 2014 16:52:10 -0700 Subject: igmp: Call skb_checksum_simple_validate Use skb_checksum_simple_validate to verify checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 97e4d1655d26..17d34e3c2ac3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto drop; - } + if (skb_checksum_simple_validate(skb)) + goto drop; ih = igmp_hdr(skb); switch (ih->type) { -- cgit v1.2.3-58-ga151 From 29a96e1f36dbd6fc7911a6d517625c656ba4809f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 7 May 2014 16:52:21 -0700 Subject: icmp: Call skb_checksum_simple_validate Use skb_checksum_simple_validate to verify checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0134663fdbce..fe52666dc43c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -908,16 +908,8 @@ int icmp_rcv(struct sk_buff *skb) ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto csum_error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; -- cgit v1.2.3-58-ga151 From 0a80966b1043c3e2dc684140f155a3fded308660 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 7 May 2014 16:52:39 -0700 Subject: net: Verify UDP checksum before handoff to encap Moving validation of UDP checksum to be done in UDP not encap layer. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++++ net/ipv6/udp.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f2d05d7be743..54ea0a3a48f1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1495,6 +1495,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fc2be63e32d5..7edf096867c4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -634,6 +634,10 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { int ret; + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + ret = encap_rcv(sk, skb); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), -- cgit v1.2.3-58-ga151 From 60ff746739bf805a912484643c720b6124826140 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 4 May 2014 16:39:18 -0700 Subject: net: rename local_df to ignore_df MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by several people, rename local_df to ignore_df, since it means "ignore df bit if it is set". Cc: Maciej Å»enczykowski Cc: Florian Westphal Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Maciej Å»enczykowski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- include/net/ip.h | 6 +++--- include/net/ip6_route.h | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 12 ++++++------ net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/ip6_output.c | 12 ++++++------ net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/xfrm6_output.c | 6 +++--- net/l2tp/l2tp_core.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 20 ++++++++++---------- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/output.c | 2 +- 17 files changed, 42 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ca0dda5a42e..7a9beeb1c458 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -426,7 +426,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority - * @local_df: allow local fragmentation + * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header @@ -514,7 +514,7 @@ struct sk_buff { }; __u32 priority; kmemcheck_bitfield_begin(flags1); - __u8 local_df:1, + __u8 ignore_df:1, cloned:1, ip_summed:2, nohdr:1, diff --git a/include/net/ip.h b/include/net/ip.h index 16146b667ddb..55752985c144 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -269,7 +269,7 @@ static inline bool ip_sk_use_pmtu(const struct sock *sk) return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; } -static inline bool ip_sk_local_df(const struct sock *sk) +static inline bool ip_sk_ignore_df(const struct sock *sk) { return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO || inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT; @@ -304,7 +304,7 @@ static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, s { struct iphdr *iph = ip_hdr(skb); - if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { + if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { /* This is only to work around buggy Windows95/2000 * VJ compression implementations. If the ID field * does not change, they drop every other packet in @@ -320,7 +320,7 @@ static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *d { struct iphdr *iph = ip_hdr(skb); - if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { + if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); inet_sk(sk)->inet_id += 1 + more; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6c4f5eac98e7..38e41e4d0998 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -185,7 +185,7 @@ static inline bool ip6_sk_accept_pmtu(const struct sock *sk) inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT; } -static inline bool ip6_sk_local_df(const struct sock *sk) +static inline bool ip6_sk_ignore_df(const struct sock *sk) { return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO || inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b62343f5837..3d74530ae82b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -694,7 +694,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif memcpy(new->cb, old->cb, sizeof(old->cb)); new->csum = old->csum; - new->local_df = old->local_df; + new->ignore_df = old->ignore_df; new->pkt_type = old->pkt_type; new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); @@ -3913,7 +3913,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; - skb->local_df = 0; + skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; secpath_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 6f111e48e11c..3a83ce5efa80 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,7 +42,7 @@ static bool ip_may_fragment(const struct sk_buff *skb) { return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->local_df; + skb->ignore_df; } static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a52f50187b54..6aa4380fde1a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -415,7 +415,7 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; @@ -501,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); @@ -866,7 +866,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1189,7 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; if (cork->length + size > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1350,10 +1350,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - skb->local_df = ip_sk_local_df(sk); + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || inet->pmtudisc == IP_PMTUDISC_PROBE || diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index f40f321b41fc..b8f6381c7d0b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -34,7 +34,7 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) if (!err) { ip_send_check(ip_hdr(skb)); - skb->local_df = 1; + skb->ignore_df = 1; } return err; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 40e701f2e1e0..8e8c018d9d2d 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; mtu = dst_mtu(skb_dst(skb)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 31a38bde69ef..ab0cc57f779c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -219,7 +219,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->mark = sk->sk_mark; mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, @@ -347,11 +347,11 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - /* ipv6 conntrack defrag sets max_frag_size + local_df */ + /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) return true; - if (skb->local_df) + if (skb->ignore_df) return false; if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) @@ -559,7 +559,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. */ - if (unlikely(!skb->local_df && skb->len > mtu) || + if (unlikely(!skb->ignore_df && skb->len > mtu) || (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { if (skb->sk && dst_allfrag(skb_dst(skb))) @@ -1234,7 +1234,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; - if (ip6_sk_local_df(sk)) + if (ip6_sk_ignore_df(sk)) maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; else maxnonfragsize = mtu; @@ -1544,7 +1544,7 @@ int ip6_push_pending_frames(struct sock *sk) } /* Allow local fragmentation. */ - skb->local_df = ip6_sk_local_df(sk); + skb->ignore_df = ip6_sk_ignore_df(sk); *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 767ab8da8218..0d5279fd852a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -451,7 +451,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) } sub_frag_mem_limit(&fq->q, head->truesize); - head->local_df = 1; + head->ignore_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 19ef329bdbf8..f47c8b153dd3 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -78,7 +78,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->local_df && skb->len > mtu) { + if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; if (xfrm6_local_dontfrag(skb)) @@ -120,7 +120,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) #endif skb->protocol = htons(ETH_P_IPV6); - skb->local_df = 1; + skb->ignore_df = 1; return x->outer_mode->output2(x, skb); } @@ -150,7 +150,7 @@ static int __xfrm6_output(struct sk_buff *skb) if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->local_df && skb->len > mtu && skb->sk) { + } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { xfrm_local_error(skb, mtu); return -EMSGSIZE; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index aa1a9d44c107..ed0716a075ba 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1073,7 +1073,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, } /* Queue the packet to IP for output */ - skb->local_df = 1; + skb->ignore_df = 1; #if IS_ENABLED(CONFIG_IPV6) if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) error = inet6_csk_xmit(tunnel->sock, skb, NULL); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index c47444e4cf8c..487b55e04337 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -562,7 +562,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -590,7 +590,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -684,7 +684,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -774,7 +774,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); @@ -886,7 +886,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_select_ident(skb, &rt->dst, NULL); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -974,7 +974,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) @@ -1023,7 +1023,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_send_check(ip_hdr(skb)); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); rcu_read_unlock(); @@ -1060,7 +1060,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); rcu_read_unlock(); @@ -1157,7 +1157,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); rcu_read_unlock(); @@ -1249,7 +1249,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_nat_icmp_v6(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; + skb->ignore_df = 1; rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); rcu_read_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index ebb6e2442554..0856f014d7a9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -172,7 +172,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 21cceb3bdf78..a93efa3f64c3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -170,7 +170,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->local_df = 1; + skb->ignore_df = 1; inet_get_local_port_range(net, &port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2b1738ef9394..4dc5d9e08311 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -216,7 +216,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) - skb->local_df = 1; + skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); diff --git a/net/sctp/output.c b/net/sctp/output.c index 0f4d15fc2627..01ab8e0723f0 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -591,7 +591,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); - nskb->local_df = packet->ipfragok; + nskb->ignore_df = packet->ipfragok; tp->af_specific->sctp_xmit(nskb, tp); out: -- cgit v1.2.3-58-ga151 From 5b7ed0892f2af4e60b9a8d2c71c77774512a6cb9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 11 May 2014 20:22:09 -0700 Subject: tcp: move fastopen functions to tcp_fastopen.c Move common TFO functions that will be used by both v4 and v6 to tcp_fastopen.c. Create a helper tcp_fastopen_queue_check(). Signed-off-by: Yuchung Cheng Signed-off-by: Daniel Lee Signed-off-by: Jerry Chu Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ++- net/ipv4/tcp_fastopen.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 185 +--------------------------------------------- 3 files changed, 200 insertions(+), 185 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c9418456640..012236838583 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1329,8 +1329,14 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); -void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, - struct tcp_fastopen_cookie *foc); +int tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct sk_buff *skb_synack, + struct request_sock *req); +bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct tcp_fastopen_cookie *valid_foc); void tcp_fastopen_init_key_once(bool publish); #define TCP_FASTOPEN_KEY_LENGTH 16 diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f195d9316e55..0606c91d9d0b 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -94,3 +94,193 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, } rcu_read_unlock(); } + +int tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct sk_buff *skb_synack, + struct request_sock *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + const struct inet_request_sock *ireq = inet_rsk(req); + struct sock *child; + int err; + + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + kfree_skb(skb_synack); + return -1; + } + err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, + ireq->ir_rmt_addr, ireq->opt); + err = net_xmit_eval(err); + if (!err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; + /* XXX (TFO) - is it ok to ignore error and continue? */ + + spin_lock(&queue->fastopenq->lock); + queue->fastopenq->qlen++; + spin_unlock(&queue->fastopenq->lock); + + /* Initialize the child socket. Have to fix some values to take + * into account the child is a Fast Open socket and is created + * only out of the bits carried in the SYN packet. + */ + tp = tcp_sk(child); + + tp->fastopen_rsk = req; + /* Do a hold on the listner sk so that if the listener is being + * closed, the child that has been accepted can live on and still + * access listen_lock. + */ + sock_hold(sk); + tcp_rsk(req)->listener = sk; + + /* RFC1323: The window in SYN & SYN/ACK segments is never + * scaled. So correct it appropriately. + */ + tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + + /* Activate the retrans timer so that SYNACK can be retransmitted. + * The request socket is not added to the SYN table of the parent + * because it's been added to the accept queue directly. + */ + inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); + + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, child); + + /* Now finish processing the fastopen child socket. */ + inet_csk(child)->icsk_af_ops->rebuild_header(child); + tcp_init_congestion_control(child); + tcp_mtup_init(child); + tcp_init_metrics(child); + tcp_init_buffer_space(child); + + /* Queue the data carried in the SYN packet. We need to first + * bump skb's refcnt because the caller will attempt to free it. + * + * XXX (TFO) - we honor a zero-payload TFO request for now. + * (Any reason not to?) + */ + if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { + /* Don't queue the skb if there is no payload in SYN. + * XXX (TFO) - How about SYN+FIN? + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } else { + skb = skb_get(skb); + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + skb_set_owner_r(skb, child); + __skb_queue_tail(&child->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tp->syn_data_acked = 1; + } + sk->sk_data_ready(sk); + bh_unlock_sock(child); + sock_put(child); + WARN_ON(req->sk == NULL); + return 0; +} +EXPORT_SYMBOL(tcp_fastopen_create_child); + +static bool tcp_fastopen_queue_check(struct sock *sk) +{ + struct fastopen_queue *fastopenq; + + /* Make sure the listener has enabled fastopen, and we don't + * exceed the max # of pending TFO requests allowed before trying + * to validating the cookie in order to avoid burning CPU cycles + * unnecessarily. + * + * XXX (TFO) - The implication of checking the max_qlen before + * processing a cookie request is that clients can't differentiate + * between qlen overflow causing Fast Open to be disabled + * temporarily vs a server not supporting Fast Open at all. + */ + fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq == NULL || fastopenq->max_qlen == 0) + return false; + + if (fastopenq->qlen >= fastopenq->max_qlen) { + struct request_sock *req1; + spin_lock(&fastopenq->lock); + req1 = fastopenq->rskq_rst_head; + if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + spin_unlock(&fastopenq->lock); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); + return false; + } + fastopenq->rskq_rst_head = req1->dl_next; + fastopenq->qlen--; + spin_unlock(&fastopenq->lock); + reqsk_free(req1); + } + return true; +} + +bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct tcp_fastopen_cookie *valid_foc) +{ + bool skip_cookie = false; + + if (likely(!fastopen_cookie_present(foc))) { + /* See include/net/tcp.h for the meaning of these knobs */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || + ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) + skip_cookie = true; /* no cookie to validate */ + else + return false; + } + /* A FO option is present; bump the counter. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + + if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || + !tcp_fastopen_queue_check(sk)) + return false; + + if (skip_cookie) { + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + return true; + } + + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { + if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); + if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || + memcmp(&foc->val[0], &valid_foc->val[0], + TCP_FASTOPEN_COOKIE_SIZE) != 0) + return false; + valid_foc->len = -1; + } + /* Acknowledge the data received from the peer. */ + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + return true; + } else if (foc->len == 0) { /* Client requesting a cookie */ + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + } else { + /* Client sent a cookie with wrong size. Treat it + * the same as invalid and return a valid one. + */ + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); + } + return false; +} +EXPORT_SYMBOL(tcp_fastopen_check); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad166dcc278f..032fcaee164a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1260,187 +1260,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { }; #endif -static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc) -{ - bool skip_cookie = false; - struct fastopen_queue *fastopenq; - - if (likely(!fastopen_cookie_present(foc))) { - /* See include/net/tcp.h for the meaning of these knobs */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || - ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) - skip_cookie = true; /* no cookie to validate */ - else - return false; - } - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - /* A FO option is present; bump the counter. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - - /* Make sure the listener has enabled fastopen, and we don't - * exceed the max # of pending TFO requests allowed before trying - * to validating the cookie in order to avoid burning CPU cycles - * unnecessarily. - * - * XXX (TFO) - The implication of checking the max_qlen before - * processing a cookie request is that clients can't differentiate - * between qlen overflow causing Fast Open to be disabled - * temporarily vs a server not supporting Fast Open at all. - */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || - fastopenq == NULL || fastopenq->max_qlen == 0) - return false; - - if (fastopenq->qlen >= fastopenq->max_qlen) { - struct request_sock *req1; - spin_lock(&fastopenq->lock); - req1 = fastopenq->rskq_rst_head; - if ((req1 == NULL) || time_after(req1->expires, jiffies)) { - spin_unlock(&fastopenq->lock); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); - /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ - foc->len = -1; - return false; - } - fastopenq->rskq_rst_head = req1->dl_next; - fastopenq->qlen--; - spin_unlock(&fastopenq->lock); - reqsk_free(req1); - } - if (skip_cookie) { - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } - - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { - if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || - memcmp(&foc->val[0], &valid_foc->val[0], - TCP_FASTOPEN_COOKIE_SIZE) != 0) - return false; - valid_foc->len = -1; - } - /* Acknowledge the data received from the peer. */ - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; - } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENCOOKIEREQD); - } else { - /* Client sent a cookie with wrong size. Treat it - * the same as invalid and return a valid one. - */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - } - return false; -} - -static int tcp_v4_conn_req_fastopen(struct sock *sk, - struct sk_buff *skb, - struct sk_buff *skb_synack, - struct request_sock *req) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; - const struct inet_request_sock *ireq = inet_rsk(req); - struct sock *child; - int err; - - req->num_retrans = 0; - req->num_timeout = 0; - req->sk = NULL; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - kfree_skb(skb_synack); - return -1; - } - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); - if (!err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; - /* XXX (TFO) - is it ok to ignore error and continue? */ - - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); - - /* Initialize the child socket. Have to fix some values to take - * into account the child is a Fast Open socket and is created - * only out of the bits carried in the SYN packet. - */ - tp = tcp_sk(child); - - tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; - - /* RFC1323: The window in SYN & SYN/ACK segments is never - * scaled. So correct it appropriately. - */ - tp->snd_wnd = ntohs(tcp_hdr(skb)->window); - - /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent - * because it's been added to the accept queue directly. - */ - inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); - - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); - - /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_init_buffer_space(child); - - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now. - * (Any reason not to?) - */ - if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { - /* Don't queue the skb if there is no payload in SYN. - * XXX (TFO) - How about SYN+FIN? - */ - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - } else { - skb = skb_get(skb); - skb_dst_drop(skb); - __skb_pull(skb, tcp_hdr(skb)->doff * 4); - skb_set_owner_r(skb, child); - __skb_queue_tail(&child->sk_receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - tp->syn_data_acked = 1; - } - sk->sk_data_ready(sk); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(req->sk == NULL); - return 0; -} - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; @@ -1599,8 +1418,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (fastopen_cookie_present(&foc) && foc.len != 0) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) - goto drop_and_free; + } else if (tcp_fastopen_create_child(sk, skb, skb_synack, req)) + goto drop_and_release; return 0; -- cgit v1.2.3-58-ga151 From 89278c9dc922272df921042aafa18311f3398c6c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 11 May 2014 20:22:10 -0700 Subject: tcp: simplify fast open cookie processing Consolidate various cookie checking and generation code to simplify the fast open processing. The main goal is to reduce code duplication in tcp_v4_conn_request() for IPv6 support. Removes two experimental sysctl flags TFO_SERVER_ALWAYS and TFO_SERVER_COOKIE_NOT_CHKD used primarily for developmental debugging purposes. Signed-off-by: Yuchung Cheng Signed-off-by: Daniel Lee Signed-off-by: Jerry Chu Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ---- include/net/tcp.h | 9 +------ net/ipv4/tcp_fastopen.c | 71 +++++++++++++++++++------------------------------ net/ipv4/tcp_ipv4.c | 10 +++---- net/ipv4/tcp_output.c | 2 +- 5 files changed, 33 insertions(+), 64 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e37c71ecd74..bc35e4709e8e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -366,11 +366,6 @@ static inline bool tcp_passive_fastopen(const struct sock *sk) tcp_sk(sk)->fastopen_rsk != NULL); } -static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc) -{ - return foc->len != -1; -} - extern void tcp_sock_destruct(struct sock *sk); static inline int fastopen_init_queue(struct sock *sk, int backlog) diff --git a/include/net/tcp.h b/include/net/tcp.h index 012236838583..17d7c6a3d037 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -220,8 +220,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ -/* Process SYN data but skip cookie validation */ -#define TFO_SERVER_COOKIE_NOT_CHKED 0x100 /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 @@ -230,10 +228,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TFO_SERVER_WO_SOCKOPT1 0x400 #define TFO_SERVER_WO_SOCKOPT2 0x800 -/* Always create TFO child sockets on a TFO listener even when - * cookie/data not present. (For testing purpose!) - */ -#define TFO_SERVER_ALWAYS 0x1000 extern struct inet_timewait_death_row tcp_death_row; @@ -1335,8 +1329,7 @@ int tcp_fastopen_create_child(struct sock *sk, struct request_sock *req); bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc); + struct tcp_fastopen_cookie *foc); void tcp_fastopen_init_key_once(bool publish); #define TCP_FASTOPEN_KEY_LENGTH 16 diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0606c91d9d0b..5a98277b9a82 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -228,59 +228,44 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct tcp_fastopen_cookie *valid_foc) + struct tcp_fastopen_cookie *foc) { - bool skip_cookie = false; - - if (likely(!fastopen_cookie_present(foc))) { - /* See include/net/tcp.h for the meaning of these knobs */ - if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || - ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) - skip_cookie = true; /* no cookie to validate */ - else - return false; - } - /* A FO option is present; bump the counter. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; + bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; - if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || - !tcp_fastopen_queue_check(sk)) + if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + (syn_data || foc->len >= 0) && + tcp_fastopen_queue_check(sk))) { + foc->len = -1; return false; - - if (skip_cookie) { - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - return true; } - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { - if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || - memcmp(&foc->val[0], &valid_foc->val[0], - TCP_FASTOPEN_COOKIE_SIZE) != 0) - return false; - valid_foc->len = -1; - } - /* Acknowledge the data received from the peer. */ + if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + goto fastopen; + + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &valid_foc); + + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE && + foc->len == valid_foc.len && + !memcmp(foc->val, valid_foc.val, foc->len)) { +fastopen: tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); return true; - } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENCOOKIEREQD); - } else { - /* Client sent a cookie with wrong size. Treat it - * the same as invalid and return a valid one. - */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, valid_foc); } + + NET_INC_STATS_BH(sock_net(sk), foc->len ? + LINUX_MIB_TCPFASTOPENPASSIVEFAIL : + LINUX_MIB_TCPFASTOPENCOOKIEREQD); + *foc = valid_foc; return false; } EXPORT_SYMBOL(tcp_fastopen_check); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 032fcaee164a..5ea0949dadfd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1273,7 +1273,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) bool want_cookie = false; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sk_buff *skb_synack; int do_fastopen; @@ -1381,7 +1380,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dst == NULL) goto drop_and_free; } - do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); + do_fastopen = !want_cookie && + tcp_fastopen_check(sk, skb, req, &foc); /* We don't call tcp_v4_send_synack() directly because we need * to make sure a child socket can be created successfully before @@ -1394,8 +1394,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * latter to remove its dependency on the current implementation * of tcp_v4_send_synack()->tcp_select_initial_window(). */ - skb_synack = tcp_make_synack(sk, dst, req, - fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); + skb_synack = tcp_make_synack(sk, dst, req, &foc); if (skb_synack) { __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -1415,9 +1414,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->listener = NULL; /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - if (fastopen_cookie_present(&foc) && foc.len != 0) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (tcp_fastopen_create_child(sk, skb, skb_synack, req)) goto drop_and_release; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 694711a140d4..b20fc02920f9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc != NULL) { + if (foc != NULL && foc->len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { -- cgit v1.2.3-58-ga151 From 843f4a55e336e6d0c7bb92e7f9621535bc8d5fcd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 11 May 2014 20:22:11 -0700 Subject: tcp: use tcp_v4_send_synack on first SYN-ACK To avoid large code duplication in IPv6, we need to first simplify the complicate SYN-ACK sending code in tcp_v4_conn_request(). To use tcp_v4(6)_send_synack() to send all SYN-ACKs, we need to initialize the mini socket's receive window before trying to create the child socket and/or building the SYN-ACK packet. So we move that initialization from tcp_make_synack() to tcp_v4_conn_request() as a new function tcp_openreq_init_req_rwin(). After this refactoring the SYN-ACK sending code is simpler and easier to implement Fast Open for IPv6. Signed-off-by: Yuchung Cheng Signed-off-by: Daniel Lee Signed-off-by: Jerry Chu Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 14 +++++----- net/ipv4/tcp_fastopen.c | 67 ++++++++++++++++++++++-------------------------- net/ipv4/tcp_ipv4.c | 57 ++++++++++++---------------------------- net/ipv4/tcp_minisocks.c | 31 ++++++++++++++++++++++ net/ipv4/tcp_output.c | 21 --------------- 5 files changed, 85 insertions(+), 105 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 17d7c6a3d037..f5d6ca4a9d28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1114,6 +1114,9 @@ static inline void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); } +extern void tcp_openreq_init_rwin(struct request_sock *req, + struct sock *sk, struct dst_entry *dst); + void tcp_enter_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) @@ -1323,13 +1326,10 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); -int tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct sk_buff *skb_synack, - struct request_sock *req); -bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc); +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst); void tcp_fastopen_init_key_once(bool publish); #define TCP_FASTOPEN_KEY_LENGTH 16 diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 5a98277b9a82..9b947a9aaf6e 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -95,34 +95,22 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, rcu_read_unlock(); } -int tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct sk_buff *skb_synack, - struct request_sock *req) +static bool tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; - const struct inet_request_sock *ireq = inet_rsk(req); struct sock *child; - int err; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - kfree_skb(skb_synack); - return -1; - } - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); - if (!err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; - /* XXX (TFO) - is it ok to ignore error and continue? */ + if (child == NULL) + return false; spin_lock(&queue->fastopenq->lock); queue->fastopenq->qlen++; @@ -167,28 +155,24 @@ int tcp_fastopen_create_child(struct sock *sk, /* Queue the data carried in the SYN packet. We need to first * bump skb's refcnt because the caller will attempt to free it. * - * XXX (TFO) - we honor a zero-payload TFO request for now. - * (Any reason not to?) + * XXX (TFO) - we honor a zero-payload TFO request for now, + * (any reason not to?) but no need to queue the skb since + * there is no data. How about SYN+FIN? */ - if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { - /* Don't queue the skb if there is no payload in SYN. - * XXX (TFO) - How about SYN+FIN? - */ - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - } else { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { skb = skb_get(skb); skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); skb_set_owner_r(skb, child); __skb_queue_tail(&child->sk_receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; tp->syn_data_acked = 1; } + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; sk->sk_data_ready(sk); bh_unlock_sock(child); sock_put(child); WARN_ON(req->sk == NULL); - return 0; + return true; } EXPORT_SYMBOL(tcp_fastopen_create_child); @@ -232,9 +216,10 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ -bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc) +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; @@ -255,11 +240,21 @@ bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, if (foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { + /* Cookie is valid. Create a (full) child socket to accept + * the data in SYN before returning a SYN-ACK to ack the + * data. If we fail to create the socket, fall back and + * ack the ISN only but includes the same cookie. + * + * Note: Data-less SYN with valid cookie is allowed to send + * data in SYN_RECV state. + */ fastopen: - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - foc->len = -1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + if (tcp_fastopen_create_child(sk, skb, dst, req)) { + foc->len = -1; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return true; + } } NET_INC_STATS_BH(sock_net(sk), foc->len ? @@ -268,4 +263,4 @@ fastopen: *foc = valid_foc; return false; } -EXPORT_SYMBOL(tcp_fastopen_check); +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ea0949dadfd..1665f0f84233 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -822,7 +822,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, NULL); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -852,7 +853,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0); + int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); @@ -1270,11 +1271,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false; + bool want_cookie = false, fastopen; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; - struct sk_buff *skb_synack; - int do_fastopen; + int err; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1373,49 +1373,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } - tcp_rsk(req)->snt_isn = isn; - - if (dst == NULL) { - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto drop_and_free; - } - do_fastopen = !want_cookie && - tcp_fastopen_check(sk, skb, req, &foc); - - /* We don't call tcp_v4_send_synack() directly because we need - * to make sure a child socket can be created successfully before - * sending back synack! - * - * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() - * (or better yet, call tcp_send_synack() in the child context - * directly, but will have to fix bunch of other code first) - * after syn_recv_sock() except one will need to first fix the - * latter to remove its dependency on the current implementation - * of tcp_v4_send_synack()->tcp_select_initial_window(). - */ - skb_synack = tcp_make_synack(sk, dst, req, &foc); - - if (skb_synack) { - __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); - } else + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) goto drop_and_free; - if (likely(!do_fastopen)) { - int err; - err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, ireq->opt); - err = net_xmit_eval(err); + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v4_send_synack(sk, dst, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { if (err || want_cookie) goto drop_and_free; tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; - /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } else if (tcp_fastopen_create_child(sk, skb, skb_synack, req)) - goto drop_and_release; + } return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 05c1b155251d..e68e0d4af6c9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -362,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, + struct sock *sk, struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + int mss = dst_metric_advmss(dst); + + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); + static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, struct request_sock *req) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b20fc02920f9..3d61c52bdf79 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2803,27 +2803,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - - /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); - - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - } - memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) -- cgit v1.2.3-58-ga151 From 0a672f74131dd682087dfd5f45bf61f95804772e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 11 May 2014 20:22:12 -0700 Subject: tcp: improve fastopen icmp handling If a fast open socket is already accepted by the user, it should be treated like a connected socket to record the ICMP error in sk_softerr, so the user can fetch it. Do that in both tcp_v4_err and tcp_v6_err. Also refactor the sequence window check to improve readability (e.g., there were two local variables named 'req'). Signed-off-by: Yuchung Cheng Signed-off-by: Daniel Lee Signed-off-by: Jerry Chu Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 35 ++++++++++++++--------------------- net/ipv6/tcp_ipv6.c | 22 +++++++++++++++++----- 2 files changed, 31 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1665f0f84233..a2780e5334c9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; - struct request_sock *req; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); @@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - req = tp->fastopen_rsk; seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt) && - (req == NULL || seq != tcp_rsk(req)->snt_isn)) { - /* For a Fast Open socket, allow seq to be snt_isn. */ + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -426,11 +426,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff) + !icsk->icsk_backoff || fastopen) break; - /* XXX (TFO) - revisit the following logic for TFO */ - if (sock_owned_by_user(sk)) break; @@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - /* XXX (TFO) - if it's a TFO socket and has been accepted, rather - * than following the TCP_SYN_RECV case and closing the socket, - * we ignore the ICMP error and keep trying like a fully established - * socket. Is this the right thing to do? - */ - if (req && req->sk == NULL) - goto out; - switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: @@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can f.e. if SYNs crossed, - or Fast Open. - */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7fa67439f4d6..a7a62ce12b3f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -340,7 +340,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct sock *sk; int err; struct tcp_sock *tp; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; struct net *net = dev_net(skb->dev); sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, @@ -371,8 +372,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp = tcp_sk(sk); seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -436,8 +440,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -1760,6 +1769,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1802,7 +1812,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh + sp->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } -- cgit v1.2.3-58-ga151 From 3a19ce0eec32667b835d8dc887002019fc6b3a02 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sun, 11 May 2014 20:22:13 -0700 Subject: tcp: IPv6 support for fastopen server After all the preparatory works, supporting IPv6 in Fast Open is now easy. We pretty much just mirror v4 code. The only difference is how we generate the Fast Open cookie for IPv6 sockets. Since Fast Open cookie is 128 bits and we use AES 128, we use CBC-MAC to encrypt both the source and destination IPv6 addresses since the cookie is a MAC tag. Signed-off-by: Daniel Lee Signed-off-by: Yuchung Cheng Signed-off-by: Jerry Chu Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 57 +++++++++++++++++++++++++++++++++++++------------ net/ipv6/tcp_ipv6.c | 40 +++++++++++++++++++++++----------- 2 files changed, 71 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9b947a9aaf6e..62e48cf84e60 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -72,27 +72,58 @@ error: kfree(ctx); return err; } -/* Computes the fastopen cookie for the IP path. - * The path is a 128 bits long (pad with zeros for IPv4). - * - * The caller must check foc->len to determine if a valid cookie - * has been generated successfully. -*/ -void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, - struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen(const void *path, + struct tcp_fastopen_cookie *foc) { - __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; + bool ok = false; tcp_fastopen_init_key_once(true); rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); + crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; + ok = true; } rcu_read_unlock(); + return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. + */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *foc) +{ + if (req->rsk_ops->family == AF_INET) { + const struct iphdr *iph = ip_hdr(syn); + + __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; + return __tcp_fastopen_cookie_gen(path, foc); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (req->rsk_ops->family == AF_INET6) { + const struct ipv6hdr *ip6h = ipv6_hdr(syn); + struct tcp_fastopen_cookie tmp; + + if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + struct in6_addr *buf = (struct in6_addr *) tmp.val; + int i = 4; + + for (i = 0; i < 4; i++) + buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; + return __tcp_fastopen_cookie_gen(buf, foc); + } + } +#endif + return false; } static bool tcp_fastopen_create_child(struct sock *sk, @@ -234,10 +265,8 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &valid_foc); - - if (foc->len == TCP_FASTOPEN_COOKIE_SIZE && + if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { /* Cookie is valid. Create a (full) child socket to accept diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a7a62ce12b3f..3a267bf14f2f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -472,7 +472,8 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6, struct request_sock *req, - u16 queue_mapping) + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -483,7 +484,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, NULL); + skb = tcp_make_synack(sk, dst, req, foc); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -507,7 +508,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) struct flowi6 fl6; int res; - res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0); + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); @@ -926,7 +927,12 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); @@ -978,8 +984,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; + struct tcp_fastopen_cookie foc = { .len = -1 }; + bool want_cookie = false, fastopen; struct flowi6 fl6; - bool want_cookie = false; + int err; if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1010,7 +1018,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1083,19 +1091,27 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v6_init_sequence(skb); } have_isn: - tcp_rsk(req)->snt_isn = isn; if (security_inet_conn_request(sk, skb, req)) goto drop_and_release; - if (tcp_v6_send_synack(sk, dst, &fl6, req, - skb_get_queue_mapping(skb)) || - want_cookie) + if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) goto drop_and_free; + tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_rsk(req)->listener = NULL; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v6_send_synack(sk, dst, &fl6, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } return 0; drop_and_release: -- cgit v1.2.3-58-ga151 From e110861f86094cd78cc85593b873970092deb43a Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 13 May 2014 10:17:33 -0700 Subject: net: add a sysctl to reflect the fwmark on replies Kernel-originated IP packets that have no user socket associated with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.) are emitted with a mark of zero. Add a sysctl to make them have the same mark as the packet they are replying to. This allows an administrator that wishes to do so to use mark-based routing, firewalling, etc. for these replies by marking the original packets inbound. Tested using user-mode linux: - ICMP/ICMPv6 echo replies and errors. - TCP RST packets (IPv4 and IPv6). Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ include/net/ipv6.h | 3 +++ include/net/netns/ipv4.h | 2 ++ include/net/netns/ipv6.h | 1 + net/ipv4/icmp.c | 11 +++++++++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv6/icmp.c | 6 ++++++ net/ipv6/sysctl_net_ipv6.c | 7 +++++++ net/ipv6/tcp_ipv6.c | 1 + 10 files changed, 41 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 55752985c144..14c50a1650ef 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -231,6 +231,9 @@ void ipfrag_init(void); void ip_static_sysctl_init(void); +#define IP4_REPLY_MARK(net, mark) \ + ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0) + static inline bool ip_is_fragment(const struct iphdr *iph) { return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5b40ad297b8c..ba810d0546bc 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -113,6 +113,9 @@ struct frag_hdr { #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 +#define IP6_REPLY_MARK(net, mark) \ + ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) + #include /* sysctls */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b2704fd0ec80..a32fc4d705da 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -77,6 +77,8 @@ struct netns_ipv4 { int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_fwmark_reflect; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 21edaf1f7916..19d3446e59d2 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -30,6 +30,7 @@ struct netns_sysctl_ipv6 { int flowlabel_consistency; int icmpv6_time; int anycast_src_echo_reply; + int fwmark_reflect; }; struct netns_ipv6 { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fe52666dc43c..79c3d947a481 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct sock *sk; struct inet_sock *inet; __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; @@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; + sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; @@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, const struct iphdr *iph, - __be32 saddr, u8 tos, + __be32 saddr, u8 tos, u32 mark, int type, int code, struct icmp_bxm *param) { @@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->daddr = (param->replyopts.opt.opt.srr ? param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; + fl4->flowi4_mark = mark; fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; @@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; struct sock *sk; @@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param->skb = skb_in; icmp_param->offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; + sk->sk_mark = mark; ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6aa4380fde1a..6e231ab58d65 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1546,7 +1546,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, 0, + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5cde8f263d40..f50d51850285 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -838,6 +838,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv4.sysctl_fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 8d3952796d39..f6c84a6eb238 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -400,6 +400,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) int len; int hlimit; int err = 0; + u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) @@ -466,6 +467,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) fl6.daddr = hdr->saddr; if (saddr) fl6.saddr = *saddr; + fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; @@ -474,6 +476,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6)) @@ -551,6 +554,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) int err = 0; int hlimit; u8 tclass; + u32 mark = IP6_REPLY_MARK(net, skb->mark); saddr = &ipv6_hdr(skb)->daddr; @@ -569,11 +573,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.saddr = *saddr; fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 7f405a168822..058f3eca2e53 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -38,6 +38,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv6.sysctl.fwmark_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3a267bf14f2f..c54976a44425 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -812,6 +812,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, fl6.flowi6_oif = inet6_iif(skb); else fl6.flowi6_oif = oif; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); -- cgit v1.2.3-58-ga151 From 1b3c61dc1aebf5d3d6c3981ba3eedc1e66f3ecda Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 13 May 2014 10:17:34 -0700 Subject: net: Use fwmark reflection in PMTU discovery. Currently, routing lookups used for Path PMTU Discovery in absence of a socket or on unmarked sockets use a mark of 0. This causes PMTUD not to work when using routing based on netfilter fwmark mangling and fwmark ip rules, such as: iptables -j MARK --set-mark 17 ip rule add fwmark 17 lookup 100 This patch causes these route lookups to use the fwmark from the received ICMP error when the fwmark_reflect sysctl is enabled. This allows the administrator to make PMTUD work by configuring appropriate fwmark rules to mark the inbound ICMP packets. Black-box tested using user-mode linux by pointing different fwmarks at routing tables egressing on different interfaces, and using iptables mangling to mark packets inbound on each interface with the interface's fwmark. ICMPv4 and ICMPv6 PMTU discovery work as expected when mark reflection is enabled and fail when it is disabled. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 +++++++ net/ipv6/route.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index db1e0da871f4..50e1e0feddfc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -993,6 +993,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, struct flowi4 fl4; struct rtable *rt; + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); @@ -1010,6 +1013,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 004fffb6c221..f0a8ff9ed891 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1176,7 +1176,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; + fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); -- cgit v1.2.3-58-ga151 From 84f39b08d7868ce10eeaf640627cb89777f0ae93 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 13 May 2014 10:17:35 -0700 Subject: net: support marking accepting TCP sockets When using mark-based routing, sockets returned from accept() may need to be marked differently depending on the incoming connection request. This is the case, for example, if different socket marks identify different networks: a listening socket may want to accept connections from all networks, but each connection should be marked with the network that the request came in on, so that subsequent packets are sent on the correct network. This patch adds a sysctl to mark TCP sockets based on the fwmark of the incoming SYN packet. If enabled, and an unmarked socket receives a SYN, then the SYN packet's fwmark is written to the connection's inet_request_sock, and later written back to the accepted socket when the connection is established. If the socket already has a nonzero mark, then the behaviour is the same as it is today, i.e., the listening socket's fwmark is used. Black-box tested using user-mode linux: - IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the mark of the incoming SYN packet. - The socket returned by accept() is marked with the mark of the incoming SYN packet. - Tested with syncookies=1 and syncookies=2. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/inet_sock.h | 10 ++++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/inet_connection_sock.c | 6 ++++-- net/ipv4/syncookies.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/syncookies.c | 4 +++- net/ipv6/tcp_ipv6.c | 1 + 9 files changed, 30 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 1833c3f389ee..b1edf17bec01 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -90,6 +90,7 @@ struct inet_request_sock { kmemcheck_bitfield_end(flags); struct ip_options_rcu *opt; struct sk_buff *pktopts; + u32 ir_mark; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) @@ -97,6 +98,15 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) return (struct inet_request_sock *)sk; } +static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb) +{ + if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) { + return skb->mark; + } else { + return sk->sk_mark; + } +} + struct inet_cork { unsigned int flags; __be32 addr; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a32fc4d705da..2f0cfad66666 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -78,6 +78,7 @@ struct netns_ipv4 { int sysctl_ip_fwd_use_pmtu; int sysctl_fwmark_reflect; + int sysctl_tcp_fwmark_accept; struct ping_group_range ping_group_range; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a56b8e6e866a..12e502cbfdc7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -408,7 +408,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct net *net = sock_net(sk); int flags = inet_sk_flowi_flags(sk); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, flags, @@ -445,7 +445,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, @@ -680,6 +680,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f2ed13c2125f..c86624b36a62 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->ir_rmt_port = th->source; ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f50d51850285..a33b9fbc1d80 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -845,6 +845,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fwmark_accept", + .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a2780e5334c9..77cccda1ad0c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1318,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); + ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index d4ade34ab375..a245e5ddffbd 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -81,7 +81,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, final_p = fl6_update_dst(fl6, np->opt, &final); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; - fl6->flowi6_mark = sk->sk_mark; + fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); security_req_classify_flow(req, flowi6_to_flowi(fl6)); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb53a5e73c1a..a822b880689b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -216,6 +216,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); + ireq->ir_mark = inet_request_mark(sk, skb); + req->expires = 0UL; req->num_retrans = 0; ireq->ecn_ok = ecn_ok; @@ -242,7 +244,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) final_p = fl6_update_dst(&fl6, np->opt, &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c54976a44425..f07b2abba359 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1034,6 +1034,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) TCP_ECN_create_request(req, skb, sock_net(sk)); ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_mark = inet_request_mark(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && -- cgit v1.2.3-58-ga151 From 122ff243f5f104194750ecbc76d5946dd1eec934 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 12 May 2014 16:04:53 -0700 Subject: ipv4: make ip_local_reserved_ports per netns ip_local_port_range is already per netns, so should ip_local_reserved_ports be. And since it is none by default we don't actually need it when we don't enable CONFIG_SYSCTL. By the way, rename inet_is_reserved_local_port() to inet_is_local_reserved_port() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip.h | 14 +++++++++++--- include/net/netns/ipv4.h | 4 ++++ kernel/sysctl.c | 4 ++-- net/ipv4/af_inet.c | 8 +------- net/ipv4/inet_connection_sock.c | 5 +---- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 31 ++++++++++++++----------------- net/ipv4/udp.c | 2 +- net/sctp/socket.c | 5 +++-- 9 files changed, 38 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 14c50a1650ef..512bcd5dabac 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -208,11 +208,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o void inet_get_local_port_range(struct net *net, int *low, int *high); -extern unsigned long *sysctl_local_reserved_ports; -static inline int inet_is_reserved_local_port(int port) +#if CONFIG_SYSCTL +static inline int inet_is_local_reserved_port(struct net *net, int port) { - return test_bit(port, sysctl_local_reserved_ports); + if (!net->ipv4.sysctl_local_reserved_ports) + return 0; + return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } +#else +static inline int inet_is_local_reserved_port(struct net *net, int port) +{ + return 0; +} +#endif extern int sysctl_ip_nonlocal_bind; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2f0cfad66666..aec5e12f9f19 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,10 @@ struct netns_ipv4 { atomic_t dev_addr_genid; +#ifdef CONFIG_SYSCTL + unsigned long *sysctl_local_reserved_ports; +#endif + #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES struct mr_table *mrt; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..e36ae4b15726 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2501,11 +2501,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 211c0cc6c3d3..279132bcadd9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1705,13 +1705,9 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; - rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1821,8 +1817,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 12e502cbfdc7..14d02ea905b6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,9 +29,6 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -113,7 +110,7 @@ again: smallest_size = -1; do { - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450d..83331f1b86ac 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a33b9fbc1d80..79a007c52558 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -436,13 +436,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, @@ -824,6 +817,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, @@ -876,8 +876,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -889,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -902,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 54ea0a3a48f1..6729ea97a59d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e37b2cbbf177..2af76eaba8f7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5946,8 +5946,9 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; + struct net *net = sock_net(sk); - inet_get_local_port_range(sock_net(sk), &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; @@ -5955,7 +5956,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover++; if ((rover < low) || (rover > high)) rover = low; - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; -- cgit v1.2.3-58-ga151 From c7228317441f4dee5e5916e30300dd8c61f75af7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 13 May 2014 20:30:07 -0700 Subject: net: Use a more standard macro for INET_ADDR_COOKIE Missing a colon on definition use is a bit odd so change the macro for the 32 bit case to declare an __attribute__((unused)) and __deprecated variable. The __deprecated attribute will cause gcc to emit an error if the variable is actually used. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 8 +++++--- net/ipv4/inet_hashtables.c | 4 ++-- net/ipv4/udp.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 1bdb47715def..dd1950a7e273 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -292,12 +292,12 @@ static inline struct sock *inet_lookup_listener(struct net *net, #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ - ((__force __u64)(__be32)(__daddr))); + ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ - ((__force __u64)(__be32)(__saddr))); + ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \ (((__sk)->sk_portpair == (__ports)) && \ @@ -306,7 +306,9 @@ static inline struct sock *inet_lookup_listener(struct net *net, ((__sk)->sk_bound_dev_if == (__dif))) && \ net_eq(sock_net(__sk), (__net))) #else /* 32-bit arch */ -#define INET_ADDR_COOKIE(__name, __saddr, __daddr) +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const int __name __deprecated __attribute__((unused)) + #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_daddr == (__saddr)) && \ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 83331f1b86ac..43116e8c8e13 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif) { - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; @@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) + INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6729ea97a59d..590532a7bd2d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1875,7 +1875,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; - INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) + INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); rcu_read_lock(); -- cgit v1.2.3-58-ga151 From 4929fd8cb064e1bbee76b8c4cf6bd3f0e40bf66e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 15 May 2014 05:43:20 +0200 Subject: ip_tunnel: delete unneeded call to netdev_priv Netdev_priv is an accessor function, and has no purpose if its result is not used. A simplified version of the semantic match that fixes this problem is as follows: (http://coccinelle.lip6.fr/) // @@ local idexpression x; @@ -x = netdev_priv(...); ... when != x // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index b3f859731c60..059176dfab7f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -395,11 +395,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms) { - struct ip_tunnel *nt, *fbt; + struct ip_tunnel *nt; struct net_device *dev; BUG_ON(!itn->fb_tunnel_dev); - fbt = netdev_priv(itn->fb_tunnel_dev); dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); -- cgit v1.2.3-58-ga151 From ee30ef4d45e7bb64a13e6f3c35b4c75b12a8a4e9 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Thu, 15 May 2014 13:07:02 +0800 Subject: ip_tunnel: don't add tunnel twice When using command "ip tunnel add" to add a tunnel, the tunnel will be added twice, through ip_tunnel_create() and ip_tunnel_update(). Because the second is unnecessary, so we can just break after adding tunnel through ip_tunnel_create(). Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 059176dfab7f..289c6ee388c1 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -754,10 +754,8 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (!t && (cmd == SIOCADDTUNNEL)) { t = ip_tunnel_create(net, itn, p); - if (IS_ERR(t)) { - err = PTR_ERR(t); - break; - } + err = PTR_ERR_OR_ZERO(t); + break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { -- cgit v1.2.3-58-ga151 From 614d056c8e464e969361ebb03e4bad8a82047d0e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 16 May 2014 20:46:58 -0700 Subject: ipv4: minor spelling fix Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 78692e46b64f..e9449376b58e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -827,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa_existing = find_matching_ifa(ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); -- cgit v1.2.3-58-ga151 From f98f89a0104454f35a62d681683c844f6dbf4043 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Thu, 15 May 2014 23:21:30 +0200 Subject: net: tunnels - enable module autoloading Enable the module alias hookup to allow tunnel modules to be autoloaded on demand. This is in line with how most other netdev kinds work, and will allow userspace to create tunnels without having CAP_SYS_MODULE. Signed-off-by: Tom Gundersen Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 1 + net/ipv6/ip6_tunnel.c | 1 + net/ipv6/sit.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 812b18351462..4bc508f0db90 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -486,4 +486,5 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b05b609f69d1..fe61545dde71 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -61,6 +61,7 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e5a453ca302e..f4380041f5e7 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1828,4 +1828,5 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); -- cgit v1.2.3-58-ga151 From ca8a22634381537c92b5a10308652e1c38fd9edf Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 22 May 2014 10:41:08 -0400 Subject: tcp: make cwnd-limited checks measurement-based, and gentler Experience with the recent e114a710aa50 ("tcp: fix cwnd limited checking to improve congestion control") has shown that there are common cases where that commit can cause cwnd to be much larger than necessary. This leads to TSO autosizing cooking skbs that are too large, among other things. The main problems seemed to be: (1) That commit attempted to predict the future behavior of the connection by looking at the write queue (if TSO or TSQ limit sending). That prediction sometimes overestimated future outstanding packets. (2) That commit always allowed cwnd to grow to twice the number of outstanding packets (even in congestion avoidance, where this is not needed). This commit improves both of these, by: (1) Switching to a measurement-based approach where we explicitly track the largest number of packets in flight during the past window ("max_packets_out"), and remember whether we were cwnd-limited at the moment we finished sending that flight. (2) Only allowing cwnd to grow to twice the number of outstanding packets ("max_packets_out") in slow start. In congestion avoidance mode we now only allow cwnd to grow if it was fully utilized. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++-- include/net/tcp.h | 11 ++++++++--- net/ipv4/tcp_output.c | 37 +++++++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bc35e4709e8e..a0513210798f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -197,7 +197,8 @@ struct tcp_sock { u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ @@ -209,6 +210,8 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ + u32 max_packets_out; /* max packets_out in last window */ + u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ @@ -230,7 +233,6 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; - u32 lsnd_pending; /* packets inflight or unsent since last xmit */ u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f5d6ca4a9d28..e80abe4486cb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -971,8 +971,9 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless - * it was fully used previously. But we allow cwnd to grow as long as the - * application has used half the cwnd. + * it was fully used previously. And that's exactly what we do in + * congestion avoidance mode. But in slow start we allow cwnd to grow + * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. @@ -985,7 +986,11 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_cwnd < 2 * tp->lsnd_pending; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return tp->is_cwnd_limited; } static inline void tcp_check_probe_timer(struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3d61c52bdf79..d463c35db33d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - tp->lsnd_pending = tp->packets_out + unsent_segs; + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ @@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (!tp->tso_deferred) tp->tso_deferred = 1 | (jiffies << 1); + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; + return true; send_now: @@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, sent_pkts, unsent_segs = 0; + unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { + is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) - goto compute_unsent_segs; + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + break; } /* TCP Small Queues : @@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * there is no smp_mb__after_set_bit() yet */ smp_mb__after_clear_bit(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - u32 unsent_bytes; - -compute_unsent_segs: - unsent_bytes = tp->write_seq - tp->snd_nxt; - unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); + if (atomic_read(&sk->sk_wmem_alloc) > limit) break; - } } limit = mss_now; @@ -1997,7 +2006,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk, unsent_segs); + tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); -- cgit v1.2.3-58-ga151 From b26ba202e0500eb852e89499ece1b2deaa64c3a7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 23 May 2014 08:47:09 -0700 Subject: net: Eliminate no_check from protosw It doesn't seem like an protocols are setting anything other than the default, and allowing to arbitrarily disable checksums for a whole protocol seems dangerous. This can be done on a per socket basis. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/protocol.h | 1 - net/dccp/ipv4.c | 1 - net/ipv4/af_inet.c | 7 ------- net/ipv4/udplite.c | 1 - net/ipv6/af_inet6.c | 3 --- net/ipv6/ping.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/ipv6/udp.c | 1 - net/ipv6/udplite.c | 1 - net/l2tp/l2tp_ip.c | 1 - net/l2tp/l2tp_ip6.c | 1 - net/sctp/ipv6.c | 2 -- net/sctp/protocol.c | 2 -- 14 files changed, 24 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/protocol.h b/include/net/protocol.h index a7e986b08147..d6fcc1fcdb5b 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -86,7 +86,6 @@ struct inet_protosw { struct proto *prot; const struct proto_ops *ops; - char no_check; /* checksum on rcv/xmit/none? */ unsigned char flags; /* See INET_PROTOSW_* below. */ }; #define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 22b5d818b200..6ca645c4b48e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1024,7 +1024,6 @@ static struct inet_protosw dccp_v4_protosw = { .protocol = IPPROTO_DCCP, .prot = &dccp_v4_prot, .ops = &inet_dccp_ops, - .no_check = 0, .flags = INET_PROTOSW_ICSK, }; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 279132bcadd9..0e9bb08a91e4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -254,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; @@ -312,7 +311,6 @@ lookup_protocol: sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -324,7 +322,6 @@ lookup_protocol: goto out; err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; @@ -1002,7 +999,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, @@ -1012,7 +1008,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -1021,7 +1016,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }, @@ -1030,7 +1024,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc36..3b3efbda48e1 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dc47cc757b80..7cb4392690dd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -106,7 +106,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; @@ -162,7 +161,6 @@ lookup_protocol: sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -176,7 +174,6 @@ lookup_protocol: sock_init_data(sock, sk); err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index a2a1d80dfe0c..5b7a1ed2aba9 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -51,7 +51,6 @@ static struct inet_protosw pingv6_protosw = { .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, .ops = &inet6_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dddfb5fa2b7a..b2dc60b0c764 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1322,7 +1322,6 @@ static struct inet_protosw rawv6_protosw = { .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f07b2abba359..229239ad96b1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1992,7 +1992,6 @@ static struct inet_protosw tcpv6_protosw = { .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7edf096867c4..c7ed47bde54b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1507,7 +1507,6 @@ static struct inet_protosw udpv6_protosw = { .protocol = IPPROTO_UDP, .prot = &udpv6_prot, .ops = &inet6_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index dfcc4be46898..9cf097e206e9 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -64,7 +64,6 @@ static struct inet_protosw udplite6_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplitev6_prot, .ops = &inet6_dgram_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 3397fe6897c0..369a9822488c 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -606,7 +606,6 @@ static struct inet_protosw l2tp_ip_protosw = { .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, - .no_check = 0, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index e472d44a3b91..f3f98a156cee 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -755,7 +755,6 @@ static struct inet_protosw l2tp_ip6_protosw = { .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, - .no_check = 0, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 4dc5d9e08311..1999592ba88c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -943,7 +943,6 @@ static struct inet_protosw sctpv6_seqpacket_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { @@ -951,7 +950,6 @@ static struct inet_protosw sctpv6_stream_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG, }; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index af5afca4b85a..6789d785e698 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1017,7 +1017,6 @@ static struct inet_protosw sctp_seqpacket_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctp_stream_protosw = { @@ -1025,7 +1024,6 @@ static struct inet_protosw sctp_stream_protosw = { .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, - .no_check = 0, .flags = SCTP_PROTOSW_FLAG }; -- cgit v1.2.3-58-ga151 From 28448b80456feafe07e2d05b6363b00f61f6171e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 23 May 2014 08:47:19 -0700 Subject: net: Split sk_no_check into sk_no_check_{rx,tx} Define separate fields in the sock structure for configuring disabling checksums in both TX and RX-- sk_no_check_tx and sk_no_check_rx. The SO_NO_CHECK socket option only affects sk_no_check_tx. Also, removed UDP_CSUM_* defines since they are no longer necessary. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/scsi/iscsi_tcp.c | 2 +- include/net/sock.h | 6 ++++-- include/net/udp.h | 9 --------- net/appletalk/ddp.c | 2 +- net/core/sock.c | 4 ++-- net/decnet/af_decnet.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 6 +++--- net/ipx/af_ipx.c | 2 +- net/ipx/ipx_route.c | 3 ++- net/l2tp/l2tp_core.c | 4 ++-- net/l2tp/l2tp_netlink.c | 3 +-- net/sctp/socket.c | 3 ++- 13 files changed, 21 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 11854845393b..a669f2d11c31 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -244,7 +244,7 @@ iscsi_sw_tcp_conn_restore_callbacks(struct iscsi_conn *conn) sk->sk_data_ready = tcp_sw_conn->old_data_ready; sk->sk_state_change = tcp_sw_conn->old_state_change; sk->sk_write_space = tcp_sw_conn->old_write_space; - sk->sk_no_check = 0; + sk->sk_no_check_tx = 0; write_unlock_bh(&sk->sk_callback_lock); } diff --git a/include/net/sock.h b/include/net/sock.h index 21569cf456ed..07b7fcd60d80 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -243,7 +243,8 @@ struct cg_proto; * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings - * @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets + * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets + * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) @@ -371,7 +372,8 @@ struct sock { struct sk_buff_head sk_write_queue; kmemcheck_bitfield_begin(flags); unsigned int sk_shutdown : 2, - sk_no_check : 2, + sk_no_check_tx : 1, + sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, sk_type : 16; diff --git a/include/net/udp.h b/include/net/udp.h index a24f0f3e107f..5eb86874bcd6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -95,15 +95,6 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, return &table->hash2[hash & table->mask]; } -/* Note: this must match 'valbool' in sock_setsockopt */ -#define UDP_CSUM_NOXMIT 1 - -/* Used by SunRPC/xprt layer. */ -#define UDP_CSUM_NORCV 2 - -/* Default, as per the RFC, is to always do csums. */ -#define UDP_CSUM_DEFAULT 0 - extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 786ee2f83d5f..01a1082e02b3 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1669,7 +1669,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr goto out; } - if (sk->sk_no_check == 1) + if (sk->sk_no_check_tx) ddp->deh_sum = 0; else ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp)); diff --git a/net/core/sock.c b/net/core/sock.c index 664ee4295b6f..026e01f70274 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -784,7 +784,7 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk->sk_no_check = valbool; + sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: @@ -1064,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 4c04848953bd..ae011b46c071 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -481,7 +481,7 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf sk->sk_backlog_rcv = dn_nsp_backlog_rcv; sk->sk_destruct = dn_destruct; - sk->sk_no_check = 1; + sk->sk_no_check_tx = 1; sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 590532a7bd2d..12c6175b29cd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -785,7 +785,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c7ed47bde54b..b8db453133aa 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -797,7 +797,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, /* If zero checksum and sk_no_check is not on for * the socket then skip it. */ - if (uh->check || sk->sk_no_check) + if (uh->check || sk->sk_no_check_rx) stack[count++] = sk; sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, @@ -887,7 +887,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - if (!uh->check && !sk->sk_no_check) { + if (!uh->check && !sk->sk_no_check_rx) { sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; @@ -1037,7 +1037,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 41e4e93cb3aa..91729b807c7d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1353,7 +1353,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, sk_refcnt_debug_inc(sk); sock_init_data(sock, sk); - sk->sk_no_check = 1; /* Checksum off by default */ + sk->sk_no_check_tx = 1; /* Checksum off by default */ sock->ops = &ipx_dgram_ops; rc = 0; out: diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index c1f03185c5e1..67e7ad3d46b1 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -236,7 +236,8 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, } /* Apply checksum. Not allowed on 802.3 links. */ - if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) + if (sk->sk_no_check_tx || + intrfc->if_dlink_type == htons(IPX_FRAME_8023)) ipx->ipx_checksum = htons(0xFFFF); else ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ed0716a075ba..a1186105f537 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1188,7 +1188,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len l2tp_xmit_ipv6_csum(sk, skb, udp_len); else #endif - if (sk->sk_no_check == UDP_CSUM_NOXMIT) + if (sk->sk_no_check_tx) skb->ip_summed = CHECKSUM_NONE; else if ((skb_dst(skb) && skb_dst(skb)->dev) && (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM))) { @@ -1463,7 +1463,7 @@ static int l2tp_tunnel_sock_create(struct net *net, } if (!cfg->use_udp_checksums) - sock->sk->sk_no_check = UDP_CSUM_NOXMIT; + sock->sk->sk_no_check_tx = 1; break; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index bd7387adea9e..f3d331bdd706 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -297,8 +297,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla case L2TP_ENCAPTYPE_UDP: if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) || - nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, - (sk->sk_no_check != UDP_CSUM_NOXMIT))) + nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx)) goto nla_put_failure; /* NOBREAK */ case L2TP_ENCAPTYPE_IP: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2af76eaba8f7..429899689408 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6946,7 +6946,8 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; - newsk->sk_no_check = sk->sk_no_check; + newsk->sk_no_check_tx = sk->sk_no_check_tx; + newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; newsk->sk_shutdown = sk->sk_shutdown; -- cgit v1.2.3-58-ga151 From 1c19448c9ba6545b80ded18488a64a7f3d8e6998 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 23 May 2014 08:47:32 -0700 Subject: net: Make enabling of zero UDP6 csums more restrictive RFC 6935 permits zero checksums to be used in IPv6 however this is recommended only for certain tunnel protocols, it does not make checksums completely optional like they are in IPv4. This patch restricts the use of IPv6 zero checksums that was previously intoduced. no_check6_tx and no_check6_rx have been added to control the use of checksums in UDP6 RX and TX path. The normal sk_no_check_{rx,tx} settings are not used (this avoids ambiguity when dealing with a dual stack socket). A helper function has been added (udp_set_no_check6) which can be called by tunnel impelmentations to all zero checksums (send on the socket, and accept them as valid). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 24 +++++++++++++++++++++++- include/uapi/linux/udp.h | 2 ++ net/ipv4/udp.c | 20 +++++++++++++++++++- net/ipv6/udp.c | 8 ++++---- 4 files changed, 48 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/udp.h b/include/linux/udp.h index 42278bbf7a88..247cfdcc4b08 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -47,7 +47,9 @@ struct udp_sock { #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node int pending; /* Any pending frames ? */ unsigned int corkflag; /* Cork is required */ - __u16 encap_type; /* Is this an Encapsulation socket? */ + __u8 encap_type; /* Is this an Encapsulation socket? */ + unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ + no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -76,6 +78,26 @@ static inline struct udp_sock *udp_sk(const struct sock *sk) return (struct udp_sock *)sk; } +static inline void udp_set_no_check6_tx(struct sock *sk, bool val) +{ + udp_sk(sk)->no_check6_tx = val; +} + +static inline void udp_set_no_check6_rx(struct sock *sk, bool val) +{ + udp_sk(sk)->no_check6_rx = val; +} + +static inline bool udp_get_no_check6_tx(struct sock *sk) +{ + return udp_sk(sk)->no_check6_tx; +} + +static inline bool udp_get_no_check6_rx(struct sock *sk) +{ + return udp_sk(sk)->no_check6_rx; +} + #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index e2bcfd75a30d..16574ea18f0c 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -29,6 +29,8 @@ struct udphdr { /* UDP socket options */ #define UDP_CORK 1 /* Never send partially complete segments */ #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ +#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ +#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ /* UDP encapsulation types */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 12c6175b29cd..e07d52b8617a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1968,7 +1968,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); - int val; + int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); @@ -1978,6 +1978,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; + valbool = val ? 1 : 0; + switch (optname) { case UDP_CORK: if (val != 0) { @@ -2007,6 +2009,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } break; + case UDP_NO_CHECK6_TX: + up->no_check6_tx = valbool; + break; + + case UDP_NO_CHECK6_RX: + up->no_check6_rx = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -2089,6 +2099,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->encap_type; break; + case UDP_NO_CHECK6_TX: + val = up->no_check6_tx; + break; + + case UDP_NO_CHECK6_RX: + val = up->no_check6_rx; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b8db453133aa..60325236446a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -794,10 +794,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); while (sk) { - /* If zero checksum and sk_no_check is not on for + /* If zero checksum and no_check is not on for * the socket then skip it. */ - if (uh->check || sk->sk_no_check_rx) + if (uh->check || udp_sk(sk)->no_check6_rx) stack[count++] = sk; sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, @@ -887,7 +887,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - if (!uh->check && !sk->sk_no_check_rx) { + if (!uh->check && !udp_sk(sk)->no_check6_rx) { sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; @@ -1037,7 +1037,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (up->no_check6_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ -- cgit v1.2.3-58-ga151 From 70cb4a4526d4d3ad901a979a5f26917149408f8d Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Fri, 30 May 2014 21:10:48 +0530 Subject: ipmr: Replace comma with semicolon This patch replaces a comma between expression statements by a semicolon. A simplified version of the semantic patch that performs this transformation is as follows: // @r@ expression e1,e2,e; type T; identifier i; @@ e1 -, +; e2; // Signed-off-by: Himangi Saraogi Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d84dc8d4c916..2bc9cc47f246 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -484,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev) dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; - dev->netdev_ops = ®_vif_netdev_ops, + dev->netdev_ops = ®_vif_netdev_ops; dev->destructor = free_netdev; dev->features |= NETIF_F_NETNS_LOCAL; } -- cgit v1.2.3-58-ga151 From 73f156a6e8c1074ac6327e0abd1169e95eb66463 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Jun 2014 05:26:03 -0700 Subject: inetpeer: get rid of ip_id_count Ideally, we would need to generate IP ID using a per destination IP generator. linux kernels used inet_peer cache for this purpose, but this had a huge cost on servers disabling MTU discovery. 1) each inet_peer struct consumes 192 bytes 2) inetpeer cache uses a binary tree of inet_peer structs, with a nominal size of ~66000 elements under load. 3) lookups in this tree are hitting a lot of cache lines, as tree depth is about 20. 4) If server deals with many tcp flows, we have a high probability of not finding the inet_peer, allocating a fresh one, inserting it in the tree with same initial ip_id_count, (cf secure_ip_id()) 5) We garbage collect inet_peer aggressively. IP ID generation do not have to be 'perfect' Goal is trying to avoid duplicates in a short period of time, so that reassembly units have a chance to complete reassembly of fragments belonging to one message before receiving other fragments with a recycled ID. We simply use an array of generators, and a Jenkin hash using the dst IP as a key. ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it belongs (it is only used from this file) secure_ip_id() and secure_ipv6_id() no longer are needed. Rename ip_select_ident_more() to ip_select_ident_segs() to avoid unnecessary decrement/increment of the number of segments. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 2 +- include/net/inetpeer.h | 23 +++------------------ include/net/ip.h | 40 ++++++++++++++++++++---------------- include/net/ipv6.h | 2 -- include/net/secure_seq.h | 2 -- net/core/secure_seq.c | 25 ----------------------- net/ipv4/igmp.c | 4 ++-- net/ipv4/inetpeer.c | 18 ----------------- net/ipv4/ip_output.c | 7 +++---- net/ipv4/ip_tunnel_core.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 45 +++++++++++++++-------------------------- net/ipv4/xfrm4_mode_tunnel.c | 2 +- net/ipv6/ip6_output.c | 12 +++++++++++ net/ipv6/output_core.c | 30 --------------------------- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 17 files changed, 65 insertions(+), 155 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 01805319e1e0..1aff970be33e 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) nf_reset(skb); skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); ip_local_out(skb); diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 6efe73c79c52..823ec7bb9c67 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -41,14 +41,13 @@ struct inet_peer { struct rcu_head gc_rcu; }; /* - * Once inet_peer is queued for deletion (refcnt == -1), following fields - * are not available: rid, ip_id_count + * Once inet_peer is queued for deletion (refcnt == -1), following field + * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ - atomic_t ip_id_count; /* IP ID for the next packet */ }; struct rcu_head rcu; struct inet_peer *gc_next; @@ -165,7 +164,7 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); /* - * temporary check to make sure we dont access rid, ip_id_count, tcp_ts, + * temporary check to make sure we dont access rid, tcp_ts, * tcp_ts_stamp if no refcount is taken on inet_peer */ static inline void inet_peer_refcheck(const struct inet_peer *p) @@ -173,20 +172,4 @@ static inline void inet_peer_refcheck(const struct inet_peer *p) WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); } - -/* can be called with or without local BH being disabled */ -static inline int inet_getid(struct inet_peer *p, int more) -{ - int old, new; - more++; - inet_peer_refcheck(p); - do { - old = atomic_read(&p->ip_id_count); - new = old + more; - if (!new) - new = 1; - } while (atomic_cmpxchg(&p->ip_id_count, old, new) != old); - return new; -} - #endif /* _NET_INETPEER_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 2e4947895d75..0e795df05ec9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -309,9 +309,19 @@ static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb) } } -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more); +#define IP_IDENTS_SZ 2048u +extern atomic_t *ip_idents; -static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk) +static inline u32 ip_idents_reserve(u32 hash, int segs) +{ + atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ; + + return atomic_add_return(segs, id_ptr) - segs; +} + +void __ip_select_ident(struct iphdr *iph, int segs); + +static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs) { struct iphdr *iph = ip_hdr(skb); @@ -321,24 +331,20 @@ static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, s * does not change, they drop every other packet in * a TCP stream using header compression. */ - iph->id = (sk && inet_sk(sk)->inet_daddr) ? - htons(inet_sk(sk)->inet_id++) : 0; - } else - __ip_select_ident(iph, dst, 0); -} - -static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more) -{ - struct iphdr *iph = ip_hdr(skb); - - if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += 1 + more; - } else + inet_sk(sk)->inet_id += segs; + } else { iph->id = 0; - } else - __ip_select_ident(iph, dst, more); + } + } else { + __ip_select_ident(iph, segs); + } +} + +static inline void ip_select_ident(struct sk_buff *skb, struct sock *sk) +{ + ip_select_ident_segs(skb, sk, 1); } static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ba810d0546bc..574337fe72dd 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -668,8 +668,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } -void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); - int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index f257486f17be..3f36d45b714a 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -3,8 +3,6 @@ #include -__u32 secure_ip_id(__be32 daddr); -__u32 secure_ipv6_id(const __be32 daddr[4]); u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 897da56f3aff..ba71212f0251 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -85,31 +85,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - net_secret_init(); - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ - __u32 hash[4]; - - net_secret_init(); - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); - - return hash[0]; -} __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 17d34e3c2ac3..6748d420f714 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index c98cf141f4ed..4ced1b9a97f0 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@ * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size when local fragmentation is disabled use a constant ID and do - * not use this code (see ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can @@ -62,7 +49,6 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -497,10 +483,6 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? - secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e231ab58d65..8d3b6b0e9857 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -430,8 +430,7 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(skb, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; @@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt) { iph->ihl += opt->optlen>>2; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index bcf206c79005..847e69cbff7e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2bc9cc47f246..65bcaa789043 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, skb_dst(skb), NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a9dbe58bdfe7..2c65160565e1 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4154eb76b0ad..082239ffe34a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; +atomic_t *ip_idents __read_mostly; +EXPORT_SYMBOL(ip_idents); - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct net *net = dev_net(dst->dev); - struct inet_peer *peer; + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); - if (peer) { - iph->id = htons(inet_getid(peer, more)); - inet_putpeer(peer); - return; - } + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - ip_select_fb_ident(iph); + hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -2711,6 +2692,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 05f2b484954f..91771a7c802f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(skb, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 85aaeca1f7f3..cb9df0eb4023 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -537,6 +537,18 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static u32 ip6_idents_hashrnd __read_mostly; + u32 hash, id; + + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); +} + int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 6313abd53c9d..6179ac186ab9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -8,36 +8,6 @@ #include #include -void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) -{ - static atomic_t ipv6_fragmentation_id; - struct in6_addr addr; - int old, new; - -#if IS_ENABLED(CONFIG_IPV6) - struct inet_peer *peer; - struct net *net; - - net = dev_net(rt->dst.dev); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - if (peer) { - fhdr->identification = htonl(inet_getid(peer, 0)); - inet_putpeer(peer); - return; - } -#endif - do { - old = atomic_read(&ipv6_fragmentation_id); - new = old + 1; - if (!new) - new = 1; - } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old); - - addr = rt->rt6i_dst.addr; - addr.s6_addr32[0] ^= (__force __be32)new; - fhdr->identification = htonl(secure_ipv6_id(addr.s6_addr32)); -} -EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 487b55e04337..73ba1cc7a88d 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -883,7 +883,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(skb, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; -- cgit v1.2.3-58-ga151 From 30f38d2fdd79f13fc929489f7e6e517b4a4bfe63 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 30 May 2014 13:34:39 -0600 Subject: fib_trie: use seq_file_net rather than seq->private Make fib_triestat_seq_show consistent with other /proc/net files and use seq_file_net. Signed-off-by: David Ahern Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7c..243c7f4e451e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2166,7 +2166,7 @@ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) static int fib_triestat_seq_show(struct seq_file *seq, void *v) { - struct net *net = (struct net *)seq->private; + struct net *net = seq_file_net(seq); unsigned int h; seq_printf(seq, -- cgit v1.2.3-58-ga151 From f830b0223cabfc614552a73dabff920859191f2e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 4 Jun 2014 18:02:32 -0400 Subject: net: Revert "fib_trie: use seq_file_net rather than seq->private" This reverts commit 30f38d2fdd79f13fc929489f7e6e517b4a4bfe63. fib_triestat is surrounded by a big lie: while it claims that it's a seq_file (fib_triestat_seq_open, fib_triestat_seq_show), it isn't: static const struct file_operations fib_triestat_fops = { .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, .release = single_release_net, }; Yes, fib_triestat is just a regular file. A small detail (assuming CONFIG_NET_NS=y) is that while for seq_files you could do seq_file_net() to get the net ptr, doing so for a regular file would be wrong and would dereference an invalid pointer. The fib_triestat lie claimed a victim, and trying to show the file would be bad for the kernel. This patch just reverts the issue and fixes fib_triestat, which still needs a rewrite to either be a seq_file or stop claiming it is. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 243c7f4e451e..5afeb5aa4c7c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2166,7 +2166,7 @@ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) static int fib_triestat_seq_show(struct seq_file *seq, void *v) { - struct net *net = seq_file_net(seq); + struct net *net = (struct net *)seq->private; unsigned int h; seq_printf(seq, -- cgit v1.2.3-58-ga151 From af5fcba7f38f3166392f4087ab734433c84f160b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:19:48 -0700 Subject: udp: Generic functions to set checksum Added udp_set_csum and udp6_set_csum functions to set UDP checksums in packets. These are for simple UDP packets such as those that might be created in UDP tunnels. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip6_checksum.h | 12 ++++++++++++ include/net/udp.h | 9 +++++++++ net/ipv4/udp.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv6/ip6_checksum.c | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+) (limited to 'net/ipv4') diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index 8ac5c21f8456..55236cb71174 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -82,5 +82,17 @@ static inline void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) } #endif +static inline __sum16 udp_v6_check(int len, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __wsum base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); +} + +void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len); + int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif diff --git a/include/net/udp.h b/include/net/udp.h index 5eb86874bcd6..2ecfc6e15609 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -147,6 +147,15 @@ static inline __wsum udp_csum(struct sk_buff *skb) return csum; } +static inline __sum16 udp_v4_check(int len, __be32 saddr, + __be32 daddr, __wsum base) +{ + return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); +} + +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len); + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline void udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e07d52b8617a..a84f6762ea9e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -762,6 +762,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) } EXPORT_SYMBOL_GPL(udp4_hwcsum); +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v4_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v4_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp_set_csum); + static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { struct sock *sk = skb->sk; diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index da26224a5993..9a4d7322fb22 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -84,3 +84,41 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) ip6_compute_pseudo); } EXPORT_SYMBOL(udp6_csum_init); + +/* Function to set UDP checksum for an IPv6 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v6_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp6_set_csum); -- cgit v1.2.3-58-ga151 From 7e2b10c1e52ca37fb522be49f4be367f9311d0cd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:02 -0700 Subject: net: Support for multiple checksums with gso When creating a GSO packet segment we may need to set more than one checksum in the packet (for instance a TCP checksum and UDP checksum for VXLAN encapsulation). To be efficient, we want to do checksum calculation for any part of the packet at most once. This patch adds csum_start offset to skb_gso_cb. This tracks the starting offset for skb->csum which is initially set in skb_segment. When a protocol needs to compute a transport checksum it calls gso_make_checksum which computes the checksum value from the start of transport header to csum_start and then adds in skb->csum to get the full checksum. skb->csum and csum_start are then updated to reflect the checksum of the resultant packet starting from the transport header. This patch also adds a flag to skbuff, encap_hdr_csum, which is set in *gso_segment fucntions to indicate that a tunnel protocol needs checksum calculation Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 26 +++++++++++++++++++++++++- net/core/skbuff.c | 8 +++++++- net/ipv4/ip_tunnel_core.c | 8 ++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7a9beeb1c458..d8d397acb52c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -567,7 +567,8 @@ struct sk_buff { * headers if needed */ __u8 encapsulation:1; - /* 6/8 bit hole (depending on ndisc_nodetype presence) */ + __u8 encap_hdr_csum:1; + /* 5/7 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL @@ -2988,6 +2989,7 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb) struct skb_gso_cb { int mac_offset; int encap_level; + __u16 csum_start; }; #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb) @@ -3012,6 +3014,28 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) return 0; } +/* Compute the checksum for a gso segment. First compute the checksum value + * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and + * then add in skb->csum (checksum from csum_start to end of packet). + * skb->csum and csum_start are then updated to reflect the checksum of the + * resultant packet starting from the transport header-- the resultant checksum + * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo + * header. + */ +static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) +{ + int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) - + skb_transport_offset(skb); + __u16 csum; + + csum = csum_fold(csum_partial(skb_transport_header(skb), + plen, skb->csum)); + skb->csum = res; + SKB_GSO_CB(skb)->csum_start -= plen; + + return csum; +} + static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3f6c7e8be8a4..05f4bef2ce12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2885,7 +2885,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !!can_checksum_protocol(features, proto); + csum = !head_skb->encap_hdr_csum && + !!can_checksum_protocol(features, proto); + __skb_push(head_skb, doffset); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -2983,6 +2985,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + offset; continue; } @@ -3052,6 +3056,8 @@ perform_csum_check: nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 847e69cbff7e..f4c987bb7e94 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -135,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } + /* If packet is not gso and we are resolving any partial checksum, + * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL + * on the outer header without confusing devices that implement + * NETIF_F_IP_CSUM with encapsulation. + */ + if (csum_help) + skb->encapsulation = 0; + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { err = skb_checksum_help(skb); if (unlikely(err)) -- cgit v1.2.3-58-ga151 From e9c3a24b3ace90b428848fdaf772ed264982abcc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:09 -0700 Subject: tcp: Call gso_make_checksum Call common gso_make_checksum when calculating checksum for a TCP GSO segment. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b92b81718ca4..d8de7b9e0720 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -97,9 +97,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = newcheck; if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); seq += mss; if (copy_destructor) { @@ -133,8 +131,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); + th->check = gso_make_checksum(skb, ~th->check); out: return segs; } -- cgit v1.2.3-58-ga151 From 0f4f4ffa7b7c3d29d0537a126145c9f8d8ed5dbc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:16 -0700 Subject: net: Add GSO support for UDP tunnels with checksum Added a new netif feature for GSO_UDP_TUNNEL_CSUM. This indicates that a device is capable of computing the UDP checksum in the encapsulating header of a UDP tunnel. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 + include/linux/skbuff.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/tcp_offload.c | 1 + net/ipv4/udp.c | 40 +++++++++++++++++++--------------------- net/ipv4/udp_offload.c | 4 +++- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 4 +++- 8 files changed, 31 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index c26d0ec2ef3a..f1338e0f9866 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -45,6 +45,7 @@ enum { NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ + NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ NETIF_F_GSO_MPLS_BIT, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8d397acb52c..5a6d10a538f5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,6 +345,8 @@ enum { SKB_GSO_UDP_TUNNEL = 1 << 9, SKB_GSO_MPLS = 1 << 10, + + SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, }; #if BITS_PER_LONG > 32 diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e9bb08a91e4..0070ab87109b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1258,6 +1258,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index d8de7b9e0720..c02f2d2e7bab 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -61,6 +61,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a84f6762ea9e..8d8c33d84c9a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2528,7 +2528,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; netdev_features_t enc_features; - int outer_hlen; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -2540,6 +2544,10 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); skb->protocol = htons(ETH_P_TEB); + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); @@ -2550,10 +2558,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; skb = segs; do { struct udphdr *uh; - int udp_offset = outer_hlen - tnl_hlen; + int len; skb_reset_inner_headers(skb); skb->encapsulation = 1; @@ -2564,31 +2573,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); - - /* csum segment if tunnel sets skb with csum. */ - if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { - struct iphdr *iph = ip_hdr(skb); + uh->len = htons(len); - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - udp_offset, - IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, - skb->len - udp_offset, 0)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; + if (need_csum) { + __be32 delta = htonl(oldlen + len); - } else if (protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u32 len = skb->len - udp_offset; + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - len, IPPROTO_UDP, 0); - uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; } skb->protocol = protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 88b4023ecfcf..5c23f4765af9 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -56,7 +56,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, __wsum csum; if (skb->encapsulation && - skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) { + (skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features); goto out; } @@ -71,6 +72,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b2f091566f88..d54c5744e3db 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -100,6 +100,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b261ee8b83fc..79da8b305ced 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -63,6 +63,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_GRE | SKB_GSO_IPIP | SKB_GSO_SIT | @@ -76,7 +77,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, goto out; } - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features); else { /* Do software UFO. Complete and fill in the UDP checksum as HW cannot -- cgit v1.2.3-58-ga151 From 4749c09c37030ccdc44aecebe0f71b02a377fc14 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 4 Jun 2014 17:20:23 -0700 Subject: gre: Call gso_make_checksum Call gso_make_checksum. This should have the benefit of using a checksum that may have been previously computed for the packet. This also adds NETIF_F_GSO_GRE_CSUM to differentiate devices that offload GRE GSO with and without the GRE checksum offloaed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ include/linux/skbuff.h | 2 ++ include/net/gre.h | 5 +++-- net/ipv4/af_inet.c | 1 + net/ipv4/gre_demux.c | 3 ++- net/ipv4/gre_offload.c | 10 ++++++++-- net/ipv4/tcp_offload.c | 1 + net/ipv4/udp_offload.c | 3 ++- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 1 + net/mpls/mpls_gso.c | 1 + 11 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index f1338e0f9866..e5a589435e2b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -42,6 +42,7 @@ enum { NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ + NETIF_F_GSO_GRE_CSUM_BIT, /* ... GRE with csum with TSO */ NETIF_F_GSO_IPIP_BIT, /* ... IPIP tunnel with TSO */ NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ @@ -112,6 +113,7 @@ enum { #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) +#define NETIF_F_GSO_GRE_CSUM __NETIF_F(GSO_GRE_CSUM) #define NETIF_F_GSO_IPIP __NETIF_F(GSO_IPIP) #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5a6d10a538f5..c705808bef9c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -347,6 +347,8 @@ enum { SKB_GSO_MPLS = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, + + SKB_GSO_GRE_CSUM = 1 << 12, }; #if BITS_PER_LONG > 32 diff --git a/include/net/gre.h b/include/net/gre.h index 70046a0b0b89..b53182018743 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -37,9 +37,10 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len); static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb, - bool gre_csum) + bool csum) { - return iptunnel_handle_offloads(skb, gre_csum, SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum, + csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0070ab87109b..d5e6836cf772 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1254,6 +1254,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_TCPV6 | diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index fbfd829f4049..4e9619bca732 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -84,7 +84,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, ptr--; } if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { *ptr = 0; *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, skb->len, 0)); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index f1d32280cb54..24deb3928b9e 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -42,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP))) goto out; @@ -55,6 +56,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, goto out; csum = !!(greh->flags & GRE_CSUM); + if (csum) + skb->encap_hdr_csum = 1; if (unlikely(!pskb_may_pull(skb, ghl))) goto out; @@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } } - greh = (struct gre_base_hdr *)(skb->data); + skb_reset_transport_header(skb); + + greh = (struct gre_base_hdr *) + skb_transport_header(skb); pcsum = (__be32 *)(greh + 1); *pcsum = 0; - *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } __skb_push(skb, tnl_hlen - ghl); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index c02f2d2e7bab..4e86c59ec7f7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -57,6 +57,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS | diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 5c23f4765af9..7b1840110173 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -74,7 +74,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d54c5744e3db..65eda2a8af48 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -97,6 +97,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 79da8b305ced..0ae3d98f83e0 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -65,6 +65,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | SKB_GSO_MPLS) || diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 851cd880b0c0..6b38d083e1c9 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -33,6 +33,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_MPLS))) goto out; -- cgit v1.2.3-58-ga151 From 4cb28970a23ff209199b0a4358d68efe82c8f493 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 2 Jun 2014 15:55:22 -0700 Subject: net: use the new API kvfree() It is available since v3.15-rc5. Cc: Pablo Neira Ayuso Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++-------- net/ipv4/tcp_metrics.c | 5 +---- net/netfilter/ipset/ip_set_core.c | 5 +---- net/netfilter/nft_hash.c | 5 +---- net/netfilter/xt_recent.c | 5 +---- net/sched/sch_choke.c | 7 +------ net/sched/sch_fq.c | 5 +---- net/sched/sch_fq_codel.c | 7 +------ net/sched/sch_hhf.c | 7 +------ net/sched/sch_netem.c | 7 +------ net/sched/sch_sfq.c | 7 +------ 11 files changed, 12 insertions(+), 58 deletions(-) (limited to 'net/ipv4') diff --git a/net/core/dev.c b/net/core/dev.c index 5367bfba0947..a9a08e4a4857 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5933,10 +5933,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -6410,10 +6407,7 @@ void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } /** diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index dcaf72f10216..4fe041805989 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1159,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net) tm = next; } } - if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) - vfree(net->ipv4.tcp_metrics_hash); - else - kfree(net->ipv4.tcp_metrics_hash); + kvfree(net->ipv4.tcp_metrics_hash); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 117208321f16..ec8114fae50b 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -271,10 +271,7 @@ ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); - if (is_vmalloc_addr(members)) - vfree(members); - else - kfree(members); + kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 1dfeb6786832..4080ed6a072b 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -76,10 +76,7 @@ static bool nft_hash_lookup(const struct nft_set *set, static void nft_hash_tbl_free(const struct nft_hash_table *tbl) { - if (is_vmalloc_addr(tbl)) - vfree(tbl); - else - kfree(tbl); + kvfree(tbl); } static unsigned int nft_hash_tbl_size(unsigned int nelem) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1e657cf715c4..a9faae89f955 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -313,10 +313,7 @@ out: static void recent_table_free(void *addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int recent_mt_check(const struct xt_mtchk_param *par, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2aee02802c27..ed30e436128b 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -391,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { static void choke_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static int choke_change(struct Qdisc *sch, struct nlattr *opt) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 23c682b42f99..ba32c2b005d0 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -591,10 +591,7 @@ static void *fq_alloc_node(size_t sz, int node) static void fq_free(void *addr) { - if (addr && is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int fq_resize(struct Qdisc *sch, u32 log) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0bf432c782c1..063b726bf1f8 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -365,12 +365,7 @@ static void *fq_codel_zalloc(size_t sz) static void fq_codel_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void fq_codel_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 6aab8619bbb0..d85b6812a7d4 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -494,12 +494,7 @@ static void *hhf_zalloc(size_t sz) static void hhf_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void hhf_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f1669a00f571..111d70fddaea 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -648,12 +648,7 @@ static void netem_reset(struct Qdisc *sch) static void dist_free(struct disttable *d) { - if (d) { - if (is_vmalloc_addr(d)) - vfree(d); - else - kfree(d); - } + kvfree(d); } /* diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 87317ff0b4ec..1af2f73906d0 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -716,12 +716,7 @@ static void *sfq_alloc(size_t sz) static void sfq_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) -- cgit v1.2.3-58-ga151 From ebbe495f19d565bb39be31ca28126a222b6a66dd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 2 Jun 2014 16:12:02 -0700 Subject: ipv4: use skb frags api in udp4_hwcsum() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/udp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8d8c33d84c9a..185ed3e59802 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -727,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); - struct sk_buff *frags = skb_shinfo(skb)->frag_list; int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; - if (!frags) { + if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ @@ -742,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { + struct sk_buff *frags; + /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - do { + skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; - } while ((frags = frags->next)); + } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3-58-ga151 From 586d5fc867be8f03c049b4b89fd29d0b8b02cab5 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Fri, 6 Jun 2014 04:34:37 +0400 Subject: ip_tunnel: fix possible rtable leak ip_rt_put(rt) is always called in "error" branches above, but was missed in skb_cow_head branch. As rt is not yet bound to skb here we have to release it by hand. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 2acc2337d38b..3f6135bc54ef 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -668,6 +668,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dev->needed_headroom = max_headroom; if (skb_cow_head(skb, dev->needed_headroom)) { + ip_rt_put(rt); dev->stats.tx_dropped++; kfree_skb(skb); return; -- cgit v1.2.3-58-ga151 From 6cc55e096f1f2a8585cf8dc9049862f2376f66d4 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 6 Jun 2014 17:32:37 +0300 Subject: tcp: add gfp parameter to tcp_fragment tcp_fragment can be called from process context (from tso_fragment). Add a new gfp parameter to allow it to preserve atomic memory if possible. Signed-off-by: Octavian Purdila Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_output.c | 15 ++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index e80abe4486cb..7286db80e8b8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -535,7 +535,7 @@ void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); int tcp_trim_head(struct sock *, struct sk_buff *, u32); -int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int); +int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 931529d5daa2..40661fc1e233 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1167,7 +1167,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -2241,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d463c35db33d..ad7549f1d0ad 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1074,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -1089,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, nsize, gfp); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ @@ -1625,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now); + return tcp_fragment(sk, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) @@ -2122,7 +2122,8 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + GFP_ATOMIC))) goto rearm_timer; skb = tcp_write_queue_tail(sk); } @@ -2463,7 +2464,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { int oldpcount = tcp_skb_pcount(skb); @@ -3244,7 +3245,7 @@ int tcp_write_wakeup(struct sock *sk) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) + if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); -- cgit v1.2.3-58-ga151 From f8c1b7ce00254a5bb75d5b5e5ef1601326a0e08e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 6 Jun 2014 08:10:00 -0700 Subject: gre: allow changing mac address when device is up There is no need to require forcing device down on a Ethernet GRE (gretap) tunnel to change the MAC address. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c5a557a06a31..9b842544aea3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -649,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } -- cgit v1.2.3-58-ga151 From 2346829e641b804ece9ac9298136b56d9567c278 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Fri, 6 Jun 2014 23:19:21 +0400 Subject: ipip, sit: fix ipv4_{update_pmtu,redirect} calls ipv4_{update_pmtu,redirect} were called with tunnel's ifindex (t->dev is a tunnel netdevice). It caused wrong route lookup and failure of pmtu update or redirect. We should use the same ifindex that we use in ip_route_output_* in *tunnel_xmit code. It is t->parms.link . Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 4 ++-- net/ipv6/sit.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 812b18351462..09680ddbc677 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPIP, 0); + t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, IPPROTO_IPIP, 0); err = 0; goto out; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e5a453ca302e..45397b2a4a0b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -560,12 +560,12 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->dev->ifindex, 0, IPPROTO_IPV6, 0); + t->parms.link, 0, IPPROTO_IPV6, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, IPPROTO_IPV6, 0); err = 0; goto out; -- cgit v1.2.3-58-ga151 From 7c8e6b9c2811fd37702a9043eabea3545022011e Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Sun, 8 Jun 2014 02:06:25 +0400 Subject: ip_vti: Fix 'ip tunnel add' with 'key' parameters ip tunnel add remote 10.2.2.1 local 10.2.2.2 mode vti ikey 1 okey 2 translates to p->iflags = VTI_ISVTI|GRE_KEY and p->i_key = 1, but GRE_KEY != TUNNEL_KEY, so ip_tunnel_ioctl would set i_key to 0 (same story with o_key) making us unable to create vti tunnels with [io]key via ip tunnel. We cannot simply translate GRE_KEY to TUNNEL_KEY (as GRE module does) because vti_tunnels with same local/remote addresses but different ikeys will be treated as different then. So, imo the best option here is to move p->i_flags & *_KEY check for vti tunnels from ip_tunnel.c to ip_vti.c and to think about [io]_mark field for ip_tunnel_parm in the future. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 10 ++++++---- net/ipv4/ip_vti.c | 8 +++++++- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 3f6135bc54ef..3dbb550abb30 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -748,10 +748,12 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags&TUNNEL_KEY)) - p->i_key = 0; - if (!(p->o_flags&TUNNEL_KEY)) - p->o_key = 0; + if (!(p->i_flags & VTI_ISVTI)) { + if (!(p->i_flags & TUNNEL_KEY)) + p->i_key = 0; + if (!(p->o_flags & TUNNEL_KEY)) + p->o_key = 0; + } t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 13ef00f1e17b..b8960f3527f3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -313,7 +313,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_flags |= VTI_ISVTI; + if (!(p.i_flags & GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags & GRE_KEY)) + p.o_key = 0; + + p.i_flags = VTI_ISVTI; + err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; -- cgit v1.2.3-58-ga151 From 5ce54af1fc9d2718d46c9fd92a161379fb197266 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Sun, 8 Jun 2014 03:03:08 +0400 Subject: ip_tunnel: fix i_key matching in ip_tunnel_find Some tunnels (though only vti as for now) can use i_key just for internal use: for example vti uses it for fwmark'ing incoming packets. So raw i_key value shouldn't be treated as a distinguisher for them. ip_tunnel_key_match exists for cases when we want to compare two ip_tunnel_parms' i_keys. Example bug: ip link add type vti ikey 1 local 1.0.0.1 remote 2.0.0.2 ip link add type vti ikey 2 local 1.0.0.1 remote 2.0.0.2 spawned two tunnels, although it doesn't make sense. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 3dbb550abb30..9b553157e556 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -268,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; + __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); @@ -275,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - key == t->parms.i_key && link == t->parms.link && - type == t->dev->type) + type == t->dev->type && + ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; -- cgit v1.2.3-58-ga151 From 9709674e68646cee5a24e3000b3558d25412203a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Jun 2014 06:43:01 -0700 Subject: ipv4: fix a race in ip4_datagram_release_cb() Alexey gave a AddressSanitizer[1] report that finally gave a good hint at where was the origin of various problems already reported by Dormando in the past [2] Problem comes from the fact that UDP can have a lockless TX path, and concurrent threads can manipulate sk_dst_cache, while another thread, is holding socket lock and calls __sk_dst_set() in ip4_datagram_release_cb() (this was added in linux-3.8) It seems that all we need to do is to use sk_dst_check() and sk_dst_set() so that all the writers hold same spinlock (sk->sk_dst_lock) to prevent corruptions. TCP stack do not need this protection, as all sk_dst_cache writers hold the socket lock. [1] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel AddressSanitizer: heap-use-after-free in ipv4_dst_check Read of size 2 by thread T15453: [] ipv4_dst_check+0x1a/0x90 ./net/ipv4/route.c:1116 [] __sk_dst_check+0x89/0xe0 ./net/core/sock.c:531 [] ip4_datagram_release_cb+0x46/0x390 ??:0 [] release_sock+0x17a/0x230 ./net/core/sock.c:2413 [] ip4_datagram_connect+0x462/0x5d0 ??:0 [] inet_dgram_connect+0x76/0xd0 ./net/ipv4/af_inet.c:534 [] SYSC_connect+0x15c/0x1c0 ./net/socket.c:1701 [] SyS_connect+0xe/0x10 ./net/socket.c:1682 [] system_call_fastpath+0x16/0x1b ./arch/x86/kernel/entry_64.S:629 Freed by thread T15455: [] dst_destroy+0xa8/0x160 ./net/core/dst.c:251 [] dst_release+0x45/0x80 ./net/core/dst.c:280 [] ip4_datagram_connect+0xa1/0x5d0 ??:0 [] inet_dgram_connect+0x76/0xd0 ./net/ipv4/af_inet.c:534 [] SYSC_connect+0x15c/0x1c0 ./net/socket.c:1701 [] SyS_connect+0xe/0x10 ./net/socket.c:1682 [] system_call_fastpath+0x16/0x1b ./arch/x86/kernel/entry_64.S:629 Allocated by thread T15453: [] dst_alloc+0x81/0x2b0 ./net/core/dst.c:171 [] rt_dst_alloc+0x47/0x50 ./net/ipv4/route.c:1406 [< inlined >] __ip_route_output_key+0x3e8/0xf70 __mkroute_output ./net/ipv4/route.c:1939 [] __ip_route_output_key+0x3e8/0xf70 ./net/ipv4/route.c:2161 [] ip_route_output_flow+0x14/0x30 ./net/ipv4/route.c:2249 [] ip4_datagram_connect+0x317/0x5d0 ??:0 [] inet_dgram_connect+0x76/0xd0 ./net/ipv4/af_inet.c:534 [] SYSC_connect+0x15c/0x1c0 ./net/socket.c:1701 [] SyS_connect+0xe/0x10 ./net/socket.c:1682 [] system_call_fastpath+0x16/0x1b ./arch/x86/kernel/entry_64.S:629 [2] <4>[196727.311203] general protection fault: 0000 [#1] SMP <4>[196727.311224] Modules linked in: xt_TEE xt_dscp xt_DSCP macvlan bridge coretemp crc32_pclmul ghash_clmulni_intel gpio_ich microcode ipmi_watchdog ipmi_devintf sb_edac edac_core lpc_ich mfd_core tpm_tis tpm tpm_bios ipmi_si ipmi_msghandler isci igb libsas i2c_algo_bit ixgbe ptp pps_core mdio <4>[196727.311333] CPU: 17 PID: 0 Comm: swapper/17 Not tainted 3.10.26 #1 <4>[196727.311344] Hardware name: Supermicro X9DRi-LN4+/X9DR3-LN4+/X9DRi-LN4+/X9DR3-LN4+, BIOS 3.0 07/05/2013 <4>[196727.311364] task: ffff885e6f069700 ti: ffff885e6f072000 task.ti: ffff885e6f072000 <4>[196727.311377] RIP: 0010:[] [] ipv4_dst_destroy+0x4f/0x80 <4>[196727.311399] RSP: 0018:ffff885effd23a70 EFLAGS: 00010282 <4>[196727.311409] RAX: dead000000200200 RBX: ffff8854c398ecc0 RCX: 0000000000000040 <4>[196727.311423] RDX: dead000000100100 RSI: dead000000100100 RDI: dead000000200200 <4>[196727.311437] RBP: ffff885effd23a80 R08: ffffffff815fd9e0 R09: ffff885d5a590800 <4>[196727.311451] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 <4>[196727.311464] R13: ffffffff81c8c280 R14: 0000000000000000 R15: ffff880e85ee16ce <4>[196727.311510] FS: 0000000000000000(0000) GS:ffff885effd20000(0000) knlGS:0000000000000000 <4>[196727.311554] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[196727.311581] CR2: 00007a46751eb000 CR3: 0000005e65688000 CR4: 00000000000407e0 <4>[196727.311625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 <4>[196727.311669] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 <4>[196727.311713] Stack: <4>[196727.311733] ffff8854c398ecc0 ffff8854c398ecc0 ffff885effd23ab0 ffffffff815b7f42 <4>[196727.311784] ffff88be6595bc00 ffff8854c398ecc0 0000000000000000 ffff8854c398ecc0 <4>[196727.311834] ffff885effd23ad0 ffffffff815b86c6 ffff885d5a590800 ffff8816827821c0 <4>[196727.311885] Call Trace: <4>[196727.311907] <4>[196727.311912] [] dst_destroy+0x32/0xe0 <4>[196727.311959] [] dst_release+0x56/0x80 <4>[196727.311986] [] tcp_v4_do_rcv+0x2a5/0x4a0 <4>[196727.312013] [] tcp_v4_rcv+0x7da/0x820 <4>[196727.312041] [] ? ip_rcv_finish+0x360/0x360 <4>[196727.312070] [] ? nf_hook_slow+0x7d/0x150 <4>[196727.312097] [] ? ip_rcv_finish+0x360/0x360 <4>[196727.312125] [] ip_local_deliver_finish+0xb2/0x230 <4>[196727.312154] [] ip_local_deliver+0x4a/0x90 <4>[196727.312183] [] ip_rcv_finish+0x119/0x360 <4>[196727.312212] [] ip_rcv+0x22b/0x340 <4>[196727.312242] [] ? macvlan_broadcast+0x160/0x160 [macvlan] <4>[196727.312275] [] __netif_receive_skb_core+0x512/0x640 <4>[196727.312308] [] ? kmem_cache_alloc+0x13b/0x150 <4>[196727.312338] [] __netif_receive_skb+0x21/0x70 <4>[196727.312368] [] netif_receive_skb+0x31/0xa0 <4>[196727.312397] [] napi_gro_receive+0xe8/0x140 <4>[196727.312433] [] ixgbe_poll+0x551/0x11f0 [ixgbe] <4>[196727.312463] [] ? ip_rcv+0x22b/0x340 <4>[196727.312491] [] net_rx_action+0x111/0x210 <4>[196727.312521] [] ? __netif_receive_skb+0x21/0x70 <4>[196727.312552] [] __do_softirq+0xd0/0x270 <4>[196727.312583] [] call_softirq+0x1c/0x30 <4>[196727.312613] [] do_softirq+0x55/0x90 <4>[196727.312640] [] irq_exit+0x55/0x60 <4>[196727.312668] [] do_IRQ+0x63/0xe0 <4>[196727.312696] [] common_interrupt+0x6a/0x6a <4>[196727.312722] <1>[196727.313071] RIP [] ipv4_dst_destroy+0x4f/0x80 <4>[196727.313100] RSP <4>[196727.313377] ---[ end trace 64b3f14fae0f2e29 ]--- <0>[196727.380908] Kernel panic - not syncing: Fatal exception in interrupt Reported-by: Alexey Preobrazhensky Reported-by: dormando Signed-off-by: Eric Dumazet Fixes: 8141ed9fcedb2 ("ipv4: Add a socket release callback for datagram sockets") Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 8b5134c582f1..a3095fdefbed 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -86,18 +86,26 @@ out: } EXPORT_SYMBOL(ip4_datagram_connect); +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. + */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; + struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; - if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0)) - return; - rcu_read_lock(); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + rcu_read_unlock(); + return; + } inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; @@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk) inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); - if (!IS_ERR(rt)) - __sk_dst_set(sk, &rt->dst); + + dst = !IS_ERR(rt) ? &rt->dst : NULL; + sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); -- cgit v1.2.3-58-ga151 From 7e3cead5172927732f51fde77fef6f521e22f209 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Jun 2014 18:54:19 -0700 Subject: net: Save software checksum complete In skb_checksum complete, if we need to compute the checksum for the packet (via skb_checksum) save the result as CHECKSUM_COMPLETE. Subsequent checksum verification can use this. Also, added csum_complete_sw flag to distinguish between software and hardware generated checksum complete, we should always be able to trust the software computation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- net/core/datagram.c | 14 +++++++++----- net/ipv4/gre_offload.c | 6 ++++-- net/sunrpc/socklib.c | 3 ++- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72a53805858a..5b5cd3189c98 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -573,7 +573,8 @@ struct sk_buff { __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; - /* 4/6 bit hole (depending on ndisc_nodetype presence) */ + __u8 csum_complete_sw:1; + /* 3/5 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL diff --git a/net/core/datagram.c b/net/core/datagram.c index a16ed7bbe376..6b1c04ca1d50 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -739,11 +739,15 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) __sum16 sum; sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !sum && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + + /* Save checksum complete for later use */ + skb->csum = sum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 24deb3928b9e..eb92deb12666 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -131,10 +131,12 @@ static __sum16 gro_skb_checksum(struct sk_buff *skb) csum_partial(skb->data, skb_gro_offset(skb), 0)); sum = csum_fold(NAPI_GRO_CB(skb)->csum); if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { - if (unlikely(!sum)) + if (unlikely(!sum) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); - } else + } else { skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + } return sum; } diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 0a648c502fc3..2df87f78e518 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -173,7 +173,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (csum_fold(desc.csum)) return -1; - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); return 0; no_checksum: -- cgit v1.2.3-58-ga151 From 6bae1d4cc395ad46613e40c9e865ee171dc9de5c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Jun 2014 18:54:26 -0700 Subject: net: Add skb_gro_postpull_rcsum to udp and vxlan Need to gro_postpull_rcsum for GRO to work with checksum complete. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 ++ net/ipv4/udp_offload.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 4e2caaf8b5da..1610d51dbb5c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -565,6 +565,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff goto out; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); @@ -599,6 +600,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff } skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ + skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = ptype->callbacks.gro_receive(head, skb); out_unlock: diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 7b1840110173..546d2d439dda 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -200,6 +200,7 @@ unflush: } skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = uo_priv->offload->callbacks.gro_receive(head, skb); out_unlock: -- cgit v1.2.3-58-ga151 From bef1909ee3ed1ca39231b260a8d3b4544ecd0c8f Mon Sep 17 00:00:00 2001 From: Per Hurtig Date: Thu, 12 Jun 2014 17:08:32 +0200 Subject: tcp: fixing TLP's FIN recovery Fix to a problem observed when losing a FIN segment that does not contain data. In such situations, TLP is unable to recover from *any* tail loss and instead adds at least PTO ms to the retransmission process, i.e., RTO = RTO + PTO. Signed-off-by: Per Hurtig Signed-off-by: Eric Dumazet Acked-by: Nandita Dukkipati Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad7549f1d0ad..819bf0c8fbde 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2131,9 +2131,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - /* Probe with zero data doesn't trigger fast recovery. */ - if (skb->len > 0) - err = __tcp_retransmit_skb(sk, skb); + err = __tcp_retransmit_skb(sk, skb); /* Record snd_nxt for loss detection. */ if (likely(!err)) -- cgit v1.2.3-58-ga151