diff options
author | Eric Dumazet <edumazet@google.com> | 2016-04-21 10:55:23 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-04-24 14:43:59 -0400 |
commit | 10d3be569243def8d92ac3722395ef5a59c504e6 (patch) | |
tree | ac01b70cff99ad2e59c54e80ddcd524eaa9691a8 /net | |
parent | 8cee83dd29dea4e7d27fda3b170381059f628868 (diff) |
tcp-tso: do not split TSO packets at retransmit time
Linux TCP stack painfully segments all TSO/GSO packets before retransmits.
This was fine back in the days when TSO/GSO were emerging, with their
bugs, but we believe the dark age is over.
Keeping big packets in write queues, but also in stack traversal
has a lot of benefits.
- Less memory overhead, because write queues have less skbs
- Less cpu overhead at ACK processing.
- Better SACK processing, as lot of studies mentioned how
awful linux was at this ;)
- Less cpu overhead to send the rtx packets
(IP stack traversal, netfilter traversal, drivers...)
- Better latencies in presence of losses.
- Smaller spikes in fq like packet schedulers, as retransmits
are not constrained by TCP Small Queues.
1 % packet losses are common today, and at 100Gbit speeds, this
translates to ~80,000 losses per second.
Losses are often correlated, and we see many retransmit events
leading to 1-MSS train of packets, at the time hosts are already
under stress.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/tcp_input.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 64 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 |
3 files changed, 32 insertions, 38 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 75e8336f6ecd..dcad8f9f96eb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, if (data) { /* Retransmit unacked data in SYN */ tcp_for_write_queue_from(data, sk) { if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data)) + __tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a6e4a8353b02..9d3b4b364652 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - if (__tcp_retransmit_skb(sk, skb)) + if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; /* Record snd_nxt for loss detection. */ @@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ -int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; - int err; + int diff, len, err; + - /* Inconslusive MTU probe */ - if (icsk->icsk_mtup.probe_size) { + /* Inconclusive MTU probe */ + if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - } /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc. @@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; - if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) + len = cur_mss * segs; + if (skb->len > len) { + if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - int oldpcount = tcp_skb_pcount(skb); + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; - if (unlikely(oldpcount > 1)) { - if (skb_unclone(skb, GFP_ATOMIC)) - return -ENOMEM; - tcp_init_tso_segs(skb, cur_mss); - tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); - } + diff = tcp_skb_pcount(skb); + tcp_set_skb_tso_segs(skb, cur_mss); + diff -= tcp_skb_pcount(skb); + if (diff) + tcp_adjust_pcount(sk, skb, diff); + if (skb->len < cur_mss) + tcp_retrans_try_collapse(sk, skb, cur_mss); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); - tcp_retrans_try_collapse(sk, skb, cur_mss); - - /* Make a copy, if the first transmission SKB clone we made - * is still in somebody's hands, else make a clone. - */ - /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. @@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } if (likely(!err)) { + segs = tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans++; + tp->total_retrans += segs; } return err; } -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); - int err = __tcp_retransmit_skb(sk, skb); + int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 @@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + int segs; if (skb == tcp_send_head(sk)) break; @@ -2764,14 +2764,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!hole) tp->retransmit_skb_hint = skb; - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + segs = tp->snd_cwnd - tcp_packets_in_flight(tp); + if (segs <= 0) return; if (fwd_rexmitting) { @@ -2808,7 +2802,7 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb, segs)) return; NET_INC_STATS_BH(sock_net(sk), mib_idx); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 49bc474f8e35..373b03e78aaa 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ |