summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2017-10-04 12:59:58 -0700
committerDavid S. Miller <davem@davemloft.net>2017-10-05 21:24:47 -0700
commite2080072ed2d98a55ae69d95dea60ff7a17cddd5 (patch)
tree6ad479e3db638db9c6469cab047b59685d04c5b1 /include
parentb1fb67fa501c4787035317f84db6caf013385581 (diff)
tcp: new list for sent but unacked skbs for RACK recovery
This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP Rack recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterates through SACKed skbs repeatedly. Special cares for rare events: 1. TCP repair fakes skb transmission so the send queue needs adjusted 2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, send-queue correctly queues the pure SYN packet. For TFO which queues a pure SYN and then a data packet, send-queue only queues the data packet but not the pure SYN due to the structure of TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order to not grow sk_buff, we use an union for the new list and _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/skbuff.h11
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/tcp.h24
3 files changed, 33 insertions, 3 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ada821466e88..01a985937867 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
+ * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
*/
char cb[48] __aligned(8);
- unsigned long _skb_refdst;
- void (*destructor)(struct sk_buff *skb);
+ union {
+ struct {
+ unsigned long _skb_refdst;
+ void (*destructor)(struct sk_buff *skb);
+ };
+ struct list_head tcp_tsorted_anchor;
+ };
+
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..1d2c44e09e31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
u32 tsoffset; /* timestamp offset */
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 426c2e986016..3b16f353b539 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1589,14 +1589,34 @@ enum tcp_chrono {
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+ skb->destructor = NULL;
+ skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) { \
+ unsigned long _save = skb->_skb_refdst; \
+ skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb) \
+ skb->_skb_refdst = _save; \
+}
+
/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+ while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb);
+ }
+ INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
}
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
+ list_del(&skb->tcp_tsorted_anchor);
+ tcp_skb_tsorted_anchor_cleanup(skb);
__skb_unlink(skb, &sk->sk_write_queue);
}