author      David S. Miller <davem@davemloft.net>   2021-09-23 12:50:26 +0100
committer   David S. Miller <davem@davemloft.net>   2021-09-23 12:50:26 +0100
commit      5146a574606ae6543adeb04471cc4903ee169fb6 (patch)
tree        1976a34b63ebd315e6741ceb3bf582fe0e63a4bc
parent      efe686ffce0145418b4e004f95c6d0c0200aedd2 (diff)
parent      d8b81175e412c7abebdb5b37d8a84d5fd19b1aad (diff)
Merge branch 'remove-sk-skb-caches'
Paolo Abeni says:
====================
net: remove sk skb caches
Eric noted we would be better off reverting the sk
skb caches.
MPTCP relies on that feature, so the MPTCP tx path
needs a small refactor before the revert.
The first patch exposes additional TCP helpers. The second patch
changes the MPTCP code to do the whole skb allocation and update
locally, so that it no longer relies on the core TCP helpers for
that, nor on the sk skb cache.
As a side effect, we can make the tcp_build_frag helper static.
Finally, we can pull Eric's revert.
RFC -> v1:
- drop driver specific patch - no longer needed after the helper rename
- rename skb_entail -> tcp_skb_entail (Eric)
- preserve the tcp_build_frag helper, just make it static (Eric)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
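
At a glance, the MPTCP side of the series boils down to allocating the tx skb locally and queueing it on the subflow right away via the newly exported tcp_skb_entail(), instead of parking it in ssk->sk_tx_skb_cache. A condensed sketch, paraphrased from the net/mptcp/protocol.c hunks below (comments added here; not a verbatim excerpt of the resulting tree):

```c
/* Allocate a tx skb for the MPTCP-level socket @sk and immediately
 * queue it on the subflow socket @ssk, replacing the old
 * ssk->sk_tx_skb_cache based scheme.
 */
static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
	struct sk_buff *skb;

	/* reserves MAX_TCP_HEADER and attaches the MPTCP skb extension */
	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
	if (!skb)
		return NULL;

	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
		/* queue the skb on the subflow write queue right away,
		 * using the helper exported by the first patch
		 */
		tcp_skb_entail(ssk, skb);
		return skb;
	}
	kfree_skb(skb);
	return NULL;
}
```

The TCP-level helpers used by this path, tcp_skb_entail() and tcp_mark_push(), are the ones exported by the first patch of the series.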
-rw-r--r--   Documentation/networking/ip-sysctl.rst |   8
-rw-r--r--   include/net/sock.h                     |  19
-rw-r--r--   include/net/tcp.h                      |   4
-rw-r--r--   net/ipv4/af_inet.c                     |   4
-rw-r--r--   net/ipv4/sysctl_net_ipv4.c             |  12
-rw-r--r--   net/ipv4/tcp.c                         |  38
-rw-r--r--   net/ipv4/tcp_ipv4.c                    |   6
-rw-r--r--   net/ipv6/tcp_ipv6.c                    |   6
-rw-r--r--   net/mptcp/protocol.c                   | 137
9 files changed, 85 insertions, 149 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index d91ab28718d4..16b8bf72feaf 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -989,14 +989,6 @@ tcp_challenge_ack_limit - INTEGER
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
 	Default: 1000
 
-tcp_rx_skb_cache - BOOLEAN
-	Controls a per TCP socket cache of one skb, that might help
-	performance of some workloads. This might be dangerous
-	on systems with a lot of TCP sockets, since it increases
-	memory usage.
-
-	Default: 0 (disabled)
-
 UDP variables
 =============
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 66a9a90f9558..708b9de3cdbb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -262,7 +262,6 @@ struct bpf_local_storage;
  *	@sk_dst_cache: destination cache
  *	@sk_dst_pending_confirm: need to confirm neighbour
  *	@sk_policy: flow policy
- *	@sk_rx_skb_cache: cache copy of recently accessed RX skb
  *	@sk_receive_queue: incoming packets
  *	@sk_wmem_alloc: transmit queue bytes committed
  *	@sk_tsq_flags: TCP Small Queues flags
@@ -328,7 +327,6 @@ struct bpf_local_storage;
  *	@sk_peek_off: current peek_offset value
  *	@sk_send_head: front of stuff to transmit
  *	@tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
- *	@sk_tx_skb_cache: cache copy of recently accessed TX skb
  *	@sk_security: used by security modules
  *	@sk_mark: generic packet mark
  *	@sk_cgrp_data: cgroup data for this cgroup
@@ -393,7 +391,6 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
-	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -442,7 +439,6 @@ struct sock {
 		struct sk_buff	*sk_send_head;
 		struct rb_root	tcp_rtx_queue;
 	};
-	struct sk_buff		*sk_tx_skb_cache;
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
@@ -1555,18 +1551,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, 1 << 20);
 }
 
-DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
 	sk_mem_uncharge(sk, skb->truesize);
-	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
-	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
-		skb_ext_reset(skb);
-		skb_zcopy_clear(skb, true);
-		sk->sk_tx_skb_cache = skb;
-		return;
-	}
 	__kfree_skb(skb);
 }
 
@@ -2575,7 +2563,6 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 			   &skb_shinfo(skb)->tskey);
 }
 
-DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
 /**
  * sk_eat_skb - Release a skb if it is no longer needed
  * @sk: socket to eat this skb from
@@ -2587,12 +2574,6 @@ DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
-	if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
-	    !sk->sk_rx_skb_cache) {
-		sk->sk_rx_skb_cache = skb;
-		skb_orphan(skb);
-		return;
-	}
 	__kfree_skb(skb);
 }
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3166dc15d7d6..673c3b01e287 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -330,8 +330,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
-			       struct page *page, int offset, size_t *size);
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			 size_t size, int flags);
 int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
@@ -581,6 +579,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 #endif
 /* tcp_output.c */
 
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle);
 int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1d816a5fd3eb..40558033f857 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -133,10 +133,6 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 
 	__skb_queue_purge(&sk->sk_receive_queue);
-	if (sk->sk_rx_skb_cache) {
-		__kfree_skb(sk->sk_rx_skb_cache);
-		sk->sk_rx_skb_cache = NULL;
-	}
 	__skb_queue_purge(&sk->sk_error_queue);
 
 	sk_mem_reclaim(sk);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4680268f2e59..97eb54774924 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -585,18 +585,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &sysctl_fib_sync_mem_min,
 		.extra2		= &sysctl_fib_sync_mem_max,
 	},
-	{
-		.procname	= "tcp_rx_skb_cache",
-		.data		= &tcp_rx_skb_cache_key.key,
-		.mode		= 0644,
-		.proc_handler	= proc_do_static_key,
-	},
-	{
-		.procname	= "tcp_tx_skb_cache",
-		.data		= &tcp_tx_skb_cache_key.key,
-		.mode		= 0644,
-		.proc_handler	= proc_do_static_key,
-	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..414c179c28e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -325,11 +325,6 @@ struct tcp_splice_state {
 unsigned long tcp_memory_pressure __read_mostly;
 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
 
-DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
-EXPORT_SYMBOL(tcp_rx_skb_cache_key);
-
-DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-
 void tcp_enter_memory_pressure(struct sock *sk)
 {
 	unsigned long val;
@@ -647,7 +642,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(tcp_ioctl);
 
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
 	tp->pushed_seq = tp->write_seq;
@@ -658,7 +653,7 @@ static inline bool forced_push(const struct tcp_sock *tp)
 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 }
 
-static void skb_entail(struct sock *sk, struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -866,18 +861,6 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;
 
-	if (likely(!size)) {
-		skb = sk->sk_tx_skb_cache;
-		if (skb) {
-			skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
-			sk->sk_tx_skb_cache = NULL;
-			pskb_trim(skb, 0);
-			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
-			skb_shinfo(skb)->tx_flags = 0;
-			memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
-			return skb;
-		}
-	}
 	/* The TCP header must be at least 32-bit aligned.  */
 	size = ALIGN(size, 4);
 
@@ -963,8 +946,8 @@ void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
-			       struct page *page, int offset, size_t *size)
+static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+				      struct page *page, int offset, size_t *size)
 {
 	struct sk_buff *skb = tcp_write_queue_tail(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -985,7 +968,7 @@ new_segment:
 #ifdef CONFIG_TLS_DEVICE
 		skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
 #endif
-		skb_entail(sk, skb);
+		tcp_skb_entail(sk, skb);
 		copy = size_goal;
 	}
 
@@ -1314,7 +1297,7 @@ new_segment:
 			process_backlog++;
 			skb->ip_summed = CHECKSUM_PARTIAL;
 
-			skb_entail(sk, skb);
+			tcp_skb_entail(sk, skb);
 			copy = size_goal;
 
 			/* All packets are restored as if they have
@@ -2920,11 +2903,6 @@ void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
-	skb = sk->sk_tx_skb_cache;
-	if (skb) {
-		__kfree_skb(skb);
-		sk->sk_tx_skb_cache = NULL;
-	}
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2961,10 +2939,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
-	if (sk->sk_rx_skb_cache) {
-		__kfree_skb(sk->sk_rx_skb_cache);
-		sk->sk_rx_skb_cache = NULL;
-	}
 	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e62e0d6373a..29a57bd159f0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1941,7 +1941,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
-	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	int dif = inet_iif(skb);
 	const struct iphdr *iph;
@@ -2082,17 +2081,12 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		skb_to_free = sk->sk_rx_skb_cache;
-		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
 	} else {
 		if (tcp_add_backlog(sk, skb))
 			goto discard_and_relse;
-		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-	if (skb_to_free)
-		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0ce52d46e4f8..8cf5ff2e9504 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1618,7 +1618,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 
 INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 {
-	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	int dif = inet6_iif(skb);
 	const struct tcphdr *th;
@@ -1754,17 +1753,12 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		skb_to_free = sk->sk_rx_skb_cache;
-		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
 	} else {
 		if (tcp_add_backlog(sk, skb))
 			goto discard_and_relse;
-		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-	if (skb_to_free)
-		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
 		sock_put(sk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index dbcebf56798f..7e5f76092b64 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 		if (likely(__mptcp_add_ext(skb, gfp))) {
 			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->reserved_tailroom = skb->end - skb->tail;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
@@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 	return NULL;
 }
 
-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
 	struct sk_buff *skb;
 
-	if (ssk->sk_tx_skb_cache) {
-		skb = ssk->sk_tx_skb_cache;
-		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
-			     !__mptcp_add_ext(skb, gfp)))
-			return false;
-		return true;
-	}
-
 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
 	if (!skb)
-		return false;
+		return NULL;
 
 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-		ssk->sk_tx_skb_cache = skb;
-		return true;
+		tcp_skb_entail(ssk, skb);
+		return skb;
 	}
 	kfree_skb(skb);
-	return false;
+	return NULL;
 }
 
-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
 {
 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
 
@@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct mptcp_sendmsg_info *info)
 {
 	u64 data_seq = dfrag->data_seq + info->sent;
+	int offset = dfrag->offset + info->sent;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool zero_window_probe = false;
 	struct mptcp_ext *mpext = NULL;
-	struct sk_buff *skb, *tail;
-	bool must_collapse = false;
-	int size_bias = 0;
-	int avail_size;
-	size_t ret = 0;
+	bool can_coalesce = false;
+	bool reuse_skb = true;
+	struct sk_buff *skb;
+	size_t copy;
+	int i;
 
 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
 
+	if (WARN_ON_ONCE(info->sent > info->limit ||
+			 info->limit > dfrag->data_len))
+		return 0;
+
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
-	avail_size = info->size_goal;
+	copy = info->size_goal;
+
 	skb = tcp_write_queue_tail(ssk);
-	if (skb) {
+	if (skb && copy > skb->len) {
 		/* Limit the write to the size available in the
 		 * current skb, if any, so that we create at most a new skb.
 		 * Explicitly tells TCP internals to avoid collapsing on later
@@ -1316,62 +1315,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			goto alloc_skb;
 		}
 
-		must_collapse = (info->size_goal > skb->len) &&
-				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
-		if (must_collapse) {
-			size_bias = skb->len;
-			avail_size = info->size_goal - skb->len;
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
+			tcp_mark_push(tcp_sk(ssk), skb);
+			goto alloc_skb;
 		}
-	}
 
+		copy -= skb->len;
+	} else {
 alloc_skb:
-	if (!must_collapse &&
-	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
-		return 0;
+		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+		if (!skb)
+			return -ENOMEM;
+
+		i = skb_shinfo(skb)->nr_frags;
+		reuse_skb = false;
+		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+	}
 
 	/* Zero window and all data acked? Probe. */
-	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
-	if (avail_size == 0) {
+	copy = mptcp_check_allowed_size(msk, data_seq, copy);
+	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);
 
-		if (skb || snd_una != msk->snd_nxt)
+		if (snd_una != msk->snd_nxt) {
+			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 			return 0;
+		}
+
 		zero_window_probe = true;
 		data_seq = snd_una - 1;
-		avail_size = 1;
-	}
+		copy = 1;
 
-	if (WARN_ON_ONCE(info->sent > info->limit ||
-			 info->limit > dfrag->data_len))
-		return 0;
+		/* all mptcp-level data is acked, no skbs should be present into the
+		 * ssk write queue
+		 */
+		WARN_ON_ONCE(reuse_skb);
+	}
 
-	ret = info->limit - info->sent;
-	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
-			      dfrag->page, dfrag->offset + info->sent, &ret);
-	if (!tail) {
-		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+	copy = min_t(size_t, copy, info->limit - info->sent);
+	if (!sk_wmem_schedule(ssk, copy)) {
+		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 		return -ENOMEM;
 	}
 
-	/* if the tail skb is still the cached one, collapsing really happened.
-	 */
-	if (skb == tail) {
-		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
-		mpext->data_len += ret;
+	if (can_coalesce) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+	} else {
+		get_page(dfrag->page);
+		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+	}
+
+	skb->len += copy;
+	skb->data_len += copy;
+	skb->truesize += copy;
+	sk_wmem_queued_add(ssk, copy);
+	sk_mem_charge(ssk, copy);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+	TCP_SKB_CB(skb)->end_seq += copy;
+	tcp_skb_pcount_set(skb, 0);
+
+	/* on skb reuse we just need to update the DSS len */
+	if (reuse_skb) {
+		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+		mpext->data_len += copy;
 		WARN_ON_ONCE(zero_window_probe);
 		goto out;
 	}
 
-	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
-	if (WARN_ON_ONCE(!mpext)) {
-		/* should never reach here, stream corrupted */
-		return -EINVAL;
-	}
-
 	memset(mpext, 0, sizeof(*mpext));
 	mpext->data_seq = data_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
-	mpext->data_len = ret;
+	mpext->data_len = copy;
 	mpext->use_map = 1;
 	mpext->dsn64 = 1;
 
@@ -1380,18 +1397,18 @@ alloc_skb:
 		 mpext->dsn64);
 
 	if (zero_window_probe) {
-		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
 		mpext->frozen = 1;
 		if (READ_ONCE(msk->csum_enabled))
-			mptcp_update_data_checksum(tail, ret);
+			mptcp_update_data_checksum(skb, copy);
 		tcp_push_pending_frames(ssk);
 		return 0;
 	}
 out:
 	if (READ_ONCE(msk->csum_enabled))
-		mptcp_update_data_checksum(tail, ret);
-	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-	return ret;
+		mptcp_update_data_checksum(skb, copy);
+	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+	return copy;
 }
 
 #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
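
For reference, with the caches gone the two fast-path helpers in include/net/sock.h reduce to plain frees. A condensed view reconstructed from the hunks above (surrounding context trimmed; whitespace may differ from the resulting tree):

```c
/* Transmit side: uncharge the accounting and free the skb;
 * the sk_tx_skb_cache recycling branch is gone.
 */
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
	sk_wmem_queued_add(sk, -skb->truesize);
	sk_mem_uncharge(sk, skb->truesize);
	__kfree_skb(skb);
}

/* Receive side: unlink from the receive queue and free the skb;
 * the sk_rx_skb_cache recycling branch is gone.
 */
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
	__kfree_skb(skb);
}
```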