From 6e06312035032924fc97f2050bfe85e63ca26514 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Mar 2024 13:41:55 +0000 Subject: net: remove skb_free_datagram_locked() Last user of skb_free_datagram_locked() went away in 2016 with commit 850cbaddb52d ("udp: use it's own memory accounting schema"). Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240325134155.620531-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0c7c67b3a87b..b945af8a6208 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4063,12 +4063,6 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len); -static inline void skb_free_datagram_locked(struct sock *sk, - struct sk_buff *skb) -{ - __skb_free_datagram_locked(sk, skb, 0); -} int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); -- cgit v1.2.3-58-ga151 From 6e9b01909a811555ff3326cf80a5847169c57806 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 Mar 2024 21:02:12 -0700 Subject: net: remove gfp_mask from napi_alloc_skb() __napi_alloc_skb() is napi_alloc_skb() with the added flexibility of choosing gfp_mask. This is a NAPI function, so GFP_ATOMIC is implied. The only practical choice the caller has is whether to set __GFP_NOWARN. 
But that's a false choice, too, allocation failures in atomic context will happen, and printing warnings in logs, effectively for a packet drop, is both too much and very likely non-actionable. This leads me to a conclusion that most uses of napi_alloc_skb() are simply misguided, and should use __GFP_NOWARN in the first place. We also have a "standard" way of reporting allocation failures via the queue stat API (qstats::rx-alloc-fail). The direct motivation for this patch is that one of the drivers used at Meta calls napi_alloc_skb() (so prior to this patch without __GFP_NOWARN), and the resulting OOM warning is the top networking warning in our fleet. Reviewed-by: Alexander Lobakin Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240327040213.3153864-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/mm/page_frags.rst | 2 +- Documentation/translations/zh_CN/mm/page_frags.rst | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 +--- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 +-- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 4 +--- drivers/net/ethernet/intel/ice/ice_txrx.c | 3 +-- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 5 ++--- drivers/net/ethernet/intel/igc/igc_main.c | 3 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 3 +-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 ++--- include/linux/skbuff.h | 8 +------- net/core/skbuff.c | 9 ++++----- 13 files changed, 18 insertions(+), 36 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst index a81617e688a8..503ca6cdb804 100644 --- a/Documentation/mm/page_frags.rst +++ b/Documentation/mm/page_frags.rst @@ -25,7 +25,7 @@ to be disabled when executing the fragment allocation. The network stack uses two separate caches per CPU to handle fragment allocation. 
The netdev_alloc_cache is used by callers making use of the netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +used by callers of the __napi_alloc_frag and napi_alloc_skb calls. The main difference between these two calls is the context in which they may be called. The "netdev" prefixed functions are usable in any context as these functions will disable interrupts, while the "napi" prefixed functions are diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst index 20bd3fafdc8c..a5b22486a913 100644 --- a/Documentation/translations/zh_CN/mm/page_frags.rst +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -25,7 +25,7 @@ sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache -被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 +被调用__napi_alloc_frag和napi_alloc_skb的调用者使用。这两个调用的主要区别是 它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708..ac2fcc5ac595 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2144,9 +2144,7 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, */ /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - I40E_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, I40E_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 11500003af0d..a85b425794df 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -301,8 +301,7 @@ static struct 
sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, net_prefetch(xdp->data_meta); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) goto out; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index b71484c87a84..32bb604a1382 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1334,9 +1334,7 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring, net_prefetch(va); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - IAVF_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, IAVF_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 97d41d6ebf1f..8bb743f78fcb 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1051,8 +1051,7 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) } /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 1857220d27fe..aa81d1162b81 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -555,8 +555,7 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) } net_prefetch(xdp->data_meta); - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) 
return NULL; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6dd7a66bb897..f940f650cd78 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3005,8 +3005,7 @@ struct sk_buff *idpf_rx_construct_skb(struct idpf_queue *rxq, /* prefetch first cache line of first page */ net_prefetch(va); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE, - GFP_ATOMIC); + skb = napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE); if (unlikely(!skb)) { idpf_rx_put_page(rx_buf); @@ -3060,7 +3059,7 @@ static struct sk_buff *idpf_rx_hdr_construct_skb(struct idpf_queue *rxq, struct sk_buff *skb; /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rxq->q_vector->napi, size, GFP_ATOMIC); + skb = napi_alloc_skb(&rxq->q_vector->napi, size); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 35ad40a803cb..0cd923c8ac9b 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2712,8 +2712,7 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, net_prefetch(xdp->data_meta); - skb = __napi_alloc_skb(&ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&ring->q_vector->napi, totalsize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index d34d715c59eb..397cb773fabb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -220,8 +220,7 @@ static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, net_prefetch(xdp->data_meta); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = 
napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19..bcdde68a099a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5109,9 +5109,8 @@ static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch, unsigned int datasize = xdp->data_end - xdp->data; struct sk_buff *skb; - skb = __napi_alloc_skb(&ch->rxtx_napi, - xdp->data_end - xdp->data_hard_start, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&ch->rxtx_napi, + xdp->data_end - xdp->data_hard_start); if (unlikely(!skb)) return NULL; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 517e546a120a..b7f1ecdaec38 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3350,13 +3350,7 @@ static inline void *napi_alloc_frag_align(unsigned int fragsz, return __napi_alloc_frag_align(fragsz, -align); } -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask); -static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, - unsigned int length) -{ - return __napi_alloc_skb(napi, length, GFP_ATOMIC); -} +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length); void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17617c29be2d..a1be84be5d35 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -775,10 +775,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * 
Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -787,9 +786,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -860,7 +859,7 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) -- cgit v1.2.3-58-ga151 From 4a96a4e807c390a9d91b450ebe04eeb2e0ecc076 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 29 Mar 2024 17:55:06 +0100 Subject: page_pool: check for PP direct cache locality later Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check whether it's safe to use direct recycling, we can use both globally for each page instead of relying solely on @allow_direct argument. Let's assume that @allow_direct means "I'm sure it's local, don't waste time rechecking this" and when it's false, try the mentioned params to still recycle the page directly. If neither is true, we'll lose some CPU cycles, but then it surely won't be hotpath. On the other hand, paths where it's possible to use direct cache, but not possible to safely set @allow_direct, will benefit from this move. The whole propagation of @napi_safe through a dozen of skb freeing functions can now go away, which saves us some stack space. 
Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20240329165507.3240110-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 12 ++++----- net/core/page_pool.c | 31 +++++++++++++++++++--- net/core/skbuff.c | 70 ++++++++++++++++---------------------------------- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 5 files changed, 58 insertions(+), 59 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b7f1ecdaec38..03ea36a82cdd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3510,25 +3510,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); -bool napi_pp_put_page(struct page *page, bool napi_safe); +bool napi_pp_put_page(struct page *page); static inline void -skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +skb_page_unref(const struct sk_buff *skb, struct page *page) { #ifdef CONFIG_PAGE_POOL - if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + if (skb->pp_recycle && napi_pp_put_page(page)) return; #endif put_page(page); } static inline void -napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) +napi_frag_unref(skb_frag_t *frag, bool recycle) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page, napi_safe)) + if (recycle && napi_pp_put_page(page)) return; #endif put_page(page); @@ -3544,7 +3544,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - napi_frag_unref(frag, recycle, false); + napi_frag_unref(frag, recycle); } /** diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..9d56257e444b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -690,8 +690,7 @@ 
__page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) + if (allow_direct && page_pool_recycle_in_cache(page, pool)) return NULL; /* Page found as candidate for recycling */ @@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), static void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. - * Paired with READ_ONCE() in napi_pp_put_page(). + * Paired with READ_ONCE() in page_pool_napi_local(). 
*/ WRITE_ONCE(pool->cpuid, -1); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a1be84be5d35..2a5ce6667bbb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1004,11 +1004,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(struct page *page) { - bool allow_direct = false; - struct page_pool *pp; - page = compound_head(page); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -1021,39 +1018,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) if (unlikely(!is_pp_page(page))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - unsigned int cpuid = smp_processor_id(); - - allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; - allow_direct |= READ_ONCE(pp->cpuid) == cpuid; - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. 
- */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_page(page->pp, page, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data)); } /** @@ -1095,12 +1071,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -1108,8 +1084,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -1126,13 +1101,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + napi_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1193,12 +1168,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. 
*/ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1212,7 +1186,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1269,7 +1243,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1443,7 +1417,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1470,7 +1444,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1508,7 +1482,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1639,7 +1613,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -2271,9 +2245,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) 
skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -6574,12 +6548,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6714,7 +6688,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d33d12421814..3d647c9a7a21 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(skb, sg_page(sg)); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7371886d4f9f..fe8d53f5a5ee 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(skb, sg_page(sg)); } #ifdef CONFIG_INET6_ESPINTCP -- cgit v1.2.3-58-ga151 From 9f06f87fef689d28588cde8c7ebb00a67da34026 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 3 Apr 2024 13:21:39 -0700 Subject: net: skbuff: generalize the skb->decrypted bit The ->decrypted bit can be reused for other crypto protocols. 
Remove the direct dependency on TLS, add helpers to clean up the ifdefs leaking out everywhere. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 ++++++++++++--- include/net/sock.h | 4 +--- net/Kconfig | 3 +++ net/core/sock.c | 5 ++--- net/ipv4/tcp_input.c | 12 +++--------- net/ipv4/tcp_ipv4.c | 4 +--- net/ipv4/tcp_offload.c | 4 +--- net/tls/Kconfig | 1 + 8 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 03ea36a82cdd..7dfb906d92f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -992,7 +992,7 @@ struct sk_buff { #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif -#ifdef CONFIG_TLS_DEVICE +#ifdef CONFIG_SKB_DECRYPTED __u8 decrypted:1; #endif __u8 slow_gro:1; @@ -1615,17 +1615,26 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { -#ifdef CONFIG_TLS_DEVICE +#ifdef CONFIG_SKB_DECRYPTED return skb2->decrypted - skb1->decrypted; #else return 0; #endif } +static inline bool skb_is_decrypted(const struct sk_buff *skb) +{ +#ifdef CONFIG_SKB_DECRYPTED + return skb->decrypted; +#else + return false; +#endif +} + static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { -#ifdef CONFIG_TLS_DEVICE +#ifdef CONFIG_SKB_DECRYPTED to->decrypted = from->decrypted; #endif } diff --git a/include/net/sock.h b/include/net/sock.h index 2253eefe2848..a495330c5c49 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2835,12 +2835,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); -#ifdef CONFIG_TLS_DEVICE - } else if (unlikely(skb->decrypted)) { + } else if (unlikely(skb_is_decrypted(skb))) { 
pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; -#endif } #endif diff --git a/net/Kconfig b/net/Kconfig index 3e57ccf0da27..d5ab791f7afa 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -60,6 +60,9 @@ config NET_XGRESS config NET_REDIRECT bool +config SKB_DECRYPTED + bool + config SKB_EXTENSIONS bool diff --git a/net/core/sock.c b/net/core/sock.c index 5ed411231fc7..fe9195186c13 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2526,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d44ab5671ea..1f28a2561795 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4805,10 +4805,8 @@ static bool tcp_try_coalesce(struct sock *sk, if (!mptcp_skb_can_collapse(to, from)) return false; -#ifdef CONFIG_TLS_DEVICE - if (from->decrypted != to->decrypted) + if (skb_cmp_decrypted(from, to)) return false; -#endif if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -5377,9 +5375,7 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -#ifdef CONFIG_TLS_DEVICE - nskb->decrypted = skb->decrypted; -#endif + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -5409,10 +5405,8 @@ restart: !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; -#ifdef CONFIG_TLS_DEVICE - if (skb->decrypted != nskb->decrypted) + if (skb_cmp_decrypted(skb, nskb)) goto end; -#endif } } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c 
index 56b75efcfd12..52963c3bb8ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2044,10 +2044,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || -#ifdef CONFIG_TLS_DEVICE - tail->decrypted != skb->decrypted || -#endif !mptcp_skb_can_collapse(tail, skb) || + skb_cmp_decrypted(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index ebe4722bb020..fab0973f995b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -265,9 +265,7 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -#ifdef CONFIG_TLS_DEVICE - flush |= p->decrypted ^ skb->decrypted; -#endif + flush |= skb_cmp_decrypted(p, skb); if (flush || skb_gro_receive(p, skb)) { mss = 1; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 0cdc1f7b6b08..ce8d56a19187 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -20,6 +20,7 @@ config TLS config TLS_DEVICE bool "Transport Layer Security HW offload" depends on TLS + select SKB_DECRYPTED select SOCK_VALIDATE_XMIT select SOCK_RX_QUEUE_MAPPING default n -- cgit v1.2.3-58-ga151 From 959fa5c188bf095558c417554e4772ac1fda3531 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Mon, 8 Apr 2024 08:29:56 -0700 Subject: net: make napi_frag_unref reuse skb_page_unref The implementations of these 2 functions are almost identical. Remove the implementation of napi_frag_unref, and make it a call into skb_page_unref so we don't duplicate the implementation. 
Signed-off-by: Mina Almasry Reviewed-by: Eric Dumazet Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240408153000.2152844-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 12 +++--------- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7dfb906d92f7..c0ff85bb087a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3522,10 +3522,10 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, bool napi_pp_put_page(struct page *page); static inline void -skb_page_unref(const struct sk_buff *skb, struct page *page) +skb_page_unref(struct page *page, bool recycle) { #ifdef CONFIG_PAGE_POOL - if (skb->pp_recycle && napi_pp_put_page(page)) + if (recycle && napi_pp_put_page(page)) return; #endif put_page(page); @@ -3534,13 +3534,7 @@ skb_page_unref(const struct sk_buff *skb, struct page *page) static inline void napi_frag_unref(skb_frag_t *frag, bool recycle) { - struct page *page = skb_frag_page(frag); - -#ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page)) - return; -#endif - put_page(page); + skb_page_unref(skb_frag_page(frag), recycle); } /** diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 3d647c9a7a21..40330253f076 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg)); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index fe8d53f5a5ee..fb431d0a3475 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = 
sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg)); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP -- cgit v1.2.3-58-ga151 From f58f3c9563409e618e591d0d540316286cb0665f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Mon, 8 Apr 2024 08:29:58 -0700 Subject: net: remove napi_frag_unref With the changes in the last patches, napi_frag_unref() is now redundant. Remove it and use skb_page_unref directly. Signed-off-by: Mina Almasry Reviewed-by: Dragos Tatulea Reviewed-by: Eric Dumazet Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240408153000.2152844-4-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 8 +------- net/core/skbuff.c | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0ff85bb087a..7135a3e94afd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3531,12 +3531,6 @@ skb_page_unref(struct page *page, bool recycle) put_page(page); } -static inline void -napi_frag_unref(skb_frag_t *frag, bool recycle) -{ - skb_page_unref(skb_frag_page(frag), recycle); -} - /** * __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment @@ -3547,7 +3541,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - napi_frag_unref(frag, recycle); + skb_page_unref(skb_frag_page(frag), recycle); } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 21cd01641f4c..888874ef8566 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1101,7 +1101,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) -- cgit v1.2.3-58-ga151 From f6d827b180bda01f8805bf5e85307419b0d6f890 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 10 Apr 2024 12:05:01 -0700 Subject: net: move skb ref helpers to new header Add a new header, linux/skbuff_ref.h, which contains all the skb_*_ref() helpers. Many of the consumers of skbuff.h do not actually use any of the skb ref helpers, and we can speed up compilation a bit by minimizing this header file. Additionally in the later patch in the series we add page_pool support to skb_frag_ref(), which requires some page_pool dependencies. 
We can now add these dependencies to skbuff_ref.h instead of a very ubiquitous skbuff.h Signed-off-by: Mina Almasry Link: https://lore.kernel.org/r/20240410190505.1225848-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 1 + drivers/net/ethernet/marvell/sky2.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + drivers/net/ethernet/sun/cassini.c | 1 + drivers/net/veth.c | 1 + drivers/net/xen-netback/netback.c | 1 + include/linux/skbuff.h | 63 ------------------ include/linux/skbuff_ref.h | 75 ++++++++++++++++++++++ net/core/gro.c | 1 + net/core/skbuff.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv6/esp6.c | 1 + net/tls/tls_device.c | 1 + net/tls/tls_device_fallback.c | 1 + net/tls/tls_strp.c | 1 + 16 files changed, 89 insertions(+), 63 deletions(-) create mode 100644 include/linux/skbuff_ref.h (limited to 'include/linux/skbuff.h') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 6482728794dd..e8e460a92e0e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "chcr_ktls.h" static LIST_HEAD(uld_ctx_list); diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 07720841a8d7..f3f7f4cc27b3 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -34,6 +34,7 @@ #include #include #include +#include #include diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index eac49657bd07..8328df8645d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) diff --git 
a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index bfb903506367..8f1f43dbb76d 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/veth.c b/drivers/net/veth.c index bcdfbf61eb66..426e68a95067 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #define DRV_NAME "veth" diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index ef76850d9bcd..48254fc07d64 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -38,6 +38,7 @@ #include #include #include +#include #include diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7135a3e94afd..4072a7ee3859 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3492,73 +3492,10 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag) return netmem_to_page(frag->netmem); } -/** - * __skb_frag_ref - take an addition reference on a paged fragment. - * @frag: the paged fragment - * - * Takes an additional reference on the paged fragment @frag. - */ -static inline void __skb_frag_ref(skb_frag_t *frag) -{ - get_page(skb_frag_page(frag)); -} - -/** - * skb_frag_ref - take an addition reference on a paged fragment of an skb. - * @skb: the buffer - * @f: the fragment offset. - * - * Takes an additional reference on the @f'th paged fragment of @skb. 
- */ -static inline void skb_frag_ref(struct sk_buff *skb, int f) -{ - __skb_frag_ref(&skb_shinfo(skb)->frags[f]); -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); -bool napi_pp_put_page(struct page *page); - -static inline void -skb_page_unref(struct page *page, bool recycle) -{ -#ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page)) - return; -#endif - put_page(page); -} - -/** - * __skb_frag_unref - release a reference on a paged fragment. - * @frag: the paged fragment - * @recycle: recycle the page if allocated via page_pool - * - * Releases a reference on the paged fragment @frag - * or recycles the page via the page_pool API. - */ -static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) -{ - skb_page_unref(skb_frag_page(frag), recycle); -} - -/** - * skb_frag_unref - release a reference on a paged fragment of an skb. - * @skb: the buffer - * @f: the fragment offset - * - * Releases a reference on the @f'th paged fragment of @skb. - */ -static inline void skb_frag_unref(struct sk_buff *skb, int f) -{ - struct skb_shared_info *shinfo = skb_shinfo(skb); - - if (!skb_zcopy_managed(skb)) - __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); -} - /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h new file mode 100644 index 000000000000..11f0a4063403 --- /dev/null +++ b/include/linux/skbuff_ref.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Skb ref helpers. + * + */ + +#ifndef _LINUX_SKBUFF_REF_H +#define _LINUX_SKBUFF_REF_H + +#include + +/** + * __skb_frag_ref - take an addition reference on a paged fragment. + * @frag: the paged fragment + * + * Takes an additional reference on the paged fragment @frag. 
+ */ +static inline void __skb_frag_ref(skb_frag_t *frag) +{ + get_page(skb_frag_page(frag)); +} + +/** + * skb_frag_ref - take an addition reference on a paged fragment of an skb. + * @skb: the buffer + * @f: the fragment offset. + * + * Takes an additional reference on the @f'th paged fragment of @skb. + */ +static inline void skb_frag_ref(struct sk_buff *skb, int f) +{ + __skb_frag_ref(&skb_shinfo(skb)->frags[f]); +} + +bool napi_pp_put_page(struct page *page); + +static inline void +skb_page_unref(struct page *page, bool recycle) +{ +#ifdef CONFIG_PAGE_POOL + if (recycle && napi_pp_put_page(page)) + return; +#endif + put_page(page); +} + +/** + * __skb_frag_unref - release a reference on a paged fragment. + * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool + * + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. + */ +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) +{ + skb_page_unref(skb_frag_page(frag), recycle); +} + +/** + * skb_frag_unref - release a reference on a paged fragment of an skb. + * @skb: the buffer + * @f: the fragment offset + * + * Releases a reference on the @f'th paged fragment of @skb. 
+ */ +static inline void skb_frag_unref(struct sk_buff *skb, int f) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (!skb_zcopy_managed(skb)) + __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); +} + +#endif /* _LINUX_SKBUFF_REF_H */ diff --git a/net/core/gro.c b/net/core/gro.c index 83f35d99a682..2459ab697f7f 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@ #include #include #include +#include #define MAX_GRO_SKBS 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab970ded8a7b..2554a6f5f386 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include #include +#include #include #include #include diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 40330253f076..dff04580318f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -20,6 +20,7 @@ #include #include #include +#include #include diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9282fafc0e61..61119d42b0fd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -44,6 +44,7 @@ #include #include #include +#include #include diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index fb431d0a3475..6bc0a84c8d05 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf8ed36b1ad6..ab6e694f7bc2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "tls.h" #include "trace.h" diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 4e7228f275fa..f9e3d3d90dcf 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "tls.h" diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index ca1e0e198ceb..58c4b06f4f0c 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Tom Herbert */ #include +#include 
#include #include #include -- cgit v1.2.3-58-ga151 From 05d6d492097c55f2d153fc3fd33cbe78e1e28e0a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Apr 2024 13:30:09 +0000 Subject: inet: introduce dst_rtable() helper I added dst_rt6_info() in commit e8dfd42c17fa ("ipv6: introduce dst_rt6_info() helper") This patch does a similar change for IPv4. Instead of (struct rtable *)dst casts, we can use : #define dst_rtable(_ptr) \ container_of_const(_ptr, struct rtable, dst) Patch is smaller than IPv6 one, because IPv4 has skb_rtable() helper. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20240429133009.1227754-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/infiniband/core/addr.c | 12 +++--------- drivers/net/vrf.c | 2 +- drivers/s390/net/qeth_core.h | 5 ++--- include/linux/skbuff.h | 9 --------- include/net/ip.h | 4 ++-- include/net/route.h | 11 +++++++++++ net/atm/clip.c | 2 +- net/core/dst_cache.c | 2 +- net/core/filter.c | 3 +-- net/ipv4/af_inet.c | 2 +- net/ipv4/icmp.c | 26 ++++++++++++++------------ net/ipv4/ip_input.c | 2 +- net/ipv4/ip_output.c | 8 ++++---- net/ipv4/route.c | 24 +++++++++++------------- net/ipv4/udp.c | 2 +- net/ipv4/xfrm4_policy.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/mpls/mpls_iptunnel.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/netfilter/nf_flow_table_ip.c | 4 ++-- net/netfilter/nft_rt.c | 2 +- net/sctp/protocol.c | 4 ++-- net/tipc/udp_media.c | 2 +- 23 files changed, 64 insertions(+), 70 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f20dfe70fa0e..be0743dac3ff 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -348,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst, static bool has_gateway(const struct dst_entry *dst, sa_family_t family) { - const struct rtable *rt; - const struct rt6_info *rt6; + if (family == 
AF_INET) + return dst_rtable(dst)->rt_uses_gateway; - if (family == AF_INET) { - rt = container_of(dst, struct rtable, dst); - return rt->rt_uses_gateway; - } - - rt6 = dst_rt6_info(dst); - return rt6->rt6i_flags & RTF_GATEWAY; + return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY; } static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 784b9b2d275e..3a252ac5dd28 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -860,7 +860,7 @@ static int vrf_rt6_create(struct net_device *dev) static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 5f17a2a5d0e3..41fe8a043d61 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -970,9 +970,8 @@ static inline struct dst_entry *qeth_dst_check_rcu(struct sk_buff *skb, static inline __be32 qeth_next_hop_v4_rcu(struct sk_buff *skb, struct dst_entry *dst) { - struct rtable *rt = (struct rtable *) dst; - - return (rt) ? rt_nexthop(rt, ip_hdr(skb)->daddr) : ip_hdr(skb)->daddr; + return (dst) ? 
rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr) : + ip_hdr(skb)->daddr; } static inline struct in6_addr *qeth_next_hop_v6_rcu(struct sk_buff *skb, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f76825e5b92a..adf75d69770c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1180,15 +1180,6 @@ static inline bool skb_dst_is_noref(const struct sk_buff *skb) return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } -/** - * skb_rtable - Returns the skb &rtable - * @skb: buffer - */ -static inline struct rtable *skb_rtable(const struct sk_buff *skb) -{ - return (struct rtable *)skb_dst(skb); -} - /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. diff --git a/include/net/ip.h b/include/net/ip.h index 25cb688bdc62..6d735e00d3f3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -423,7 +423,7 @@ int ip_decrease_ttl(struct iphdr *iph) static inline int ip_mtu_locked(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; + const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } @@ -461,7 +461,7 @@ static inline bool ip_sk_ignore_df(const struct sock *sk) static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { - const struct rtable *rt = container_of(dst, struct rtable, dst); + const struct rtable *rt = dst_rtable(dst); struct net *net = dev_net(dst->dev); unsigned int mtu; diff --git a/include/net/route.h b/include/net/route.h index 630d1ef6868a..93833cfe9c96 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -75,6 +75,17 @@ struct rtable { rt_pmtu:31; }; +#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) + +/** + * skb_rtable - Returns the skb &rtable + * @skb: buffer + */ +static inline struct rtable *skb_rtable(const struct sk_buff *skb) +{ + return 
dst_rtable(skb_dst(skb)); +} + static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; diff --git a/net/atm/clip.c b/net/atm/clip.c index 362e8d25a79e..42b910cb4e8e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,7 +345,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, dev->stats.tx_dropped++; return NETDEV_TX_OK; } - rt = (struct rtable *) dst; + rt = dst_rtable(dst); if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index b17171345d64..0c0bdb058c5b 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -83,7 +83,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) return NULL; *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); diff --git a/net/core/filter.c b/net/core/filter.c index 6d319c76188b..29165744c505 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2317,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a7cfeda28bb2..486a8d4f53b1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1307,8 +1307,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) int inet_sk_rebuild_header(struct sock *sk) { + struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 437e782b9663..207482d30dc7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -483,6 
+483,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct icmp_bxm *param) { struct net_device *route_lookup_dev; + struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -508,16 +509,17 @@ static struct rtable *icmp_route_lookup(struct net *net, /* No need to clone since we're just using its address. */ rt2 = rt; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(fl4), NULL, 0); - if (!IS_ERR(rt)) { + dst = xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + rt = dst_rtable(dst); + if (!IS_ERR(dst)) { if (rt != rt2) return rt; - } else if (PTR_ERR(rt) == -EPERM) { + } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; - } else + } else { return rt; - + } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; @@ -551,19 +553,19 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4_dec), NULL, - XFRM_LOOKUP_ICMP); - if (!IS_ERR(rt2)) { + dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + rt2 = dst_rtable(dst2); + if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; - } else if (PTR_ERR(rt2) == -EPERM) { + } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { - err = PTR_ERR(rt2); + err = PTR_ERR(dst2); goto relookup_failed; } return rt; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 5e9c8156656a..d6fbcbd2358a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -616,7 +616,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, - ((struct rtable *)dst)->rt_type); + dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_output.c 
b/net/ipv4/ip_output.c index 1fe794967211..b455bd05a7d5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; @@ -475,7 +475,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, goto packet_routed; /* Make sure we can route this packet. */ - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { __be32 daddr; @@ -971,7 +971,7 @@ static int __ip_append_data(struct sock *sk, bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1390,7 +1390,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f89ff2e5a05b..0fd9a3d7ac4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -819,7 +819,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf u32 mark = skb->mark; __u8 tos = iph->tos; - rt = (struct rtable *) dst; + rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); @@ -827,7 +827,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf static struct dst_entry 
*ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct dst_entry *ret = dst; if (rt) { @@ -1044,7 +1044,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); @@ -1115,7 +1115,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); - rt = (struct rtable *)odst; + rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) @@ -1124,7 +1124,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1181,7 +1181,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -1516,10 +1516,8 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - ip_dst_metrics_put(dst); - rt_del_uncached_list(rt); + rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) @@ -2820,7 +2818,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *ort = dst_rtable(dst_orig); struct 
rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); @@ -2865,9 +2863,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; - rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0)); } return rt; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6e2446295089..fe55ff5d379b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1217,7 +1217,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (connected) - rt = (struct rtable *)sk_dst_check(sk, 0); + rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1dda59e0aeab..fccbbd3e1a4b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif, static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rtable *rt = (struct rtable *)xdst->route; + struct rtable *rt = dst_rtable(xdst->route); const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 970af3983d11..19c8cc5289d5 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -459,7 +459,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 606349c8df0e..4385fd3b13be 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -81,7 +81,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; - rt = 
(struct rtable *)dst; + rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 6e8b9d100ad2..3313bceb6cc9 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -318,7 +318,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 100887beed31..c2c005234dcd 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -434,7 +434,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -446,7 +446,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 2434c624aafd..14d88394bcb7 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,7 +73,7 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force 
u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e849f368ed91..5a7436a13b74 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -552,7 +552,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; - struct rtable *rt = (struct rtable *)t->dst; + struct rtable *rt = dst_rtable(t->dst); if (rt) { saddr->v4.sin_family = AF_INET; @@ -1085,7 +1085,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); - udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index f892b0903dba..b849a3d133a0 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -174,7 +174,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { - struct rtable *rt = (struct rtable *)ndst; + struct rtable *rt = dst_rtable(ndst); if (!rt) { struct flowi4 fl = { -- cgit v1.2.3-58-ga151 From a86a0661b86f310c0b73a30c829648864f0b2619 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Apr 2024 13:40:21 +0000 Subject: net: move sysctl_max_skb_frags to net_hotdata sysctl_max_skb_frags is used in TCP and MPTCP fast paths, move it to net_hotdata for better cache locality.
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240429134025.1233626-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 -- include/net/hotdata.h | 1 + net/core/hotdata.c | 1 + net/core/skbuff.c | 5 +---- net/core/sysctl_net_core.c | 2 +- net/ipv4/tcp.c | 3 ++- net/mptcp/protocol.c | 3 ++- 7 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index adf75d69770c..36b133f04d30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -353,8 +353,6 @@ struct sk_buff; #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS -extern int sysctl_max_skb_frags; - /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 003667a1efd6..a6cff6590426 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -38,6 +38,7 @@ struct net_hotdata { int max_backlog; int dev_tx_weight; int dev_rx_weight; + int sysctl_max_skb_frags; }; #define inet_ehash_secret net_hotdata.tcp_protocol.secret diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..f17cbb4807b9 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -18,5 +18,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .max_backlog = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c8b82750000..65779b8f0b12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -109,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); - /* kcm_write_msgs() relies on casting paged frags to bio_vec to use 
* iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. @@ -7040,7 +7037,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 903ab4a51c17..e75375d54b9e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -595,7 +595,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4ec0f4feee00..388f6e115bf1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -280,6 +280,7 @@ #include #include #include +#include #include /* Track pending CMSGs. 
*/ @@ -1188,7 +1189,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= READ_ONCE(sysctl_max_skb_frags)) { + if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4b13ca362efa..aff17597e6a7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -20,6 +20,7 @@ #include #endif #include +#include #include #include #include "protocol.h" @@ -1272,7 +1273,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { + if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } -- cgit v1.2.3-58-ga151