From 5e1abdc3fe56939d9ac34209706b1a527b77b61b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 6 Nov 2018 10:45:36 -0500 Subject: net: skbuff.h: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to use unlikely. Signed-off-by: Yangtao Li Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0ba687454267..7dcfb5591dc3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2508,10 +2508,8 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { - if (unlikely(skb_is_nonlinear(skb))) { - WARN_ON(1); + if (WARN_ON(skb_is_nonlinear(skb))) return; - } skb->len = len; skb_set_tail_pointer(skb, len); } -- cgit v1.2.3-58-ga151 From 5109f9fd6a76116090b34a192d4a957d2ad0621e Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 10 Nov 2018 19:58:34 +0100 Subject: net/skbuff: add macros for VLAN_PRESENT bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap VLAN_PRESENT bit using macro like PKT_TYPE_* and CLONED_*, as used by BPF code. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7dcfb5591dc3..99f38779332c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -816,6 +816,12 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; +#define PKT_VLAN_PRESENT_BIT 4 // CFI (12-th bit) in TCI +#ifdef __BIG_ENDIAN +#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, vlan_tci) +#else +#define PKT_VLAN_PRESENT_OFFSET() (offsetof(struct sk_buff, vlan_tci) + 1) +#endif __be16 vlan_proto; __u16 vlan_tci; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) -- cgit v1.2.3-58-ga151 From 0c4b2d370514cb4f3454dd3b18f031d2651fab73 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 10 Nov 2018 19:58:36 +0100 Subject: net: remove VLAN_TAG_PRESENT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace VLAN_TAG_PRESENT with single bit flag and free up VLAN.CFI overload. Now VLAN.CFI is visible in networking stack and can be passed around intact. Signed-off-by: Michał Mirosław Signed-off-by: David S. 
Miller --- arch/mips/net/bpf_jit.c | 3 --- arch/powerpc/net/bpf_jit_comp.c | 3 --- arch/sparc/net/bpf_jit_comp_32.c | 4 ---- include/linux/if_vlan.h | 11 ++++++----- include/linux/skbuff.h | 16 +++++++++------- lib/test_bpf.c | 14 ++++++++------ net/core/filter.c | 6 ------ 7 files changed, 23 insertions(+), 34 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index de4c6372ad9a..3a0e34f4e615 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1164,9 +1164,6 @@ jmp_cmp: vlan_tci) != 2); off = offsetof(struct sk_buff, vlan_tci); emit_half_load_unsigned(r_A, r_skb, off, ctx); -#ifdef VLAN_TAG_PRESENT - emit_andi(r_A, r_A, (u16)~VLAN_TAG_PRESENT, ctx); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: ctx->flags |= SEEN_SKB | SEEN_A; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index dc4a2f54e829..91d223cf512b 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -383,9 +383,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, vlan_tci)); -#ifdef VLAN_TAG_PRESENT - PPC_ANDI(r_A, r_A, ~VLAN_TAG_PRESENT); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: PPC_LBZ_OFFS(r_A, r_skb, PKT_VLAN_PRESENT_OFFSET()); diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index 48f3c04dd179..84cc8f7f83e9 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -553,10 +553,6 @@ void bpf_jit_compile(struct bpf_prog *fp) break; case BPF_ANC | SKF_AD_VLAN_TAG: emit_skb_load16(vlan_tci, r_A); -#ifdef VLAN_TAG_PRESENT - emit_loadimm(~VLAN_TAG_PRESENT, r_TMP); - emit_and(r_A, r_TMP, r_A); -#endif break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: __emit_skb_load8(__pkt_vlan_present_offset, r_A); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 1be5230921b5..7a541eadf78e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -66,7 +66,6 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ -#define VLAN_TAG_PRESENT VLAN_CFI_MASK #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 @@ -78,8 +77,8 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) @@ -480,7 +479,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_tci = 0; + skb->vlan_present = 0; } /** @@ -492,6 +491,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { + dst->vlan_present = src->vlan_present; dst->vlan_proto = src->vlan_proto; dst->vlan_tci = src->vlan_tci; } @@ -526,7 +526,8 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, 
u16 vlan_tci) { skb->vlan_proto = vlan_proto; - skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci; + skb->vlan_tci = vlan_tci; + skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 99f38779332c..b9aa0d1b21cf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -777,6 +777,14 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_VLAN_PRESENT_BIT 7 +#else +#define PKT_VLAN_PRESENT_BIT 0 +#endif +#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) + __u8 __pkt_vlan_present_offset[0]; + __u8 vlan_present:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; @@ -784,8 +792,8 @@ struct sk_buff { #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif - __u8 ipvs_property:1; + __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV @@ -816,12 +824,6 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; -#define PKT_VLAN_PRESENT_BIT 4 // CFI (12-th bit) in TCI -#ifdef __BIG_ENDIAN -#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, vlan_tci) -#else -#define PKT_VLAN_PRESENT_OFFSET() (offsetof(struct sk_buff, vlan_tci) + 1) -#endif __be16 vlan_proto; __u16 vlan_tci; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index aa22bcaec1dc..f3e570722a7e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -39,6 +39,7 @@ #define SKB_HASH 0x1234aaab #define SKB_QUEUE_MAP 123 #define SKB_VLAN_TCI 0xffff +#define SKB_VLAN_PRESENT 1 #define SKB_DEV_IFINDEX 577 #define SKB_DEV_TYPE 588 @@ -725,8 +726,8 @@ static struct bpf_test tests[] = { CLASSIC, { }, { - { 1, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT }, - { 10, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT } + { 1, SKB_VLAN_TCI }, + { 10, SKB_VLAN_TCI } }, }, { @@ -739,8 +740,8 @@ static struct bpf_test tests[] = { CLASSIC, { }, { - { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, - { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } + { 1, SKB_VLAN_PRESENT }, + { 10, SKB_VLAN_PRESENT } }, }, { @@ -5289,8 +5290,8 @@ static struct bpf_test tests[] = { #endif { }, { - { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, - { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } + { 1, SKB_VLAN_PRESENT }, + { 10, SKB_VLAN_PRESENT } }, .fill_helper = bpf_fill_maxinsns6, .expected_errcode = -ENOTSUPP, @@ -6493,6 +6494,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_present = SKB_VLAN_PRESENT; skb->vlan_proto = htons(ETH_P_IP); dev_net_set(&dev, &init_net); skb->dev = &dev; diff --git a/net/core/filter.c b/net/core/filter.c index c151b906df53..10acbc00ff6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -301,9 +301,6 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); -#endif break; case SKF_AD_VLAN_TAG_PRESENT: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); @@ -6152,9 +6149,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); -#ifdef VLAN_TAG_PRESENT - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); -#endif 
break; case offsetof(struct __sk_buff, cb[0]) ... -- cgit v1.2.3-58-ga151 From 7f600f14dfac4ba4aee6283a415cdad2925d7791 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Nov 2018 18:05:24 -0800 Subject: net: remove unused skb_send_sock() Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - net/core/skbuff.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b9aa0d1b21cf..a2e8297a5b00 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3335,7 +3335,6 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f95ab41c9fb9..a1be7f19d998 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2364,19 +2364,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.3-58-ga151 From 4bffc669d6248d655aeb985a0e51bfaaf21c8b40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 25 Nov 2018 08:26:23 -0800 Subject: net: remove unsafe skb_insert() I do not see how one can effectively use skb_insert() without holding some kind of lock. Otherwise other cpus could have changed the list right before we have a chance of acquiring list->lock. Only existing user is in drivers/infiniband/hw/nes/nes_mgt.c and this one probably meant to use __skb_insert() since it appears nesqp->pau_list is protected by nesqp->pau_lock. This looks like nesqp->pau_lock could be removed, since nesqp->pau_list.lock could be used instead. Signed-off-by: Eric Dumazet Cc: Faisal Latif Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-rdma Signed-off-by: David S. 
Miller
---
 drivers/infiniband/hw/nes/nes_mgt.c | 4 ++--
 include/linux/skbuff.h | 2 --
 net/core/skbuff.c | 22 ----------------------
 3 files changed, 2 insertions(+), 26 deletions(-)
(limited to 'include/linux/skbuff.h')

diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
index fc0c191014e9..cc4dce5c3e5f 100644
--- a/drivers/infiniband/hw/nes/nes_mgt.c
+++ b/drivers/infiniband/hw/nes/nes_mgt.c
@@ -551,14 +551,14 @@ static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct ne
 	/* Queue skb by sequence number */
 	if (skb_queue_len(&nesqp->pau_list) == 0) {
-		skb_queue_head(&nesqp->pau_list, skb);
+		__skb_queue_head(&nesqp->pau_list, skb);
 	} else {
 		skb_queue_walk(&nesqp->pau_list, tmpskb) {
 			cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
 			if (before(seqnum, cb->seqnum))
 				break;
 		}
-		skb_insert(tmpskb, skb, &nesqp->pau_list);
+		__skb_insert(skb, tmpskb->prev, tmpskb, &nesqp->pau_list);
 	}
 	if (nesqp->pau_state == PAU_READY)
 		process_it = true;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f17a7452ac7b..73902acf2b71 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1749,8 +1749,6 @@ static inline void skb_queue_head_init_class(struct sk_buff_head *list,
 *	The "__skb_xxxx()" functions are the non-atomic ones that
 *	can only be called with interrupts disabled.
 */
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk,
-		struct sk_buff_head *list);
 static inline void __skb_insert(struct sk_buff *newsk,
 				struct sk_buff *prev, struct sk_buff *next,
 				struct sk_buff_head *list)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9a8a72cefe9b..02cd7ae3d0fb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2990,28 +2990,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
 }
 EXPORT_SYMBOL(skb_append);
 
-/**
- *	skb_insert	-	insert a buffer
- *	@old: buffer to insert before
- *	@newsk: buffer to insert
- *	@list: list to use
- *
- *	Place a packet before a given packet in a list. The list locks are
- *	taken and this function is atomic with respect to other list locked
- *	calls.
- *
- *	A buffer cannot be placed on two lists at the same time.
- */
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&list->lock, flags);
-	__skb_insert(newsk, old->prev, old, list);
-	spin_unlock_irqrestore(&list->lock, flags);
-}
-EXPORT_SYMBOL(skb_insert);
-
 static inline void skb_split_inside_header(struct sk_buff *skb,
 					   struct sk_buff *skb1,
 					   const u32 len, const int pos)
-- cgit v1.2.3-58-ga151

From b5947e5d1e710c35ea281247bd27e6975250285c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 30 Nov 2018 15:32:39 -0500
Subject: udp: msg_zerocopy

Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and interpret flag MSG_ZEROCOPY.

This patch was previously part of the zerocopy RFC patchsets. Zerocopy is not effective at small MTU. With segmentation offload building larger datagrams, the benefit of page flipping outweighs the cost of generating a completion notification.
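For reference, the userspace side is the same API already used for TCP zerocopy (see Documentation/networking/msg_zerocopy.rst). A minimal sketch for a connected UDP socket, with a hypothetical helper name and deliberately simplified completion handling, might look like:

	#include <errno.h>
	#include <sys/socket.h>

	/* illustrative only: enable SO_ZEROCOPY, send one datagram with
	 * MSG_ZEROCOPY, then reap the completion from the error queue */
	static int send_one_zerocopy(int fd, const void *buf, size_t len)
	{
		char control[128];
		struct msghdr msg = { .msg_control = control,
				      .msg_controllen = sizeof(control) };
		int one = 1;

		if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
			return -errno;
		if (send(fd, buf, len, MSG_ZEROCOPY) < 0)
			return -errno;
		/* buf must not be reused until the kernel posts the
		 * completion (a cmsg of origin SO_EE_ORIGIN_ZEROCOPY);
		 * real code would poll() for POLLERR rather than spin */
		while (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0 && errno == EAGAIN)
			;
		return 0;
	}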
tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on test patch and making skb_orphan_frags_rx same as skb_orphan_frags: ipv4 udp -t 1 tx=191312 (11938 MB) txc=0 zc=n rx=191312 (11938 MB) ipv4 udp -z -t 1 tx=304507 (19002 MB) txc=304507 zc=y rx=304507 (19002 MB) ok ipv6 udp -t 1 tx=174485 (10888 MB) txc=0 zc=n rx=174485 (10888 MB) ipv6 udp -z -t 1 tx=294801 (18396 MB) txc=294801 zc=y rx=294801 (18396 MB) ok Changes v1 -> v2 - Fixup reverse christmas tree violation v2 -> v3 - Split refcount avoidance optimization into separate patch - Fix refcount leak on error in fragmented case (thanks to Paolo Abeni for pointing this one out!) - Fix refcount inc on zero - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data. This is needed since commit 5cf4a8532c99 ("tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp. Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 6 ++++++ net/core/sock.c | 5 ++++- net/ipv4/ip_output.c | 23 ++++++++++++++++++++++- net/ipv6/ip6_output.c | 23 ++++++++++++++++++++++- 5 files changed, 55 insertions(+), 3 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 73902acf2b71..04f52e719571 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3c814565ed7c..1350901c5cb8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) diff --git a/net/core/sock.c b/net/core/sock.c index 6d7e189e3cd9..f5bb89785e47 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1018,7 +1018,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5dbec21856f4..6f843aff628c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + if 
(rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + cork->length += length; /* So, what's going on in the loop below? @@ -1006,6 +1020,7 @@ alloc_new_skb: cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes. @@ -1068,7 +1083,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1098,6 +1113,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1105,11 +1124,13 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 827a3f5ff3bb..7df04d20a91f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk, { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1322,6 +1323,19 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1445,6 +1459,7 @@ alloc_new_skb: cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes @@ -1506,7 +1521,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1536,6 +1551,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1543,11 +1562,13 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3-58-ga151 From 52900d22288e7d45846037e1db277c665bbc40db Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:40 -0500 Subject: udp: elide zerocopy operation in hot path With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info. Release of its last reference triggers a completion notification. 
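The fix described below donates the caller's initial reference to the first skb rather than dropping it separately at the end. Stripped of kernel types, the idea is the following (a simplified, non-kernel sketch with made-up names):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct uref { atomic_int refcnt; };	/* stands in for ubuf_info */

	/* attach obj to one more consumer; if the caller still owns the
	 * initial reference (*have_ref), hand it over instead of taking
	 * a new one -- saving an atomic increment per attach */
	static void attach_ref(struct uref *obj, bool *have_ref)
	{
		if (have_ref && *have_ref)
			*have_ref = false;	/* reference donated */
		else
			atomic_fetch_add(&obj->refcnt, 1);
	}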
The TCP stack in tcp_sendmsg_locked holds an extra ref independent of the skbs, because it can build, send and free skbs within its loop, possibly reaching refcount zero and freeing the ubuf_info too soon. The UDP stack currently also takes this extra ref, but does not need it as all skbs are sent after return from __ip(6)_append_data. Avoid the extra refcount_inc and refcount_dec_and_test, and generally the sock_zerocopy_put in the common path, by passing the initial reference to the first skb. This approach is taken instead of initializing the refcount to 0, as that would generate error "refcount_t: increment on 0" on the next skb_zcopy_set. Changes v3 -> v4 - Move skb_zcopy_set below the only kfree_skb that might cause a premature uarg destroy before skb_zerocopy_put_abort - Move the entire skb_shinfo assignment block, to keep that cacheline access in one place Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++---- net/core/skbuff.c | 9 +++++---- net/ipv4/ip_output.c | 22 +++++++++++----------- net/ipv4/tcp.c | 2 +- net/ipv6/ip6_output.c | 22 +++++++++++----------- 5 files changed, 36 insertions(+), 31 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04f52e719571..75d50ab7997c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -481,7 +481,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg) } void sock_zerocopy_put(struct ubuf_info *uarg); -void sock_zerocopy_put_abort(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); @@ -1326,10 +1326,14 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? 
skb_uarg(skb) : NULL; } -static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, + bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { - sock_zerocopy_get(uarg); + if (unlikely(have_ref && *have_ref)) + *have_ref = false; + else + sock_zerocopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; } @@ -1374,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb) struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, false); skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1350901c5cb8..c78ce114537e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1137,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1157,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6f843aff628c..78f028bdad30 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -881,8 +881,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -921,12 +921,13 @@ static int __ip_append_data(struct sock *sk, uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; + extra_uref = true; if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, &extra_uref); } } @@ -1015,13 +1016,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - skb_zcopy_set(skb, uarg); - /* * Find where to start putting bytes. 
*/ @@ -1054,6 +1048,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1124,13 +1125,12 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); - sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 215e4d3b3616..dc68c408bba0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7df04d20a91f..ec8c235ea891 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1327,12 +1327,13 @@ emsgsize: uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; + extra_uref = true; if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, &extra_uref); } } @@ -1454,13 +1455,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - skb_zcopy_set(skb, uarg); - /* * Find where to start putting bytes */ @@ -1492,6 +1486,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1562,13 +1563,12 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); - sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3-58-ga151 From 875e8939953483d856de226b72d14c6a000f9457 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Dec 2018 08:15:10 +0000 Subject: skbuff: Rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark' Commit abf4bb6b63d0 ("skbuff: Add the offload_mr_fwd_mark field") added the 'offload_mr_fwd_mark' field to indicate that a packet has already undergone L3 multicast routing by a capable device. 
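The consumer of the mark compares switchdev parent IDs before suppressing software forwarding; condensed from ipmr_forward_offloaded() in the diff below, the pattern is roughly this (the helper name is hypothetical, and the skb bit assumes CONFIG_NET_SWITCHDEV):

	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	/* true if the ASIC that received the packet already L3-forwarded
	 * it and also owns the egress port, so software must not forward
	 * the packet a second time */
	static bool l3_fwd_offloaded(const struct sk_buff *skb,
				     struct netdev_phys_item_id *in_id,
				     struct netdev_phys_item_id *out_id)
	{
		if (!skb->offload_l3_fwd_mark)
			return false;
		if (!in_id->id_len || !out_id->id_len)
			return false;
		return netdev_phys_item_id_same(out_id, in_id);
	}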
The field is used to prevent the kernel from forwarding a packet through a netdev through which the device has already forwarded the packet. Currently, no unicast packet is routed by both the device and the kernel, but this is about to change in subsequent patches and we need to be able to mark such packets, so that they will not be forwarded twice.

Instead of adding yet another field to 'struct sk_buff', we can just rename 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark', as a packet either has a multicast or a unicast destination IP.

While at it, add a comment about both 'offload_fwd_mark' and 'offload_l3_fwd_mark'.

Signed-off-by: Ido Schimmel
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++-----
 include/linux/skbuff.h | 4 +++-
 net/core/skbuff.c | 2 +-
 net/ipv4/ipmr.c | 2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)
(limited to 'include/linux/skbuff.h')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index c293ff1eed63..920085fbbf2a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3554,10 +3554,10 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
 	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
 }
 
-static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb,
+static void mlxsw_sp_rx_listener_l3_mark_func(struct sk_buff *skb,
 					      u8 local_port, void *priv)
 {
-	skb->offload_mr_fwd_mark = 1;
+	skb->offload_l3_fwd_mark = 1;
 	skb->offload_fwd_mark = 1;
 	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
 }
@@ -3605,8 +3605,8 @@ out:
 	MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action,	\
 		  _is_ctrl, SP_##_trap_group, DISCARD)
 
-#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
-	MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \
+#define MLXSW_SP_RXL_L3_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
+	MLXSW_RXL(mlxsw_sp_rx_listener_l3_mark_func, _trap_id, _action, \
 		  _is_ctrl, SP_##_trap_group, DISCARD)
 
 #define MLXSW_SP_EVENTL(_func, _trap_id) \
@@ -3683,7 +3683,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
 	MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false),
 	MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false),
 	MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
-	MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
+	MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
 	/* NVE traps */
 	MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, ARP, false),
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 75d50ab7997c..b1831a5ca173 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -616,6 +616,8 @@ typedef unsigned char *sk_buff_data_t;
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ipvs_property: skbuff is owned by ipvs
+ *	@offload_fwd_mark: Packet was L2-forwarded in hardware
+ *	@offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *	@tc_skip_classify: do not classify packet.
set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @tc_redirected: packet was redirected by a tc action @@ -799,7 +801,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; - __u8 offload_mr_fwd_mark:1; + __u8 offload_l3_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c78ce114537e..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..5cbc749a50aa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; -- cgit v1.2.3-58-ga151 From 65d69e2505bb64f6a8d7f417f6e46e2a351174c6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 3 Dec 2018 17:52:10 -0800 Subject: datagram: introduce skb_copy_and_hash_datagram_iter helper Introduce a helper to copy datagram into an iovec iterator but also update a predefined hash. This is useful for consumers of skb_copy_datagram_iter to also support inflight data digest without having to finish to copy and only then traverse the iovec and calculate the digest hash. Acked-by: David S. Miller Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/skbuff.h | 3 +++ net/core/datagram.c | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0d1b2c3f127b..b96c809c29eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3325,6 +3325,9 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); diff --git a/net/core/datagram.c b/net/core/datagram.c index 382543302ae5..ef262282c8be 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -465,7 +465,7 @@ int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; if (__skb_datagram_iter(frag_iter, offset - start, - to, copy, short_copy, cb, data)) + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -492,6 +492,24 @@ short_copy: return 0; } +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. 
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @hash: hash request to update
+ */
+int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len,
+			   struct ahash_request *hash)
+{
+	return __skb_datagram_iter(skb, offset, to, len, true,
+			hash_and_copy_to_iter, hash);
+}
+EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+
 static size_t simple_copy_to_iter(const void *addr, size_t bytes,
 		void *data __always_unused, struct iov_iter *i)
 {
-- cgit v1.2.3-58-ga151

From df5042f4c5b9326c593bf2e31ed859ebc3b4130a Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 18 Dec 2018 17:15:16 +0100
Subject: sk_buff: add skb extension infrastructure

This adds an optional extension infrastructure, with ipsec (xfrm) and bridge netfilter as first users. objdiff shows no changes if kernel is built without xfrm and br_netfilter support.

The third (planned future) user is Multipath TCP which is still out-of-tree. MPTCP needs to map logical mptcp sequence numbers to the tcp sequence numbers used by individual subflows. This DSS mapping is read/written from tcp option space on receive and written to tcp option space on transmitted tcp packets that are part of an MPTCP connection.

Extending skb_shared_info or adding a private data field to skb fclones doesn't work for incoming skb, so a different DSS propagation method would be required for the receive side.

mptcp has the same requirements as secpath/bridge netfilter:
1. extension memory is released when the sk_buff is free'd.
2. data is shared after cloning an skb (clone inherits extension)
3. adding extension to an skb will COW the extension buffer if needed.

The "MPTCP upstreaming" effort adds SKB_EXT_MPTCP extension to store the mapping for tx and rx processing.

Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions are available for this skb. This has two purposes.
   a) avoids the need to initialize the pointer.
   b) allows to "delete" an extension by clearing its bit value in ->active_extensions.

   While it would be possible to store the active_extensions byte in the extension struct instead of sk_buff, there is one problem with this: When an extension has to be disabled, we can always clear the bit in skb->active_extensions. But in case it would be stored in the extension buffer itself, we might have to COW it first, if we are dealing with a cloned skb. On kmalloc failure we would be unable to turn an extension off.

2. extension pointer, located at the end of the sk_buff. If the active_extensions byte is 0, the pointer is undefined, it is not initialized on skb allocation.

This adds extra code to skb clone and free paths (to deal with refcount/free of extension area) but this replaces similar code that manages skb->nf_bridge and skb->sp structs in the followup patches of the series.

It is possible to add support for extensions that are not preserved on clones/copies. To do this, it would be needed to define a bitmask of all extensions that need copy/cow semantics, and change __skb_ext_copy() to check ->active_extensions & SKB_EXT_PRESERVE_ON_CLONE, then just set ->active_extensions to 0 on the new clone. This isn't done here because all extensions that get added here need the copy/cow semantics.

v2: Allocate entire extension space using kmem_cache.
Upside is that this allows better tracking of used memory, downside is that we will allocate more space than strictly needed in most cases (it's unlikely that all extensions are active/needed at same time for same skb). The allocated memory (except the small extension header) is not cleared, so no additional overhead aside from memory usage.

Avoid atomic_dec_and_test operation on skb_ext_put() by using a similar trick as kfree_skbmem() does with fclone_ref: If refcount is 1, there is no concurrent user and we can free right away.

Signed-off-by: Florian Westphal
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h | 111 ++++++++++++++++++++++++++++++++++-
 net/Kconfig | 3 +
 net/core/skbuff.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_output.c | 1 +
 net/ipv6/ip6_output.c | 1 +
 5 files changed, 270 insertions(+), 1 deletion(-)
(limited to 'include/linux/skbuff.h')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b1831a5ca173..88f7541837e3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -245,6 +245,7 @@ struct iov_iter;
 struct napi_struct;
 struct bpf_prog;
 union bpf_attr;
+struct skb_ext;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -636,6 +637,7 @@ typedef unsigned char *sk_buff_data_t;
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@xmit_more: More SKBs are pending for this queue
 *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ *	@active_extensions: active extensions (skb_ext_id types)
 *	@ndisc_nodetype: router type (from link layer)
 *	@ooo_okay: allow the mapping of a socket to a queue to be changed
 *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -665,6 +667,7 @@ typedef unsigned char *sk_buff_data_t;
 *	@data: Data head pointer
 *	@truesize: Buffer size
 *	@users: User count - see {datagram,tcp}.c
+ *	@extensions: allocated extensions, valid if active_extensions is nonzero
 */
 
 struct sk_buff {
@@ -747,7 +750,9 @@ struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				pfmemalloc:1;
-
+#ifdef CONFIG_SKB_EXTENSIONS
+	__u8			active_extensions;
+#endif
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
 	 */
@@ -869,6 +874,11 @@ struct sk_buff {
 				*data;
 	unsigned int		truesize;
 	refcount_t		users;
+
+#ifdef CONFIG_SKB_EXTENSIONS
+	/* only useable after checking ->active_extensions != 0 */
+	struct skb_ext		*extensions;
+#endif
 };
 
 #ifdef __KERNEL__
@@ -3896,6 +3906,105 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
 		atomic_inc(&nfct->use);
 }
 #endif
+
+#ifdef CONFIG_SKB_EXTENSIONS
+enum skb_ext_id {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	SKB_EXT_BRIDGE_NF,
+#endif
+	SKB_EXT_NUM, /* must be last */
+};
+
+/**
+ * struct skb_ext - sk_buff extensions
+ * @refcnt: 1 on allocation, deallocated on 0
+ * @offset: offset to add to @data to obtain extension address
+ * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
+ * @data: start of extension data, variable sized
+ *
+ * Note: offsets/lengths are stored in chunks of 8 bytes, this allows
+ * to use 'u8' types while allowing up to 2kb worth of extension data.
+ */ +struct skb_ext { + refcount_t refcnt; + u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ + u8 chunks; /* same */ + char data[0] __aligned(8); +}; + +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_put(struct skb_ext *ext); + +static inline void skb_ext_put(struct sk_buff *skb) +{ + if (skb->active_extensions) + __skb_ext_put(skb->extensions); +} + +static inline void skb_ext_get(struct sk_buff *skb) +{ + if (skb->active_extensions) { + struct skb_ext *ext = skb->extensions; + + if (ext) + refcount_inc(&ext->refcnt); + } +} + +static inline void __skb_ext_copy(struct sk_buff *dst, + const struct sk_buff *src) +{ + dst->active_extensions = src->active_extensions; + + if (src->active_extensions) { + struct skb_ext *ext = src->extensions; + + refcount_inc(&ext->refcnt); + dst->extensions = ext; + } +} + +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) +{ + skb_ext_put(dst); + __skb_ext_copy(dst, src); +} + +static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) +{ + return !!ext->offset[i]; +} + +static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) +{ + return skb->active_extensions & (1 << id); +} + +static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) + __skb_ext_del(skb, id); +} + +static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) { + struct skb_ext *ext = skb->extensions; + + return (void *)ext + (ext->offset[id] << 3); + } + + return NULL; +} +#else +static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_get(struct sk_buff *skb) {} +static inline void skb_ext_del(struct sk_buff *skb, int unused) {} +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} +#endif /* CONFIG_SKB_EXTENSIONS */ + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { diff --git a/net/Kconfig b/net/Kconfig index f235edb593ba..93b291292860 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,6 +51,9 @@ config NET_INGRESS config NET_EGRESS bool +config SKB_EXTENSIONS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 40552547c69a..d2dfad33e686 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -617,6 +620,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. 
*/ @@ -796,6 +800,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); + __skb_ext_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -3902,6 +3907,40 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3916,6 +3955,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -5554,3 +5594,118 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. 
+ */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old); + if (!new) + return NULL; + + if (__skb_ext_exist(old, id)) { + if (old != new) + skb->extensions = new; + goto set_active; + } + + newoff = old->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; + skb->extensions = new; +set_active: + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ab6618036afe..c80188875f39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d55ee33b7f9..703a8e801c5c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -581,6 +581,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); skb_copy_secmark(to, from); } -- cgit v1.2.3-58-ga151 From de8bda1d22d38b7d5cd08b33f86efd94d4c86630 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:17 +0100 Subject: net: convert bridge_nf to use skb extension infrastructure This converts the bridge netfilter (calling iptables hooks from bridge) facility to use the extension infrastructure. The bridge_nf specific hooks in skb clone and free paths are removed, they have been replaced by the skb_ext hooks that do the same as the bridge nf allocations hooks did. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/netfilter_bridge.h | 4 ++-- include/linux/skbuff.h | 28 ++-------------------------- include/net/netfilter/br_netfilter.h | 8 ++++---- net/Kconfig | 1 + net/bridge/br_netfilter_hooks.c | 20 ++------------------ net/bridge/br_netfilter_ipv6.c | 4 ++-- net/core/skbuff.c | 3 --- 7 files changed, 13 insertions(+), 55 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 0a65a422587c..5f2614d02e03 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -20,12 +20,12 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb) static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { - return skb->nf_bridge; + return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { - return skb->nf_bridge != NULL; + return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 88f7541837e3..2f42d2e99f17 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -255,7 +255,6 @@ struct nf_conntrack { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { - refcount_t use; enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, @@ -720,9 +719,6 @@ struct sk_buff { #endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; @@ -4005,18 +4001,6 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge && refcount_dec_and_test(&nf_bridge->use)) - kfree(nf_bridge); -} -static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge) - refcount_inc(&nf_bridge->use); -} -#endif /* CONFIG_BRIDGE_NETFILTER */ static inline void nf_reset(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -4024,8 +4008,7 @@ static inline void nf_reset(struct sk_buff *skb) skb->_nfct = 0; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); #endif } @@ -4043,7 +4026,7 @@ static inline void ipvs_reset(struct sk_buff *skb) #endif } -/* Note: This doesn't put any conntrack and bridge info in dst. */ +/* Note: This doesn't put any conntrack info in dst. 
*/ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { @@ -4051,10 +4034,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - dst->nf_bridge = src->nf_bridge; - nf_bridge_get(src->nf_bridge); -#endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; @@ -4065,9 +4044,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src, true); } diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 6efc0153987b..4cd56808ac4e 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -6,12 +6,12 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) { - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); + struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF); - if (likely(skb->nf_bridge)) - refcount_set(&(skb->nf_bridge->use), 1); + if (b) + memset(b, 0, sizeof(*b)); - return skb->nf_bridge; + return b; } void nf_bridge_update_protocol(struct sk_buff *skb); diff --git a/net/Kconfig b/net/Kconfig index 93b291292860..5cb9de1aaf88 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -187,6 +187,7 @@ config BRIDGE_NETFILTER depends on NETFILTER && INET depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE + select SKB_EXTENSIONS default m ---help--- Enabling this option will let arptables resp. 
iptables see bridged diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c58cf68b45c5..d21a23698410 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -132,10 +132,7 @@ static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); static void nf_bridge_info_free(struct sk_buff *skb) { - if (skb->nf_bridge) { - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; - } + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) @@ -148,19 +145,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (refcount_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - refcount_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; + return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) @@ -508,7 +493,6 @@ static unsigned int br_nf_pre_routing(void *priv, if (br_validate_ipv4(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) return NF_DROP; if (!setup_pre_routing(skb)) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 96c072e71ea2..94039f588f1d 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,8 +224,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, if (br_validate_ipv6(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) + nf_bridge = nf_bridge_alloc(skb); + if (!nf_bridge) return NF_DROP; if (!setup_pre_routing(skb)) return NF_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d2dfad33e686..0c65723591d7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -616,9 +616,6 @@ void skb_release_head_state(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); #endif skb_ext_put(skb); } -- cgit v1.2.3-58-ga151 From 7af8f4ca314a592e2ba49cb5ea1de1325974998e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:19 +0100 Subject: net: move secpath_exist helper to sk_buff.h Future patch will remove skb->sp pointer. To reduce noise in those patches, move existing helper to sk_buff and use it in more places to ease skb->sp replacement later. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 13 ++++++++++--- include/net/xfrm.h | 9 --------- net/netfilter/nft_meta.c | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2f42d2e99f17..70ac58240ec0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4066,12 +4066,19 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif +static inline int secpath_exists(const struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + return skb->sp != NULL; +#else + return 0; +#endif +} + static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && -#if IS_ENABLED(CONFIG_XFRM) - !skb->sp && -#endif + !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 62ca62177bc6..9cb506d09b98 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1104,15 +1104,6 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; -static inline int secpath_exists(struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - return skb->sp != NULL; -#else - return 0; -#endif -} - static inline struct sec_path * secpath_get(struct sec_path *sp) { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6180626c3f80..6df486c5ebd3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -229,7 +229,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, } #ifdef CONFIG_XFRM case NFT_META_SECPATH: - nft_reg_store8(dest, !!skb->sp); + nft_reg_store8(dest, secpath_exists(skb)); break; #endif #ifdef CONFIG_NF_TABLES_BRIDGE -- cgit v1.2.3-58-ga151 From 2294be0f11e22b6197d025e5d3ab42888879ec4e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:20 +0100 Subject: net: use skb_sec_path helper in more places skb_sec_path gains 'const' qualifier to avoid xt_policy.c: 'skb_sec_path' discards 'const' qualifier from pointer target type same reasoning as previous conversions: Won't need to touch these spots anymore when skb->sp is removed. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- include/net/xfrm.h | 6 ++++-- net/ipv4/esp4.c | 9 ++++++--- net/ipv4/esp4_offload.c | 4 +++- net/ipv6/esp6.c | 9 ++++++--- net/ipv6/esp6_offload.c | 4 +++- net/ipv6/xfrm6_input.c | 2 +- net/netfilter/nft_xfrm.c | 2 +- net/netfilter/xt_policy.c | 2 +- net/xfrm/xfrm_device.c | 4 +++- net/xfrm/xfrm_input.c | 16 ++++++++++------ net/xfrm/xfrm_policy.c | 19 +++++++++++-------- security/selinux/xfrm.c | 4 ++-- 13 files changed, 52 insertions(+), 31 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 70ac58240ec0..d0f254a016bf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4124,7 +4124,7 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) return skb->dst_pending_confirm != 0; } -static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb->sp; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9cb506d09b98..af723448c972 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1896,14 +1896,16 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n) #ifdef CONFIG_XFRM static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { - return skb->sp->xvec[skb->sp->len - 1]; + struct sec_path *sp = skb_sec_path(skb); + + return sp->xvec[sp->len - 1]; } #endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { #ifdef CONFIG_XFRM - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); if (!sp || !sp->olen || sp->len != sp->olen) return NULL; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9e1c840596c5..5459f41fc26f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 19bd22aa05f9..8756e0e790d2 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -115,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -122,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 63b2b66f9dfa..5afe9f83374d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -145,10 +145,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv6/esp6_offload.c 
b/net/ipv6/esp6_offload.c index 01a97f5dfa4e..d46b4eb645c2 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -142,6 +142,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -149,7 +150,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 97c69df1b329..a52cb3fc6df5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -147,7 +147,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, goto drop; } - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; spin_lock(&x->lock); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 5322609f7662..b08865ec5ed3 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -161,7 +161,7 @@ static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - const struct sec_path *sp = pkt->skb->sp; + const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 13f8ccf946d6..aa84e8121c93 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -56,7 +56,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - const struct sec_path *sp = skb->sp; + const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 144c137886b1..b8736f56e7f7 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -32,6 +32,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return skb; @@ -39,7 +40,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bda929b9ff35..b4db25b244fa 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -330,7 +330,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { - if (skb->sp->len == XFRM_MAX_DEPTH) { + sp = skb_sec_path(skb); + + if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; @@ -346,7 +348,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->mark = xfrm_smark_get(skb->mark, x); - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; lock: spin_lock(&x->lock); @@ -470,8 +472,9 @@ resume: nf_reset(skb); if (decaps) 
{ - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -482,8 +485,9 @@ resume: err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index be04091eb7db..d6acba07bdc9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3200,11 +3200,12 @@ EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { + struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; - if (!skb->sp || idx < 0 || idx >= skb->sp->len) + if (!sp || idx < 0 || idx >= sp->len) return 0; - x = skb->sp->xvec[idx]; + x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); @@ -3304,6 +3305,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; + struct sec_path *sp; struct xfrm_if *xi; u32 if_id = 0; @@ -3328,11 +3330,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ - if (skb->sp) { + sp = skb_sec_path(skb); + if (sp) { int i; - for (i = skb->sp->len-1; i >= 0; i--) { - struct xfrm_state *x = skb->sp->xvec[i]; + for (i = sp->len - 1; i >= 0; i--) { + struct xfrm_state *x = sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; @@ -3359,7 +3362,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { - if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; @@ -3388,7 +3391,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, #endif if (pol->action == XFRM_POLICY_ALLOW) { - struct sec_path *sp; static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; @@ -3396,7 +3398,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int ti = 0; int i, k; - if ((sp = skb->sp) == NULL) + sp = skb_sec_path(skb); + if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 91dc3783ed94..bd7d18bdb147 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -230,7 +230,7 @@ static int selinux_xfrm_skb_sid_ingress(struct sk_buff *skb, u32 *sid, int ckall) { u32 sid_session = SECSID_NULL; - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); if (sp) { int i; @@ -408,7 +408,7 @@ int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, struct common_audit_data *ad) { int i; - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); u32 peer_sid = SECINITSID_UNLABELED; if (sp) { -- cgit v1.2.3-58-ga151 From 4165079ba328dd47262a2183049d3591f0a750b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:27 +0100 Subject: net: switch secpath to use skb extension infrastructure Remove skb->sp and allocate secpath storage via extension infrastructure. 
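As a minimal sketch of the resulting receive-path usage (a hypothetical caller, not taken from this patch; it assumes only the secpath_set()/skb_sec_path() helpers and the SKB_EXT_SEC_PATH extension introduced in this series):

	#include <linux/skbuff.h>
	#include <net/xfrm.h>

	/* Hypothetical example: attaching an xfrm_state once the
	 * secpath is stored as an skb extension rather than behind
	 * a dedicated skb->sp pointer.
	 */
	static int example_record_state(struct sk_buff *skb, struct xfrm_state *x)
	{
		struct sec_path *sp;

		/* secpath_set() wraps skb_ext_add(skb, SKB_EXT_SEC_PATH):
		 * it returns the existing extension (COW'd if shared) or
		 * allocates a fresh, zero-length one.
		 */
		sp = secpath_set(skb);
		if (!sp)
			return -ENOMEM;

		if (sp->len == XFRM_MAX_DEPTH)
			return -ENOSPC;

		xfrm_state_hold(x);
		sp->xvec[sp->len++] = x;
		return 0;
	}

Because the secpath now lives in skb->extensions, the dedicated skb->sp pointer can be dropped from struct sk_buff entirely.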
This also reduces sk_buff by 8 bytes on x86_64. Total size of allyesconfig kernel is reduced slightly, as there is less inlined code (one conditional atomic op instead of two on skb_clone). No differences in throughput in following ipsec performance tests: - transport mode with aes on 10GB link - tunnel mode between two network namespaces with aes and null cipher Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/xfrm_device.txt | 7 ++-- include/linux/skbuff.h | 10 +++--- include/net/xfrm.h | 22 +------------ net/core/skbuff.c | 47 +++++++++++++++++++++++---- net/xfrm/xfrm_input.c | 56 +++++--------------------------- 5 files changed, 59 insertions(+), 83 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 267f55b5f54a..a1c904dc70dc 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -111,9 +111,10 @@ the stack in xfrm_input(). xfrm_state_hold(xs); store the state information into the skb - skb->sp = secpath_dup(skb->sp); - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp = secpath_set(skb); + if (!sp) return; + sp->xvec[sp->len++] = xs; + sp->olen++; indicate the success and/or error status of the offload xo = xfrm_offload(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0f254a016bf..3f741b04e55d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -714,9 +714,6 @@ struct sk_buff { struct list_head tcp_tsorted_anchor; }; -#ifdef CONFIG_XFRM - struct sec_path *sp; -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif @@ -3907,6 +3904,9 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, +#endif +#ifdef CONFIG_XFRM + SKB_EXT_SEC_PATH, #endif SKB_EXT_NUM, /* must be last */ }; @@ -4069,7 +4069,7 @@ static inline void skb_init_secmark(struct sk_buff *skb) static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp != NULL; + return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif @@ -4127,7 +4127,7 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp; + return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 31220edcce95..38c232861a64 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1096,7 +1096,6 @@ struct xfrm_offload { }; struct sec_path { - refcount_t refcnt; int len; int olen; @@ -1104,32 +1103,13 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; -static inline struct sec_path * -secpath_get(struct sec_path *sp) -{ - if (sp) - refcount_inc(&sp->refcnt); - return sp; -} - -void __secpath_destroy(struct sec_path *sp); - -static inline void -secpath_put(struct sec_path *sp) -{ - if (sp && refcount_dec_and_test(&sp->refcnt)) - __secpath_destroy(sp); -} - -struct sec_path *secpath_dup(struct sec_path *src); struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM - secpath_put(skb->sp); - skb->sp = NULL; + skb_ext_del(skb, SKB_EXT_SEC_PATH); #endif } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c65723591d7..cb0bf4215745 100644 --- 
a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -609,7 +609,6 @@ fastpath: void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -798,9 +797,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -3912,6 +3908,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -3919,6 +3918,9 @@ static __always_inline unsigned int skb_ext_total_length(void) return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + #endif 0; } @@ -5610,7 +5612,8 @@ static struct skb_ext *skb_ext_alloc(void) return new; } -static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) { struct skb_ext *new; @@ -5624,6 +5627,15 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif __skb_ext_put(old); return new; } @@ -5650,7 +5662,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions) { old = skb->extensions; - new = skb_ext_maybe_cow(old); + new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; @@ -5679,6 +5691,16 @@ set_active: } EXPORT_SYMBOL(skb_ext_add); +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -5687,6 +5709,14 @@ void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif } } EXPORT_SYMBOL(__skb_ext_del); @@ -5702,6 +5732,11 @@ void __skb_ext_put(struct skb_ext *ext) if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b4db25b244fa..6bc817359b58 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -38,8 +38,6 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __ro_after_init; - static 
DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; @@ -111,54 +109,21 @@ static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, return ret; } -void __secpath_destroy(struct sec_path *sp) -{ - int i; - for (i = 0; i < sp->len; i++) - xfrm_state_put(sp->xvec[i]); - kmem_cache_free(secpath_cachep, sp); -} -EXPORT_SYMBOL(__secpath_destroy); - -struct sec_path *secpath_dup(struct sec_path *src) +struct sec_path *secpath_set(struct sk_buff *skb) { - struct sec_path *sp; + struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); - sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC); + sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; - sp->len = 0; - sp->olen = 0; + if (tmp) /* reused existing one (was COW'd if needed) */ + return sp; + /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); - - if (src) { - int i; - - memcpy(sp, src, sizeof(*sp)); - for (i = 0; i < sp->len; i++) - xfrm_state_hold(sp->xvec[i]); - } - refcount_set(&sp->refcnt, 1); - return sp; -} -EXPORT_SYMBOL(secpath_dup); - -struct sec_path *secpath_set(struct sk_buff *skb) -{ - struct sec_path *sp = skb->sp; - - /* Allocate new secpath or COW existing one. */ - if (!sp || refcount_read(&sp->refcnt) != 1) { - sp = secpath_dup(skb->sp); - if (!sp) - return NULL; - - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } + sp->olen = 0; + sp->len = 0; return sp; } @@ -552,11 +517,6 @@ void __init xfrm_input_init(void) if (err) gro_cells.cells = NULL; - secpath_cachep = kmem_cache_create("secpath_cache", - sizeof(struct sec_path), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); - for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; -- cgit v1.2.3-58-ga151 From d312d0a6846a4553bd955afd414f8f55398ece07 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Dec 2018 19:03:14 +0100 Subject: net: drop the unused helper skb_ext_get() Such helper is currently unused, and skb extension users are better off using skb_ext_add()/skb_ext_del(). So let's drop it. Signed-off-by: Paolo Abeni Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3f741b04e55d..2a57a365c711 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3938,16 +3938,6 @@ static inline void skb_ext_put(struct sk_buff *skb) __skb_ext_put(skb->extensions); } -static inline void skb_ext_get(struct sk_buff *skb) -{ - if (skb->active_extensions) { - struct skb_ext *ext = skb->extensions; - - if (ext) - refcount_inc(&ext->refcnt); - } -} - static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { @@ -3995,7 +3985,6 @@ static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) } #else static inline void skb_ext_put(struct sk_buff *skb) {} -static inline void skb_ext_get(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} -- cgit v1.2.3-58-ga151 From d0dcde6426ce071ad447fb9d91c85ab649026114 Mon Sep 17 00:00:00 2001 From: Otto Sabart Date: Sun, 6 Jan 2019 00:29:15 +0100 Subject: doc: networking: convert offload files into RST and update references This patch renames offload files. This is necessary for Sphinx. 
Also update reference to checksum-offloads.rst file. Whole kernel code was grepped for references using: $ grep -r "\(segmentation\|checksum\)-offloads.txt" . There should be no other references to {segmentation,checksum}-offloads.txt files. Signed-off-by: Otto Sabart Acked-by: David S. Miller Signed-off-by: Jonathan Corbet --- Documentation/networking/checksum-offloads.rst | 143 ++++++++++++++++ Documentation/networking/checksum-offloads.txt | 143 ---------------- Documentation/networking/segmentation-offloads.rst | 184 +++++++++++++++++++++ Documentation/networking/segmentation-offloads.txt | 184 --------------------- include/linux/skbuff.h | 2 +- 5 files changed, 328 insertions(+), 328 deletions(-) create mode 100644 Documentation/networking/checksum-offloads.rst delete mode 100644 Documentation/networking/checksum-offloads.txt create mode 100644 Documentation/networking/segmentation-offloads.rst delete mode 100644 Documentation/networking/segmentation-offloads.txt (limited to 'include/linux/skbuff.h') diff --git a/Documentation/networking/checksum-offloads.rst b/Documentation/networking/checksum-offloads.rst new file mode 100644 index 000000000000..1a1cd94a3f6d --- /dev/null +++ b/Documentation/networking/checksum-offloads.rst @@ -0,0 +1,143 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +Checksum Offloads in the Linux Networking Stack +=============================================== + + +Introduction +============ + +This document describes a set of techniques in the Linux networking stack to +take advantage of checksum offload capabilities of various NICs. + +The following technologies are described: + +* TX Checksum Offload +* LCO: Local Checksum Offload +* RCO: Remote Checksum Offload + +Things that should be documented here but aren't yet: + +* RX Checksum Offload +* CHECKSUM_UNNECESSARY conversion + + +TX Checksum Offload +=================== + +The interface for offloading a transmit checksum to a device is explained in +detail in comments near the top of include/linux/skbuff.h. + +In brief, it allows to request the device fill in a single ones-complement +checksum defined by the sk_buff fields skb->csum_start and skb->csum_offset. +The device should compute the 16-bit ones-complement checksum (i.e. the +'IP-style' checksum) from csum_start to the end of the packet, and fill in the +result at (csum_start + csum_offset). + +Because csum_offset cannot be negative, this ensures that the previous value of +the checksum field is included in the checksum computation, thus it can be used +to supply any needed corrections to the checksum (such as the sum of the +pseudo-header for UDP or TCP). + +This interface only allows a single checksum to be offloaded. Where +encapsulation is used, the packet may have multiple checksum fields in +different header layers, and the rest will have to be handled by another +mechanism such as LCO or RCO. + +CRC32c can also be offloaded using this interface, by means of filling +skb->csum_start and skb->csum_offset as described above, and setting +skb->csum_not_inet: see skbuff.h comment (section 'D') for more details. + +No offloading of the IP header checksum is performed; it is always done in +software. This is OK because when we build the IP header, we obviously have it +in cache, so summing it isn't expensive. It's also rather short. 
+ +The requirements for GSO are more complicated, because when segmenting an +encapsulated packet both the inner and outer checksums may need to be edited or +recomputed for each resulting segment. See the skbuff.h comment (section 'E') +for more details. + +A driver declares its offload capabilities in netdev->hw_features; see +Documentation/networking/netdev-features.txt for more. Note that a device +which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and +csum_offset given in the SKB; if it tries to deduce these itself in hardware +(as some NICs do) the driver should check that the values in the SKB match +those which the hardware will deduce, and if not, fall back to checksumming in +software instead (with skb_csum_hwoffload_help() or one of the +skb_checksum_help() / skb_crc32c_csum_help functions, as mentioned in +include/linux/skbuff.h). + +The stack should, for the most part, assume that checksum offload is supported +by the underlying device. The only place that should check is +validate_xmit_skb(), and the functions it calls directly or indirectly. That +function compares the offload features requested by the SKB (which may include +other offloads besides TX Checksum Offload) and, if they are not supported or +enabled on the device (determined by netdev->features), performs the +corresponding offload in software. In the case of TX Checksum Offload, that +means calling skb_csum_hwoffload_help(skb, features). + + +LCO: Local Checksum Offload +=========================== + +LCO is a technique for efficiently computing the outer checksum of an +encapsulated datagram when the inner checksum is due to be offloaded. + +The ones-complement sum of a correctly checksummed TCP or UDP packet is equal +to the complement of the sum of the pseudo header, because everything else gets +'cancelled out' by the checksum field. This is because the sum was +complemented before being written to the checksum field. + +More generally, this holds in any case where the 'IP-style' ones complement +checksum is used, and thus any checksum that TX Checksum Offload supports. + +That is, if we have set up TX Checksum Offload with a start/offset pair, we +know that after the device has filled in that checksum, the ones complement sum +from csum_start to the end of the packet will be equal to the complement of +whatever value we put in the checksum field beforehand. This allows us to +compute the outer checksum without looking at the payload: we simply stop +summing when we get to csum_start, then add the complement of the 16-bit word +at (csum_start + csum_offset). + +Then, when the true inner checksum is filled in (either by hardware or by +skb_checksum_help()), the outer checksum will become correct by virtue of the +arithmetic. + +LCO is performed by the stack when constructing an outer UDP header for an +encapsulation such as VXLAN or GENEVE, in udp_set_csum(). Similarly for the +IPv6 equivalents, in udp6_set_csum(). + +It is also performed when constructing an IPv4 GRE header, in +net/ipv4/ip_gre.c:build_header(). It is *not* currently performed when +constructing an IPv6 GRE header; the GRE checksum is computed over the whole +packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be possible to use +LCO here as IPv6 GRE still uses an IP-style checksum. + +All of the LCO implementations use a helper function lco_csum(), in +include/linux/skbuff.h. 
+ +LCO can safely be used for nested encapsulations; in this case, the outer +encapsulation layer will sum over both its own header and the 'middle' header. +This does mean that the 'middle' header will get summed multiple times, but +there doesn't seem to be a way to avoid that without incurring bigger costs +(e.g. in SKB bloat). + + +RCO: Remote Checksum Offload +============================ + +RCO is a technique for eliding the inner checksum of an encapsulated datagram, +allowing the outer checksum to be offloaded. It does, however, involve a +change to the encapsulation protocols, which the receiver must also support. +For this reason, it is disabled by default. + +RCO is detailed in the following Internet-Drafts: + +* https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00 +* https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 + +In Linux, RCO is implemented individually in each encapsulation protocol, and +most tunnel types have flags controlling its use. For instance, VXLAN has the +flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that RCO should be +used when transmitting to a given remote destination. diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt deleted file mode 100644 index 1a1cd94a3f6d..000000000000 --- a/Documentation/networking/checksum-offloads.txt +++ /dev/null @@ -1,143 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============================================== -Checksum Offloads in the Linux Networking Stack -=============================================== - - -Introduction -============ - -This document describes a set of techniques in the Linux networking stack to -take advantage of checksum offload capabilities of various NICs. - -The following technologies are described: - -* TX Checksum Offload -* LCO: Local Checksum Offload -* RCO: Remote Checksum Offload - -Things that should be documented here but aren't yet: - -* RX Checksum Offload -* CHECKSUM_UNNECESSARY conversion - - -TX Checksum Offload -=================== - -The interface for offloading a transmit checksum to a device is explained in -detail in comments near the top of include/linux/skbuff.h. - -In brief, it allows to request the device fill in a single ones-complement -checksum defined by the sk_buff fields skb->csum_start and skb->csum_offset. -The device should compute the 16-bit ones-complement checksum (i.e. the -'IP-style' checksum) from csum_start to the end of the packet, and fill in the -result at (csum_start + csum_offset). - -Because csum_offset cannot be negative, this ensures that the previous value of -the checksum field is included in the checksum computation, thus it can be used -to supply any needed corrections to the checksum (such as the sum of the -pseudo-header for UDP or TCP). - -This interface only allows a single checksum to be offloaded. Where -encapsulation is used, the packet may have multiple checksum fields in -different header layers, and the rest will have to be handled by another -mechanism such as LCO or RCO. - -CRC32c can also be offloaded using this interface, by means of filling -skb->csum_start and skb->csum_offset as described above, and setting -skb->csum_not_inet: see skbuff.h comment (section 'D') for more details. - -No offloading of the IP header checksum is performed; it is always done in -software. This is OK because when we build the IP header, we obviously have it -in cache, so summing it isn't expensive. It's also rather short. 
- -The requirements for GSO are more complicated, because when segmenting an -encapsulated packet both the inner and outer checksums may need to be edited or -recomputed for each resulting segment. See the skbuff.h comment (section 'E') -for more details. - -A driver declares its offload capabilities in netdev->hw_features; see -Documentation/networking/netdev-features.txt for more. Note that a device -which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start and -csum_offset given in the SKB; if it tries to deduce these itself in hardware -(as some NICs do) the driver should check that the values in the SKB match -those which the hardware will deduce, and if not, fall back to checksumming in -software instead (with skb_csum_hwoffload_help() or one of the -skb_checksum_help() / skb_crc32c_csum_help functions, as mentioned in -include/linux/skbuff.h). - -The stack should, for the most part, assume that checksum offload is supported -by the underlying device. The only place that should check is -validate_xmit_skb(), and the functions it calls directly or indirectly. That -function compares the offload features requested by the SKB (which may include -other offloads besides TX Checksum Offload) and, if they are not supported or -enabled on the device (determined by netdev->features), performs the -corresponding offload in software. In the case of TX Checksum Offload, that -means calling skb_csum_hwoffload_help(skb, features). - - -LCO: Local Checksum Offload -=========================== - -LCO is a technique for efficiently computing the outer checksum of an -encapsulated datagram when the inner checksum is due to be offloaded. - -The ones-complement sum of a correctly checksummed TCP or UDP packet is equal -to the complement of the sum of the pseudo header, because everything else gets -'cancelled out' by the checksum field. This is because the sum was -complemented before being written to the checksum field. - -More generally, this holds in any case where the 'IP-style' ones complement -checksum is used, and thus any checksum that TX Checksum Offload supports. - -That is, if we have set up TX Checksum Offload with a start/offset pair, we -know that after the device has filled in that checksum, the ones complement sum -from csum_start to the end of the packet will be equal to the complement of -whatever value we put in the checksum field beforehand. This allows us to -compute the outer checksum without looking at the payload: we simply stop -summing when we get to csum_start, then add the complement of the 16-bit word -at (csum_start + csum_offset). - -Then, when the true inner checksum is filled in (either by hardware or by -skb_checksum_help()), the outer checksum will become correct by virtue of the -arithmetic. - -LCO is performed by the stack when constructing an outer UDP header for an -encapsulation such as VXLAN or GENEVE, in udp_set_csum(). Similarly for the -IPv6 equivalents, in udp6_set_csum(). - -It is also performed when constructing an IPv4 GRE header, in -net/ipv4/ip_gre.c:build_header(). It is *not* currently performed when -constructing an IPv6 GRE header; the GRE checksum is computed over the whole -packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be possible to use -LCO here as IPv6 GRE still uses an IP-style checksum. - -All of the LCO implementations use a helper function lco_csum(), in -include/linux/skbuff.h. 
- -LCO can safely be used for nested encapsulations; in this case, the outer -encapsulation layer will sum over both its own header and the 'middle' header. -This does mean that the 'middle' header will get summed multiple times, but -there doesn't seem to be a way to avoid that without incurring bigger costs -(e.g. in SKB bloat). - - -RCO: Remote Checksum Offload -============================ - -RCO is a technique for eliding the inner checksum of an encapsulated datagram, -allowing the outer checksum to be offloaded. It does, however, involve a -change to the encapsulation protocols, which the receiver must also support. -For this reason, it is disabled by default. - -RCO is detailed in the following Internet-Drafts: - -* https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00 -* https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 - -In Linux, RCO is implemented individually in each encapsulation protocol, and -most tunnel types have flags controlling its use. For instance, VXLAN has the -flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that RCO should be -used when transmitting to a given remote destination. diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst new file mode 100644 index 000000000000..1794bfe98196 --- /dev/null +++ b/Documentation/networking/segmentation-offloads.rst @@ -0,0 +1,184 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================== +Segmentation Offloads in the Linux Networking Stack +=================================================== + + +Introduction +============ + +This document describes a set of techniques in the Linux networking stack +to take advantage of segmentation offload capabilities of various NICs. + +The following technologies are described: + * TCP Segmentation Offload - TSO + * UDP Fragmentation Offload - UFO + * IPIP, SIT, GRE, and UDP Tunnel Offloads + * Generic Segmentation Offload - GSO + * Generic Receive Offload - GRO + * Partial Generic Segmentation Offload - GSO_PARTIAL + * SCTP accelleration with GSO - GSO_BY_FRAGS + + +TCP Segmentation Offload +======================== + +TCP segmentation allows a device to segment a single frame into multiple +frames with a data payload size specified in skb_shinfo()->gso_size. +When TCP segmentation requested the bit for either SKB_GSO_TCPV4 or +SKB_GSO_TCPV6 should be set in skb_shinfo()->gso_type and +skb_shinfo()->gso_size should be set to a non-zero value. + +TCP segmentation is dependent on support for the use of partial checksum +offload. For this reason TSO is normally disabled if the Tx checksum +offload for a given device is disabled. + +In order to support TCP segmentation offload it is necessary to populate +the network and transport header offsets of the skbuff so that the device +drivers will be able determine the offsets of the IP or IPv6 header and the +TCP header. In addition as CHECKSUM_PARTIAL is required csum_start should +also point to the TCP header of the packet. + +For IPv4 segmentation we support one of two types in terms of the IP ID. +The default behavior is to increment the IP ID with every segment. If the +GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP +ID and all segments will use the same IP ID. If a device has +NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO +and we will either increment the IP ID for all frames, or leave it at a +static value based on driver preference. 
+ + +UDP Fragmentation Offload +========================= + +UDP fragmentation offload allows a device to fragment an oversized UDP +datagram into multiple IPv4 fragments. Many of the requirements for UDP +fragmentation offload are the same as TSO. However the IPv4 ID for +fragments should not increment as a single IPv4 datagram is fragmented. + +UFO is deprecated: modern kernels will no longer generate UFO skbs, but can +still receive them from tuntap and similar devices. Offload of UDP-based +tunnel protocols is still supported. + + +IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads +======================================================== + +In addition to the offloads described above it is possible for a frame to +contain additional headers such as an outer tunnel. In order to account +for such instances an additional set of segmentation offload types were +introduced including SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_GRE, and +SKB_GSO_UDP_TUNNEL. These extra segmentation types are used to identify +cases where there are more than just 1 set of headers. For example in the +case of IPIP and SIT we should have the network and transport headers moved +from the standard list of headers to "inner" header offsets. + +Currently only two levels of headers are supported. The convention is to +refer to the tunnel headers as the outer headers, while the encapsulated +data is normally referred to as the inner headers. Below is the list of +calls to access the given headers: + +IPIP/SIT Tunnel:: + + Outer Inner + MAC skb_mac_header + Network skb_network_header skb_inner_network_header + Transport skb_transport_header + +UDP/GRE Tunnel:: + + Outer Inner + MAC skb_mac_header skb_inner_mac_header + Network skb_network_header skb_inner_network_header + Transport skb_transport_header skb_inner_transport_header + +In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and +SKB_GSO_UDP_TUNNEL_CSUM. These two additional tunnel types reflect the +fact that the outer header also requests to have a non-zero checksum +included in the outer header. + +Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel +header has requested a remote checksum offload. In this case the inner +headers will be left with a partial checksum and only the outer header +checksum will be computed. + + +Generic Segmentation Offload +============================ + +Generic segmentation offload is a pure software offload that is meant to +deal with cases where device drivers cannot perform the offloads described +above. What occurs in GSO is that a given skbuff will have its data broken +out over multiple skbuffs that have been resized to match the MSS provided +via skb_shinfo()->gso_size. + +Before enabling any hardware segmentation offload a corresponding software +offload is required in GSO. Otherwise it becomes possible for a frame to +be re-routed between devices and end up being unable to be transmitted. + + +Generic Receive Offload +======================= + +Generic receive offload is the complement to GSO. Ideally any frame +assembled by GRO should be segmented to create an identical sequence of +frames using GSO, and any sequence of frames segmented by GSO should be +able to be reassembled back to the original by GRO. The only exception to +this is IPv4 ID in the case that the DF bit is set for a given IP header. +If the value of the IPv4 ID is not sequentially incrementing it will be +altered so that it is when a frame assembled via GRO is segmented via GSO. 
+ + +Partial Generic Segmentation Offload +==================================== + +Partial generic segmentation offload is a hybrid between TSO and GSO. What +it effectively does is take advantage of certain traits of TCP and tunnels +so that instead of having to rewrite the packet headers for each segment +only the inner-most transport header and possibly the outer-most network +header need to be updated. This allows devices that do not support tunnel +offloads or tunnel offloads with checksum to still make use of segmentation. + +With the partial offload what occurs is that all headers excluding the +inner transport header are updated such that they will contain the correct +values for if the header was simply duplicated. The one exception to this +is the outer IPv4 ID field. It is up to the device drivers to guarantee +that the IPv4 ID field is incremented in the case that a given header does +not have the DF bit set. + + +SCTP accelleration with GSO +=========================== + +SCTP - despite the lack of hardware support - can still take advantage of +GSO to pass one large packet through the network stack, rather than +multiple small packets. + +This requires a different approach to other offloads, as SCTP packets +cannot be just segmented to (P)MTU. Rather, the chunks must be contained in +IP segments, padding respected. So unlike regular GSO, SCTP can't just +generate a big skb, set gso_size to the fragmentation point and deliver it +to IP layer. + +Instead, the SCTP protocol layer builds an skb with the segments correctly +padded and stored as chained skbs, and skb_segment() splits based on those. +To signal this, gso_size is set to the special value GSO_BY_FRAGS. + +Therefore, any code in the core networking stack must be aware of the +possibility that gso_size will be GSO_BY_FRAGS and handle that case +appropriately. + +There are some helpers to make this easier: + +- skb_is_gso(skb) && skb_is_gso_sctp(skb) is the best way to see if + an skb is an SCTP GSO skb. + +- For size checks, the skb_gso_validate_*_len family of helpers correctly + considers GSO_BY_FRAGS. + +- For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size + will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs. + +This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits +set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE. diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt deleted file mode 100644 index 1794bfe98196..000000000000 --- a/Documentation/networking/segmentation-offloads.txt +++ /dev/null @@ -1,184 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=================================================== -Segmentation Offloads in the Linux Networking Stack -=================================================== - - -Introduction -============ - -This document describes a set of techniques in the Linux networking stack -to take advantage of segmentation offload capabilities of various NICs. 
- -The following technologies are described: - * TCP Segmentation Offload - TSO - * UDP Fragmentation Offload - UFO - * IPIP, SIT, GRE, and UDP Tunnel Offloads - * Generic Segmentation Offload - GSO - * Generic Receive Offload - GRO - * Partial Generic Segmentation Offload - GSO_PARTIAL - * SCTP accelleration with GSO - GSO_BY_FRAGS - - -TCP Segmentation Offload -======================== - -TCP segmentation allows a device to segment a single frame into multiple -frames with a data payload size specified in skb_shinfo()->gso_size. -When TCP segmentation requested the bit for either SKB_GSO_TCPV4 or -SKB_GSO_TCPV6 should be set in skb_shinfo()->gso_type and -skb_shinfo()->gso_size should be set to a non-zero value. - -TCP segmentation is dependent on support for the use of partial checksum -offload. For this reason TSO is normally disabled if the Tx checksum -offload for a given device is disabled. - -In order to support TCP segmentation offload it is necessary to populate -the network and transport header offsets of the skbuff so that the device -drivers will be able determine the offsets of the IP or IPv6 header and the -TCP header. In addition as CHECKSUM_PARTIAL is required csum_start should -also point to the TCP header of the packet. - -For IPv4 segmentation we support one of two types in terms of the IP ID. -The default behavior is to increment the IP ID with every segment. If the -GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP -ID and all segments will use the same IP ID. If a device has -NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO -and we will either increment the IP ID for all frames, or leave it at a -static value based on driver preference. - - -UDP Fragmentation Offload -========================= - -UDP fragmentation offload allows a device to fragment an oversized UDP -datagram into multiple IPv4 fragments. Many of the requirements for UDP -fragmentation offload are the same as TSO. However the IPv4 ID for -fragments should not increment as a single IPv4 datagram is fragmented. - -UFO is deprecated: modern kernels will no longer generate UFO skbs, but can -still receive them from tuntap and similar devices. Offload of UDP-based -tunnel protocols is still supported. - - -IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads -======================================================== - -In addition to the offloads described above it is possible for a frame to -contain additional headers such as an outer tunnel. In order to account -for such instances an additional set of segmentation offload types were -introduced including SKB_GSO_IPXIP4, SKB_GSO_IPXIP6, SKB_GSO_GRE, and -SKB_GSO_UDP_TUNNEL. These extra segmentation types are used to identify -cases where there are more than just 1 set of headers. For example in the -case of IPIP and SIT we should have the network and transport headers moved -from the standard list of headers to "inner" header offsets. - -Currently only two levels of headers are supported. The convention is to -refer to the tunnel headers as the outer headers, while the encapsulated -data is normally referred to as the inner headers. 
Below is the list of -calls to access the given headers: - -IPIP/SIT Tunnel:: - - Outer Inner - MAC skb_mac_header - Network skb_network_header skb_inner_network_header - Transport skb_transport_header - -UDP/GRE Tunnel:: - - Outer Inner - MAC skb_mac_header skb_inner_mac_header - Network skb_network_header skb_inner_network_header - Transport skb_transport_header skb_inner_transport_header - -In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and -SKB_GSO_UDP_TUNNEL_CSUM. These two additional tunnel types reflect the -fact that the outer header also requests to have a non-zero checksum -included in the outer header. - -Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel -header has requested a remote checksum offload. In this case the inner -headers will be left with a partial checksum and only the outer header -checksum will be computed. - - -Generic Segmentation Offload -============================ - -Generic segmentation offload is a pure software offload that is meant to -deal with cases where device drivers cannot perform the offloads described -above. What occurs in GSO is that a given skbuff will have its data broken -out over multiple skbuffs that have been resized to match the MSS provided -via skb_shinfo()->gso_size. - -Before enabling any hardware segmentation offload a corresponding software -offload is required in GSO. Otherwise it becomes possible for a frame to -be re-routed between devices and end up being unable to be transmitted. - - -Generic Receive Offload -======================= - -Generic receive offload is the complement to GSO. Ideally any frame -assembled by GRO should be segmented to create an identical sequence of -frames using GSO, and any sequence of frames segmented by GSO should be -able to be reassembled back to the original by GRO. The only exception to -this is IPv4 ID in the case that the DF bit is set for a given IP header. -If the value of the IPv4 ID is not sequentially incrementing it will be -altered so that it is when a frame assembled via GRO is segmented via GSO. - - -Partial Generic Segmentation Offload -==================================== - -Partial generic segmentation offload is a hybrid between TSO and GSO. What -it effectively does is take advantage of certain traits of TCP and tunnels -so that instead of having to rewrite the packet headers for each segment -only the inner-most transport header and possibly the outer-most network -header need to be updated. This allows devices that do not support tunnel -offloads or tunnel offloads with checksum to still make use of segmentation. - -With the partial offload what occurs is that all headers excluding the -inner transport header are updated such that they will contain the correct -values for if the header was simply duplicated. The one exception to this -is the outer IPv4 ID field. It is up to the device drivers to guarantee -that the IPv4 ID field is incremented in the case that a given header does -not have the DF bit set. - - -SCTP accelleration with GSO -=========================== - -SCTP - despite the lack of hardware support - can still take advantage of -GSO to pass one large packet through the network stack, rather than -multiple small packets. - -This requires a different approach to other offloads, as SCTP packets -cannot be just segmented to (P)MTU. Rather, the chunks must be contained in -IP segments, padding respected. 
So unlike regular GSO, SCTP can't just -generate a big skb, set gso_size to the fragmentation point and deliver it -to IP layer. - -Instead, the SCTP protocol layer builds an skb with the segments correctly -padded and stored as chained skbs, and skb_segment() splits based on those. -To signal this, gso_size is set to the special value GSO_BY_FRAGS. - -Therefore, any code in the core networking stack must be aware of the -possibility that gso_size will be GSO_BY_FRAGS and handle that case -appropriately. - -There are some helpers to make this easier: - -- skb_is_gso(skb) && skb_is_gso_sctp(skb) is the best way to see if - an skb is an SCTP GSO skb. - -- For size checks, the skb_gso_validate_*_len family of helpers correctly - considers GSO_BY_FRAGS. - -- For manipulating packets, skb_increase_gso_size and skb_decrease_gso_size - will check for GSO_BY_FRAGS and WARN if asked to manipulate these skbs. - -This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits -set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 93f56fddd92a..4e671b46e767 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4296,7 +4296,7 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb) /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. - * See Documentation/networking/checksum-offloads.txt for + * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. -- cgit v1.2.3-58-ga151 From 6c57f0458022298e4da1729c67bd33ce41c14e7a Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Thu, 17 Jan 2019 15:34:38 +0000 Subject: net: Fix usage of pskb_trim_rcsum In certain cases, pskb_trim_rcsum() may change skb pointers. Reinitialize header pointers afterwards to avoid potential use-after-frees. Add a note in the documentation of pskb_trim_rcsum(). Found by KASAN. Signed-off-by: Ross Lagerwall Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 1 + include/linux/skbuff.h | 1 + net/bridge/br_netfilter_ipv6.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 1 + net/ipv4/ip_input.c | 1 + 5 files changed, 5 insertions(+) (limited to 'include/linux/skbuff.h') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 62dc564b251d..f22639f0116a 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -445,6 +445,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (pskb_trim_rcsum(skb, len)) goto drop; + ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 93f56fddd92a..95d25b010a25 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3218,6 +3218,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. + * It can change skb pointers. 
*/ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 94039f588f1d..564710f88f93 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -131,6 +131,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) IPSTATS_MIB_INDISCARDS); goto drop; } + hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) goto drop; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 08cbed7d940e..419e8edf23ba 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -229,6 +229,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) return false; + ip6h = ipv6_hdr(skb); thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 26921f6b3b92..51d8efba6de2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -488,6 +488,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ -- cgit v1.2.3-58-ga151 From c8aa703822bf811269975cf7251b5eaad4c38e9c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 28 Jan 2019 08:53:53 -0800 Subject: net/flow_dissector: move bpf case into __skb_flow_bpf_dissect This way, we can reuse it for flow dissector in BPF_PROG_TEST_RUN. No functional changes. Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 5 +++ net/core/flow_dissector.c | 92 +++++++++++++++++++++++++++-------------------- 2 files changed, 59 insertions(+), 38 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 93f56fddd92a..be762fc34ff3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1221,6 +1221,11 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) } #endif +struct bpf_flow_keys; +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 9f2840510e63..bb1a54747d64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -683,6 +683,46 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys) +{ + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. 
+ */ + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(*cb)); + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->nhoff = skb_network_offset(skb); + flow_keys->thoff = flow_keys->nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(prog, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, skb->len); + + return result == BPF_OK; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -714,7 +754,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; - struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -754,53 +793,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - rcu_read_lock(); if (skb) { + struct bpf_flow_keys flow_keys; + struct bpf_prog *attached = NULL; + + rcu_read_lock(); + if (skb->dev) attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); else if (skb->sk) attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); else WARN_ON_ONCE(1); - } - if (attached) { - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - struct bpf_flow_keys flow_keys = {}; - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; - u32 result; - - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(cb_saved)); - /* Pass parameters to the BPF program */ - cb->qdisc_cb.flow_keys = &flow_keys; - flow_keys.nhoff = nhoff; - flow_keys.thoff = nhoff; - - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(attached, skb); - - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); - flow_keys.thoff = clamp_t(u16, flow_keys.thoff, - flow_keys.nhoff, skb->len); - - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); + if (attached) { + ret = __skb_flow_bpf_dissect(attached, skb, + flow_dissector, + &flow_keys); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return ret; + } rcu_read_unlock(); - return result == BPF_OK; } - rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { -- cgit v1.2.3-58-ga151 From 13c6ee2a921683bae4bb4ba57b1f5b82f49e6b8a Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:48 -0800 Subject: socket: Use old_timeval types for socket timestamps As part of y2038 solution, all internal uses of struct timeval are replaced by struct __kernel_old_timeval and struct compat_timeval by struct old_timeval32. Make socket timestamps use these new types. This is mainly to be able to verify that the kernel build is y2038 safe when such non y2038 safe types are not supported anymore. 
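To make the user-visible side of this concrete, here is a rough sketch (an editor's illustration under ordinary POSIX-socket assumptions, not code from the patch) of the receive path these types feed: a process that enabled SO_TIMESTAMP reads the stamp back as an SCM_TIMESTAMP control message, which the kernel fills in from skb_get_timestamp():

	/* Sketch: print the software receive timestamp of one datagram.
	 * Assumes fd is a datagram socket on which SO_TIMESTAMP was
	 * enabled with setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, ...).
	 */
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/time.h>

	static void print_rx_tstamp(int fd)
	{
		char buf[2048], ctrl[CMSG_SPACE(sizeof(struct timeval))];
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct msghdr msg = {
			.msg_iov = &iov, .msg_iovlen = 1,
			.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
		};
		struct cmsghdr *cm;
		struct timeval tv;

		if (recvmsg(fd, &msg, 0) < 0)
			return;

		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_TIMESTAMP) {
				memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
				printf("rx at %ld.%06ld\n",
				       (long)tv.tv_sec, (long)tv.tv_usec);
			}
	}

On 32-bit userspace it is the 32-bit tv_sec in struct timeval that makes this interface y2038-unsafe; the SO_TIMESTAMP_NEW variant added by the following patch keeps the same flow but carries 64-bit seconds.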
Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Cc: isdn@linux-pingi.de Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 2 +- include/linux/skbuff.h | 6 +++--- net/bluetooth/hci_sock.c | 4 ++-- net/compat.c | 6 +++--- net/ipv4/tcp.c | 2 +- net/rds/recv.c | 2 +- net/socket.c | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 15d3ca37669a..4ab8b1b6608f 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -103,7 +103,7 @@ mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { - struct timeval tv; + struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c34595374e93..4001611a4c9f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3486,16 +3486,16 @@ static inline ktime_t skb_get_ktime(const struct sk_buff *skb) /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from - * @stamp: pointer to struct timeval to store stamp in + * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. */ static inline void skb_get_timestamp(const struct sk_buff *skb, - struct timeval *stamp) + struct __kernel_old_timeval *stamp) { - *stamp = ktime_to_timeval(skb->tstamp); + *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_timestampns(const struct sk_buff *skb, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1506e1632394..65228bfa4487 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1383,9 +1383,9 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, if (mask & HCI_CMSG_TSTAMP) { #ifdef CONFIG_COMPAT - struct compat_timeval ctv; + struct old_timeval32 ctv; #endif - struct timeval tv; + struct __kernel_old_timeval tv; void *data; int len; diff --git a/net/compat.c b/net/compat.c index ccf93cd0e49b..9629f053d4fa 100644 --- a/net/compat.c +++ b/net/compat.c @@ -209,8 +209,8 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat { struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; struct compat_cmsghdr cmhdr; - struct compat_timeval ctv; - struct compat_timespec cts[3]; + struct old_timeval32 ctv; + struct old_timespec32 cts[3]; int cmlen; if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) { @@ -220,7 +220,7 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat if (!COMPAT_USE_64BIT_TIME) { if (level == SOL_SOCKET && type == SO_TIMESTAMP_OLD) { - struct timeval *tv = (struct timeval *)data; + struct __kernel_old_timeval *tv = (struct __kernel_old_timeval *)data; ctv.tv_sec = tv->tv_sec; ctv.tv_usec = tv->tv_usec; data = &ctv; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e29aec59cad1..3ce41b04c0f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1861,7 +1861,7 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb, static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping *tss) { - struct timeval tv; + struct __kernel_old_timeval tv; bool has_timestamping = false; if (tss->ts[0].tv_sec || 
tss->ts[0].tv_nsec) { diff --git a/net/rds/recv.c b/net/rds/recv.c index 04e30d63a159..435bf2320cd3 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -549,7 +549,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, if ((inc->i_rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { - struct timeval tv = ktime_to_timeval(inc->i_rx_tstamp); + struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_rx_tstamp); ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); if (ret) diff --git a/net/socket.c b/net/socket.c index 5087f9e40f3a..9cc281cdb9d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -719,7 +719,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { - struct timeval tv; + struct __kernel_old_timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); -- cgit v1.2.3-58-ga151 From 887feae36aee6c08e0dafcdaa5ba921abbb2c56b Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:50 -0800 Subject: socket: Add SO_TIMESTAMP[NS]_NEW Add SO_TIMESTAMP_NEW and SO_TIMESTAMPNS_NEW variants of socket timestamp options. These are the y2038 safe versions of the SO_TIMESTAMP_OLD and SO_TIMESTAMPNS_OLD for all architectures. Note that the format of scm_timestamping.ts[0] is not changed in this patch. Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Cc: jejb@parisc-linux.org Cc: ralf@linux-mips.org Cc: rth@twiddle.net Cc: linux-alpha@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-parisc@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Cc: sparclinux@vger.kernel.org Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 14 ++++++++++++-- arch/mips/include/uapi/asm/socket.h | 14 ++++++++++++-- arch/parisc/include/uapi/asm/socket.h | 14 ++++++++++++-- arch/sparc/include/uapi/asm/socket.h | 14 ++++++++++++-- include/linux/skbuff.h | 18 ++++++++++++++++++ include/net/sock.h | 1 + include/uapi/asm-generic/socket.h | 15 +++++++++++++-- net/core/sock.c | 21 +++++++++++++++++++-- net/ipv4/tcp.c | 33 +++++++++++++++++++++++++-------- net/rds/af_rds.c | 8 ++++++-- net/rds/recv.c | 16 ++++++++++++++-- net/socket.c | 35 +++++++++++++++++++++++++++-------- 12 files changed, 171 insertions(+), 32 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 992a0a6dcea1..aab11eec7c22 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -3,6 +3,7 @@ #define _UAPI_ASM_SOCKET_H #include +#include /* For setsockopt(2) */ /* @@ -114,10 +115,19 @@ #define SO_TIMESTAMPNS_OLD 35 #define SO_TIMESTAMPING_OLD 37 +#define SO_TIMESTAMP_NEW 63 +#define SO_TIMESTAMPNS_NEW 64 + #if !defined(__KERNEL__) -#define SO_TIMESTAMP SO_TIMESTAMP_OLD -#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#if __BITS_PER_LONG == 64 +#define SO_TIMESTAMP SO_TIMESTAMP_OLD +#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#else +#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) +#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? 
SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#endif + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #define SCM_TIMESTAMP SO_TIMESTAMP diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 0f4516c34df2..11014f684d9f 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -11,6 +11,7 @@ #define _UAPI_ASM_SOCKET_H #include +#include /* * For setsockopt(2) @@ -125,10 +126,19 @@ #define SO_TIMESTAMPNS_OLD 35 #define SO_TIMESTAMPING_OLD 37 +#define SO_TIMESTAMP_NEW 63 +#define SO_TIMESTAMPNS_NEW 64 + #if !defined(__KERNEL__) -#define SO_TIMESTAMP SO_TIMESTAMP_OLD -#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#if __BITS_PER_LONG == 64 +#define SO_TIMESTAMP SO_TIMESTAMP_OLD +#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#else +#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) +#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#endif + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #define SCM_TIMESTAMP SO_TIMESTAMP diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 7c180321ebd6..cbc4b89c2fe4 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -3,6 +3,7 @@ #define _UAPI_ASM_SOCKET_H #include +#include /* For setsockopt(2) */ #define SOL_SOCKET 0xffff @@ -106,10 +107,19 @@ #define SO_TIMESTAMPNS_OLD 0x4013 #define SO_TIMESTAMPING_OLD 0x4020 +#define SO_TIMESTAMP_NEW 0x4038 +#define SO_TIMESTAMPNS_NEW 0x4039 + #if !defined(__KERNEL__) -#define SO_TIMESTAMP SO_TIMESTAMP_OLD -#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#if __BITS_PER_LONG == 64 +#define SO_TIMESTAMP SO_TIMESTAMP_OLD +#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#else +#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) +#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#endif + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #define SCM_TIMESTAMP SO_TIMESTAMP diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index d8a1bbc3e6c4..85127425b294 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -3,6 +3,7 @@ #define _ASM_SOCKET_H #include +#include /* For setsockopt(2) */ #define SOL_SOCKET 0xffff @@ -107,10 +108,19 @@ #define SO_TIMESTAMPNS_OLD 0x0021 #define SO_TIMESTAMPING_OLD 0x0023 +#define SO_TIMESTAMP_NEW 0x0041 +#define SO_TIMESTAMPNS_NEW 0x0042 + #if !defined(__KERNEL__) -#define SO_TIMESTAMP SO_TIMESTAMP_OLD -#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#if __BITS_PER_LONG == 64 +#define SO_TIMESTAMP SO_TIMESTAMP_OLD +#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#else +#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) +#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? 
SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#endif + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #define SCM_TIMESTAMP SO_TIMESTAMP diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4001611a4c9f..831846617d07 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3498,12 +3498,30 @@ static inline void skb_get_timestamp(const struct sk_buff *skb, *stamp = ns_to_kernel_old_timeval(skb->tstamp); } +static inline void skb_get_new_timestamp(const struct sk_buff *skb, + struct __kernel_sock_timeval *stamp) +{ + struct timespec64 ts = ktime_to_timespec64(skb->tstamp); + + stamp->tv_sec = ts.tv_sec; + stamp->tv_usec = ts.tv_nsec / 1000; +} + static inline void skb_get_timestampns(const struct sk_buff *skb, struct timespec *stamp) { *stamp = ktime_to_timespec(skb->tstamp); } +static inline void skb_get_new_timestampns(const struct sk_buff *skb, + struct __kernel_timespec *stamp) +{ + struct timespec64 ts = ktime_to_timespec64(skb->tstamp); + + stamp->tv_sec = ts.tv_sec; + stamp->tv_nsec = ts.tv_nsec; +} + static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); diff --git a/include/net/sock.h b/include/net/sock.h index 2b229f7be8eb..6679f3c120b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -805,6 +805,7 @@ enum sock_flags { SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ + SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 4ef3aed31fb7..f22d3f7162f8 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -3,6 +3,7 @@ #define __ASM_GENERIC_SOCKET_H #include +#include /* For setsockopt(2) */ #define SOL_SOCKET 1 @@ -109,10 +110,20 @@ #define SO_TIMESTAMPNS_OLD 35 #define SO_TIMESTAMPING_OLD 37 +#define SO_TIMESTAMP_NEW 63 +#define SO_TIMESTAMPNS_NEW 64 + #if !defined(__KERNEL__) -#define SO_TIMESTAMP SO_TIMESTAMP_OLD -#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +/* on 64-bit and x32, avoid the ?: operator */ +#define SO_TIMESTAMP SO_TIMESTAMP_OLD +#define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#else +#define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) +#define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? 
SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#endif + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #define SCM_TIMESTAMP SO_TIMESTAMP diff --git a/net/core/sock.c b/net/core/sock.c index d5ca8641968f..14b987eab10c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -868,9 +868,16 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: if (valbool) { - if (optname == SO_TIMESTAMP_OLD) + if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) + sock_set_flag(sk, SOCK_TSTAMP_NEW); + else + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + + if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) sock_reset_flag(sk, SOCK_RCVTSTAMPNS); else sock_set_flag(sk, SOCK_RCVTSTAMPNS); @@ -879,6 +886,7 @@ set_rcvbuf: } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); } break; @@ -1245,11 +1253,20 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: - v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMP_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPNS_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ce41b04c0f0..4e9388bf104a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1861,20 +1861,37 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb, static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping *tss) { - struct __kernel_old_timeval tv; + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - sizeof(tss->ts[0]), &tss->ts[0]); + if (new_tstamp) { + struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec}; + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + sizeof(kts), &kts); + } else { + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, + sizeof(tss->ts[0]), &tss->ts[0]); + } } else { - tv.tv_sec = tss->ts[0].tv_sec; - tv.tv_usec = tss->ts[0].tv_nsec / 1000; - - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, - sizeof(tv), &tv); + if (new_tstamp) { + struct __kernel_sock_timeval stv; + + stv.tv_sec = tss->ts[0].tv_sec; + stv.tv_usec = tss->ts[0].tv_nsec / 1000; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(stv), &stv); + } else { + struct __kernel_old_timeval tv; + + tv.tv_sec = tss->ts[0].tv_sec; + tv.tv_usec = tss->ts[0].tv_nsec / 1000; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } } } diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index eeb4639adbe5..65571a6273c3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -348,7 +348,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, } static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, - int optlen) + int optlen, int optname) { int val, valbool; @@ -360,6 +360,9 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, valbool = val ? 
1 : 0; + if (optname == SO_TIMESTAMP_NEW) + sock_set_flag(sk, SOCK_TSTAMP_NEW); + if (valbool) sock_set_flag(sk, SOCK_RCVTSTAMP); else @@ -431,8 +434,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, release_sock(sock->sk); break; case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: lock_sock(sock->sk); - ret = rds_enable_recvtstamp(sock->sk, optval, optlen); + ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); release_sock(sock->sk); break; case SO_RDS_MSG_RXPATH_LATENCY: diff --git a/net/rds/recv.c b/net/rds/recv.c index 435bf2320cd3..6bb6b16ca270 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -550,8 +550,20 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, if ((inc->i_rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_rx_tstamp); - ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, - sizeof(tv), &tv); + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { + ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } else { + struct __kernel_sock_timeval sk_tv; + + sk_tv.tv_sec = tv.tv_sec; + sk_tv.tv_usec = tv.tv_usec; + + ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(sk_tv), &sk_tv); + } + if (ret) goto out; } diff --git a/net/socket.c b/net/socket.c index 9cc281cdb9d9..1de96abd78d3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -705,6 +705,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping tss; int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = @@ -719,15 +720,33 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { - struct __kernel_old_timeval tv; - skb_get_timestamp(skb, &tv); - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, - sizeof(tv), &tv); + if (new_tstamp) { + struct __kernel_sock_timeval tv; + + skb_get_new_timestamp(skb, &tv); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(tv), &tv); + } else { + struct __kernel_old_timeval tv; + + skb_get_timestamp(skb, &tv); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } } else { - struct timespec ts; - skb_get_timestampns(skb, &ts); - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - sizeof(ts), &ts); + if (new_tstamp) { + struct __kernel_timespec ts; + + skb_get_new_timestampns(skb, &ts); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + sizeof(ts), &ts); + } else { + struct timespec ts; + + skb_get_timestampns(skb, &ts); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, + sizeof(ts), &ts); + } } } -- cgit v1.2.3-58-ga151 From b90efd2258749e04e1b3f71ef0d716f2ac2337e0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 7 Feb 2019 14:54:16 -0500 Subject: bpf: only adjust gso_size on bytestream protocols bpf_skb_change_proto and bpf_skb_adjust_room change skb header length. For GSO packets they adjust gso_size to maintain the same MTU. The gso size can only be safely adjusted on bytestream protocols. Commit d02f51cbcf12 ("bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs") excluded SKB_GSO_SCTP. Since then type SKB_GSO_UDP_L4 has been added, whose contents are one gso_size unit per datagram. Also exclude these. Move from a blacklist to a whitelist check to future proof against additional such new GSO types, e.g., for fraglist based GRO. 
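As a rough illustration of the bytestream argument (an editor's sketch with made-up numbers, not text from the patch), given skb and len_diff as in the helpers above: for TCP, gso_size is nothing more than a segment length, so a header-size change can be absorbed by resizing the segments.

	/* Sketch: 20 bytes of new encapsulation on a 1500-byte MTU.
	 * Shrinking gso_size from 1448 to 1428 keeps every emitted
	 * segment within the MTU, and the receiver just sees the same
	 * byte stream cut differently.
	 */
	if (!skb_is_gso_tcp(skb))	/* whitelist: TCPv4/TCPv6 only */
		return -ENOTSUPP;
	skb_decrease_gso_size(skb_shinfo(skb), len_diff);

For SKB_GSO_SCTP (GSO_BY_FRAGS) and SKB_GSO_UDP_L4, by contrast, gso_size delimits records - one padded chunk bundle or one datagram per unit - so resizing it would move message boundaries rather than merely resegment a stream.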
Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 6 ++++++ net/core/filter.c | 12 ++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95d25b010a25..5a7a8b93a5ab 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4212,6 +4212,12 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } +static inline bool skb_is_gso_tcp(const struct sk_buff *skb) +{ + return skb_is_gso(skb) && + skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); +} + static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 7a54dc11ac2d..f7d0004fc160 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2789,8 +2789,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2831,8 +2830,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2957,8 +2955,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2987,8 +2984,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); -- cgit v1.2.3-58-ga151 From 4ea7b0cf0da7494d5c7b8dc328493c50640d0cbc Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 11 Feb 2019 13:02:25 -0800 Subject: net/skbuff: fix up kernel-doc placement There are several skb_* functions where the locked and unlocked functions are confusingly documented. For several of them, the kernel-doc for the unlocked version is placed above the locked version, which to the casual reader makes it seem like the locked version "takes no locks and you must therefore hold required locks before calling it." One can see, for example, that this link claims to document skb_queue_head(), while instead describing __skb_queue_head(). https://www.kernel.org/doc/html/latest/networking/kapi.html#c.skb_queue_head The correct documentation for skb_queue_head() is also included further down the page. This diff was tested via: $ scripts/kernel-doc -rst include/linux/skbuff.h net/core/skbuff.c No new warnings were seen, and the output makes a little more sense. Signed-off-by: Brian Norris Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 831846617d07..a41e84f7730c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1889,12 +1889,12 @@ static inline void __skb_queue_before(struct sk_buff_head *list, * * A buffer cannot be placed on two lists at the same time. */ -void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail @@ -1906,12 +1906,12 @@ static inline void __skb_queue_head(struct sk_buff_head *list, * * A buffer cannot be placed on two lists at the same time. */ -void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with @@ -1938,7 +1938,6 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ -struct sk_buff *skb_dequeue(struct sk_buff_head *list); static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); @@ -1946,6 +1945,7 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) __skb_unlink(skb, list); return skb; } +struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue @@ -1955,7 +1955,6 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. */ -struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); @@ -1963,6 +1962,7 @@ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) __skb_unlink(skb, list); return skb; } +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) @@ -2653,13 +2653,13 @@ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. 
*/ -void skb_queue_purge(struct sk_buff_head *list); static inline void __skb_queue_purge(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb(skb); } +void skb_queue_purge(struct sk_buff_head *list); unsigned int skb_rbtree_purge(struct rb_root *root); @@ -3028,7 +3028,7 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) } /** - * skb_put_padto - increase size and pad an skbuff up to a minimal size + * __skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * @free_on_error: free buffer on error -- cgit v1.2.3-58-ga151 From d5be7f632bad0f489879eed0ff4b99bd7fe0b74c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 15 Feb 2019 12:15:47 -0500 Subject: net: validate untrusted gso packets without csum offload Syzkaller again found a path to a kernel crash through bad gso input, by building an excessively large packet to cause an skb field to wrap. If VIRTIO_NET_HDR_F_NEEDS_CSUM was set this would have been dropped in skb_partial_csum_set. GSO packets that do not set checksum offload are suspicious and rare. Most callers of virtio_net_hdr_to_skb already pass them to skb_probe_transport_header. Move that test forward, change it to detect parse failure and drop packets on failure as those clearly are not one of the legitimate VIRTIO_NET_HDR_GSO types. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Fixes: f43798c27684 ("tun: Allow GSO using virtio_net_hdr") Reported-by: syzbot Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- include/linux/virtio_net.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95d25b010a25..4c1c82a5678c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2434,7 +2434,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); - else + else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); } diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index cb462f9ab7dd..71f2394abbf7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -57,6 +57,15 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; + } else { + /* gso packets without NEEDS_CSUM do not set transport_offset. + * probe and drop if does not match one of the above types. + */ + if (gso_type) { + skb_probe_transport_header(skb, -1); + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { -- cgit v1.2.3-58-ga151 From d2aa125d629080c4f3e31f23b7f612ef6b8492ac Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 21 Feb 2019 12:39:57 +0000 Subject: net: Don't set transport offset to invalid value If the socket was created with socket(AF_PACKET, SOCK_RAW, 0), skb->protocol will be unset, __skb_flow_dissect() will fail, and skb_probe_transport_header() will fall back to the offset_hint, making the resulting skb_transport_offset incorrect. If, however, there is no transport header in the packet, transport_header shouldn't be set to an arbitrary value.
Fix it by leaving the transport offset unset if it couldn't be found, to be explicit rather than to fill it with some wrong value. It changes the behavior, but if some code relied on the old behavior, it would be broken anyway, as the old one is incorrect. Signed-off-by: Maxim Mikityanskiy Signed-off-by: David S. Miller --- drivers/net/tap.c | 4 ++-- drivers/net/tun.c | 4 ++-- drivers/net/xen-netback/netback.c | 15 ++++++++++++--- include/linux/skbuff.h | 5 +---- include/linux/virtio_net.h | 2 +- net/packet/af_packet.c | 6 +++--- 6 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index c0b52e48f0e6..2ea9b4976f4a 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -712,7 +712,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, goto err_kfree; } - skb_probe_transport_header(skb, ETH_HLEN); + skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if ((skb->protocol == htons(ETH_P_8021Q) || @@ -1187,7 +1187,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; - skb_probe_transport_header(skb, ETH_HLEN); + skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fed298c0cb39..80bff1b4ec17 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1929,7 +1929,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } skb_reset_network_header(skb); - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); if (skb_xdp) { struct bpf_prog *xdp_prog; @@ -2482,7 +2482,7 @@ build: skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); if (skb_xdp) { err = do_xdp_generic(xdp_prog, skb); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 80aae3a32c2a..c801a832851c 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1169,15 +1169,24 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) continue; } - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); /* If the packet is GSO then we will have just set up the * transport header offset in checksum_setup so it's now * straightforward to calculate gso_segs. */ if (skb_is_gso(skb)) { - int mss = skb_shinfo(skb)->gso_size; - int hdrlen = skb_transport_header(skb) - + int mss, hdrlen; + + /* GSO implies having the L4 header. 
*/ + WARN_ON_ONCE(!skb_transport_header_was_set(skb)); + if (unlikely(!skb_transport_header_was_set(skb))) { + kfree_skb(skb); + continue; + } + + mss = skb_shinfo(skb)->gso_size; + hdrlen = skb_transport_header(skb) - skb_mac_header(skb) + tcp_hdrlen(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2069fb90a559..27beb549ffbe 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2429,8 +2429,7 @@ static inline void skb_pop_mac_header(struct sk_buff *skb) skb->mac_header = skb->network_header; } -static inline void skb_probe_transport_header(struct sk_buff *skb, - const int offset_hint) +static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; @@ -2439,8 +2438,6 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); - else if (offset_hint >= 0) - skb_set_transport_header(skb, offset_hint); } static inline void skb_mac_header_rebuild(struct sk_buff *skb) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 71f2394abbf7..6728bf581e98 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -62,7 +62,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, * probe and drop if does not match one of the above types. */ if (gso_type) { - skb_probe_transport_header(skb, -1); + skb_probe_transport_header(skb); if (!skb_transport_header_was_set(skb)) return -EINVAL; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1cd1d83a4be0..6afd6369d19e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1970,7 +1970,7 @@ retry: if (unlikely(extra_len == 4)) skb->no_fcs = 1; - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); dev_queue_xmit(skb); rcu_read_unlock(); @@ -2519,7 +2519,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, len = ((to_write > len_max) ? len_max : to_write); } - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); return tp_len; } @@ -2925,7 +2925,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) virtio_net_hdr_set_proto(skb, &vnet_hdr); } - skb_probe_transport_header(skb, reserve); + skb_probe_transport_header(skb); if (unlikely(extra_len == 4)) skb->no_fcs = 1; -- cgit v1.2.3-58-ga151 From 4c3024debf62de4c6ac6d3cb4c0063be21d4f652 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 6 Mar 2019 14:35:15 -0500 Subject: bpf: only test gso type on gso packets BPF can adjust gso only for tcp bytestreams. Fail on other gso types. But only on gso packets. It does not touch this field if !gso_size. 
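Spelled out (an editor's sketch, not part of the patch), the guard in the four bpf_skb_* helpers changes from

	if (!skb_is_gso_tcp(skb))	/* also failed plain, non-GSO skbs */
		return -ENOTSUPP;

to

	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
		return -ENOTSUPP;	/* non-GSO packets pass through */

because skb_is_gso_tcp() no longer embeds the skb_is_gso() test; the gso type bits are only meaningful - and now only examined - when the skb is actually GSO.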
Fixes: b90efd225874 ("bpf: only adjust gso_size on bytestream protocols") Signed-off-by: Willem de Bruijn Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 4 ++-- net/core/filter.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27beb549ffbe..f32f32407dc4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4232,10 +4232,10 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } +/* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { - return skb_is_gso(skb) && - skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); + return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 5ceba98069d4..f274620945ff 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2804,7 +2804,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2845,7 +2845,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2970,7 +2970,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2999,7 +2999,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); -- cgit v1.2.3-58-ga151 From 161e613755e93c45cc47e75ab046f0f8de9e6d49 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 5 Mar 2019 11:35:54 -0300 Subject: net: add missing documentation in linux/skbuff.h This patch adds missing documentation for some inline functions in linux/skbuff.h. The patch is incomplete and a lot more can be added, just wondering if it's of interest to the netdev developers. Also fixed some whitespace. Signed-off-by: Pedro Tammela Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27beb549ffbe..730b333be591 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -327,26 +327,49 @@ struct skb_frag_struct { #endif }; +/** + * skb_frag_size - Returns the size of a skb fragment + * @frag: skb fragment + */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->size; } +/** + * skb_frag_size_set - Sets the size of a skb fragment + * @frag: skb fragment + * @size: size of fragment + */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->size = size; } +/** + * skb_frag_size_add - Incrementes the size of a skb fragment by %delta + * @frag: skb fragment + * @delta: value to add + */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->size += delta; } +/** + * skb_frag_size_sub - Decrements the size of a skb fragment by %delta + * @frag: skb fragment + * @delta: value to subtract + */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->size -= delta; } +/** + * skb_frag_must_loop - Test if %p is a high memory page + * @p: fragment's page + */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) @@ -590,7 +613,7 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif -/** +/** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list @@ -648,7 +671,7 @@ typedef unsigned char *sk_buff_data_t; * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB - * @napi_id: id of the NAPI struct this skb came from + * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol @@ -883,7 +906,10 @@ struct sk_buff { #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 -/* Returns true if the skb was allocated from PFMEMALLOC reserves */ +/** + * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves + * @skb: buffer + */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); @@ -905,7 +931,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb) */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { - /* If refdst was not refcounted, check we still are in a + /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && @@ -952,6 +978,10 @@ static inline bool skb_dst_is_noref(const struct sk_buff *skb) return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } +/** + * skb_rtable - Returns the skb &rtable + * @skb: buffer + */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); @@ -966,6 +996,10 @@ static inline bool skb_pkt_type_ok(u32 ptype) return ptype <= PACKET_OTHERHOST; } +/** + * skb_napi_id - Returns the skb's NAPI id + * @skb: buffer + */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -975,7 +1009,12 @@ static inline unsigned int skb_napi_id(const struct sk_buff *skb) #endif } -/* decrement the reference count and return true if we can free the skb */ +/** + * skb_unref - decrement the skb's reference count + 
* @skb: buffer + * + * Returns true if we can free the skb. + */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) @@ -1005,6 +1044,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); + +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @priority: allocation mask + * + * This function is a convenient wrapper around __alloc_skb(). + */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { @@ -1047,6 +1094,13 @@ static inline bool skb_fclone_busy(const struct sock *sk, fclones->skb2.sk == sk; } +/** + * alloc_skb_fclone - allocate a network buffer from fclone cache + * @size: size to allocate + * @priority: allocation mask + * + * This function is a convenient wrapper around __alloc_skb(). + */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { -- cgit v1.2.3-58-ga151
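As a closing illustration of a few helpers whose kernel-doc the last patch adds, a minimal sketch (an editor's example under in-kernel assumptions; the function name is made up):

	#include <linux/skbuff.h>
	#include <linux/string.h>

	/* Sketch: allocate a linear skb and zero-fill some payload. */
	static struct sk_buff *sketch_alloc(unsigned int len)
	{
		/* alloc_skb() is the documented convenience wrapper
		 * around __alloc_skb() */
		struct sk_buff *skb = alloc_skb(len, GFP_KERNEL);

		if (!skb)
			return NULL;

		/* skb_pfmemalloc(): did the allocation dip into the
		 * PFMEMALLOC emergency reserves? */
		if (skb_pfmemalloc(skb))
			pr_debug("skb came from emergency reserves\n");

		memset(skb_put(skb, len), 0, len);	/* extend the data area */
		return skb;
	}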