diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-11 14:27:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-11 14:27:06 -0800 |
commit | 70e71ca0af244f48a5dcf56dc435243792e3a495 (patch) | |
tree | f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /include/linux/skbuff.h | |
parent | bae41e45b7400496b9bf0c70c6004419d9987819 (diff) | |
parent | 00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) New offloading infrastructure and example 'rocker' driver for
offloading of switching and routing to hardware.
This work was done by a large group of dedicated individuals, not
limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend,
Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu
2) Start making the networking operate on IOV iterators instead of
modifying iov objects in-situ during transfers. Thanks to Al Viro
and Herbert Xu.
3) A set of new netlink interfaces for the TIPC stack, from Richard
Alpe.
4) Remove unnecessary looping during ipv6 routing lookups, from Martin
KaFai Lau.
5) Add PAUSE frame generation support to gianfar driver, from Matei
Pavaluca.
6) Allow for larger reordering levels in TCP, which are easily
achievable in the real world right now, from Eric Dumazet.
7) Add a variable of napi_schedule that doesn't need to disable cpu
interrupts, from Eric Dumazet.
8) Use a doubly linked list to optimize neigh_parms_release(), from
Nicolas Dichtel.
9) Various enhancements to the kernel BPF verifier, and allow eBPF
programs to actually be attached to sockets. From Alexei
Starovoitov.
10) Support TSO/LSO in sunvnet driver, from David L Stevens.
11) Allow controlling ECN usage via routing metrics, from Florian
Westphal.
12) Remote checksum offload, from Tom Herbert.
13) Add split-header receive, BQL, and xmit_more support to amd-xgbe
driver, from Thomas Lendacky.
14) Add MPLS support to openvswitch, from Simon Horman.
15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen
Klassert.
16) Do gro flushes on a per-device basis using a timer, from Eric
Dumazet. This tries to resolve the conflicting goals between the
desired handling of bulk vs. RPC-like traffic.
17) Allow userspace to ask for the CPU upon what a packet was
received/steered, via SO_INCOMING_CPU. From Eric Dumazet.
18) Limit GSO packets to half the current congestion window, from Eric
Dumazet.
19) Add a generic helper so that all drivers set their RSS keys in a
consistent way, from Eric Dumazet.
20) Add xmit_more support to enic driver, from Govindarajulu
Varadarajan.
21) Add VLAN packet scheduler action, from Jiri Pirko.
22) Support configurable RSS hash functions via ethtool, from Eyal
Perry.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits)
Fix race condition between vxlan_sock_add and vxlan_sock_release
net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header
net/mlx4: Add support for A0 steering
net/mlx4: Refactor QUERY_PORT
net/mlx4_core: Add explicit error message when rule doesn't meet configuration
net/mlx4: Add A0 hybrid steering
net/mlx4: Add mlx4_bitmap zone allocator
net/mlx4: Add a check if there are too many reserved QPs
net/mlx4: Change QP allocation scheme
net/mlx4_core: Use tasklet for user-space CQ completion events
net/mlx4_core: Mask out host side virtualization features for guests
net/mlx4_en: Set csum level for encapsulated packets
be2net: Export tunnel offloads only when a VxLAN tunnel is created
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled
cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call
net: fec: only enable mdio interrupt before phy device link up
net: fec: clear all interrupt events to support i.MX6SX
net: fec: reset fep link status in suspend function
net: sock: fix access via invalid file descriptor
net: introduce helper macro for_each_cmsghdr
...
Diffstat (limited to 'include/linux/skbuff.h')
-rw-r--r-- | include/linux/skbuff.h | 188 |
1 files changed, 114 insertions, 74 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6c8b6f604e76..85ab7d72b54c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -20,6 +20,8 @@ #include <linux/time.h> #include <linux/bug.h> #include <linux/cache.h> +#include <linux/rbtree.h> +#include <linux/socket.h> #include <linux/atomic.h> #include <asm/types.h> @@ -148,6 +150,8 @@ struct net_device; struct scatterlist; struct pipe_inode_info; +struct iov_iter; +struct napi_struct; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -341,7 +345,6 @@ enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ - SKB_FCLONE_FREE, /* this companion fclone skb is available */ }; enum { @@ -370,8 +373,7 @@ enum { SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_MPLS = 1 << 12, - + SKB_GSO_TUNNEL_REMCSUM = 1 << 12, }; #if BITS_PER_LONG > 32 @@ -440,6 +442,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left + * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @sk: Socket we are owned by * @dev: Device we arrived on/are leaving by * @cb: Control buffer. Free for use by every layer. Put private vars here @@ -504,15 +507,19 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, */ struct sk_buff { - /* These two members must be first. */ - struct sk_buff *next; - struct sk_buff *prev; - union { - ktime_t tstamp; - struct skb_mstamp skb_mstamp; + struct { + /* These two members must be first. */ + struct sk_buff *next; + struct sk_buff *prev; + + union { + ktime_t tstamp; + struct skb_mstamp skb_mstamp; + }; + }; + struct rb_node rbnode; /* used in netem & tcp stack */ }; - struct sock *sk; struct net_device *dev; @@ -597,7 +604,8 @@ struct sk_buff { #endif __u8 ipvs_property:1; __u8 inner_protocol_type:1; - /* 4 or 6 bit hole */ + __u8 remcsum_offload:1; + /* 3 or 5 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -666,6 +674,7 @@ struct sk_buff { #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 +#define SKB_ALLOC_NAPI 0x04 /* Returns true if the skb was allocated from PFMEMALLOC reserves */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) @@ -710,9 +719,6 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) skb->_skb_refdst = (unsigned long)dst; } -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, - bool force); - /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer @@ -725,24 +731,8 @@ void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { - __skb_dst_set_noref(skb, dst, false); -} - -/** - * skb_dst_set_noref_force - sets skb dst, without taking reference - * @skb: buffer - * @dst: dst entry - * - * Sets skb dst, assuming a reference was not taken on dst. - * No reference is taken and no dst_release will be called. While for - * cached dsts deferred reclaim is a basic feature, for entries that are - * not cached it is caller's job to guarantee that last dst_release for - * provided dst happens when nobody uses it, eg. after a RCU grace period. - */ -static inline void skb_dst_set_noref_force(struct sk_buff *skb, - struct dst_entry *dst) -{ - __skb_dst_set_noref(skb, dst, true); + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** @@ -810,7 +800,7 @@ static inline bool skb_fclone_busy(const struct sock *sk, fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && - fclones->skb2.fclone == SKB_FCLONE_CLONE && + atomic_read(&fclones->fclone_ref) > 1 && fclones->skb2.sk == sk; } @@ -2176,47 +2166,61 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } +void *napi_alloc_frag(unsigned int fragsz); +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask); +static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, + unsigned int length) +{ + return __napi_alloc_skb(napi, length, GFP_ATOMIC); +} + /** - * __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data - * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX - * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used - * @order: size of the allocation + * __dev_alloc_pages - allocate page for network Rx + * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx + * @order: size of the allocation * - * Allocate a new page. + * Allocate a new page. * - * %NULL is returned if there is no free memory. + * %NULL is returned if there is no free memory. */ -static inline struct page *__skb_alloc_pages(gfp_t gfp_mask, - struct sk_buff *skb, - unsigned int order) -{ - struct page *page; - - gfp_mask |= __GFP_COLD; - - if (!(gfp_mask & __GFP_NOMEMALLOC)) - gfp_mask |= __GFP_MEMALLOC; +static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, + unsigned int order) +{ + /* This piece of code contains several assumptions. + * 1. This is for device Rx, therefor a cold page is preferred. + * 2. The expectation is the user wants a compound page. + * 3. If requesting a order 0 page it will not be compound + * due to the check to see if order has a value in prep_new_page + * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to + * code in gfp_to_alloc_flags that should be enforcing this. + */ + gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); - if (skb && page && page->pfmemalloc) - skb->pfmemalloc = true; + return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); +} - return page; +static inline struct page *dev_alloc_pages(unsigned int order) +{ + return __dev_alloc_pages(GFP_ATOMIC, order); } /** - * __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data - * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX - * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used + * __dev_alloc_page - allocate a page for network Rx + * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * - * Allocate a new page. + * Allocate a new page. * - * %NULL is returned if there is no free memory. + * %NULL is returned if there is no free memory. */ -static inline struct page *__skb_alloc_page(gfp_t gfp_mask, - struct sk_buff *skb) +static inline struct page *__dev_alloc_page(gfp_t gfp_mask) +{ + return __dev_alloc_pages(gfp_mask, 0); +} + +static inline struct page *dev_alloc_page(void) { - return __skb_alloc_pages(gfp_mask, skb, 0); + return __dev_alloc_page(GFP_ATOMIC); } /** @@ -2448,7 +2452,6 @@ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ - static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; @@ -2457,6 +2460,29 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) return skb_pad(skb, len - size); } +/** + * skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + unsigned int size = skb->len; + + if (unlikely(size < len)) { + len -= size; + if (skb_pad(skb, len)) + return -ENOMEM; + __skb_put(skb, len); + } + return 0; +} + static inline int skb_add_data(struct sk_buff *skb, char __user *from, int copy) { @@ -2629,18 +2655,18 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); -int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, - struct iovec *to, int size); -int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, - struct iovec *iov); -int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - const struct iovec *from, int from_offset, - int len); -int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, - int offset, size_t count); -int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, - const struct iovec *to, int to_offset, - int size); +int skb_copy_datagram_iter(const struct sk_buff *from, int offset, + struct iov_iter *to, int size); +static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + struct msghdr *msg, int size) +{ + return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); +} +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, + struct msghdr *msg); +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); @@ -2661,6 +2687,20 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); +int skb_ensure_writable(struct sk_buff *skb, int write_len); +int skb_vlan_pop(struct sk_buff *skb); +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); + +static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) +{ + /* XXX: stripping const */ + return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len); +} + +static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) +{ + return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; +} struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); |