Diffstat:

 Documentation/networking/netlink_mmap.txt   | 339
 include/linux/netfilter/nfnetlink.h         |  11
 include/linux/netlink.h                     |  11
 include/linux/skbuff.h                      |   6
 include/net/netfilter/nf_conntrack.h        |   2
 include/net/netfilter/nf_conntrack_expect.h |   4
 include/uapi/linux/netlink.h                |  32
 include/uapi/linux/netlink_diag.h           |  10
 net/Kconfig                                 |   9
 net/core/skbuff.c                           |  30
 net/ipv4/inet_diag.c                        |   6
 net/ipv4/udp_diag.c                         |   4
 net/netfilter/nf_conntrack_core.c           |   8
 net/netfilter/nf_conntrack_expect.c         |   8
 net/netfilter/nfnetlink.c                   |  20
 net/netfilter/nfnetlink_log.c               |  12
 net/netfilter/nfnetlink_queue_core.c        |   3
 net/netlink/af_netlink.c                    | 836
 net/netlink/af_netlink.h                    |  20
 net/netlink/diag.c                          |  32
 net/sched/cls_flow.c                        |   2
 21 files changed, 1331 insertions(+), 74 deletions(-)
diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
new file mode 100644
index 000000000000..1c2dab409625
--- /dev/null
+++ b/Documentation/networking/netlink_mmap.txt
@@ -0,0 +1,339 @@
+This file documents how to use memory mapped I/O with netlink.
+
+Author: Patrick McHardy <kaber@trash.net>
+
+Overview
+--------
+
+Memory mapped netlink I/O can be used to increase throughput and decrease
+overhead of unicast receive and transmit operations. Some netlink subsystems
+require high throughput; these are mainly the netfilter subsystems
+nfnetlink_queue and nfnetlink_log, but it can also help speed up large
+dump operations of e.g. the routing database.
+
+Memory mapped netlink I/O uses two circular ring buffers for RX and TX which
+are mapped into the process's address space.
+
+The RX ring is used by the kernel to directly construct netlink messages into
+user-space memory without copying them as done with regular socket I/O.
+Additionally, as long as the ring contains messages, no recvmsg() or poll()
+syscalls have to be issued by user-space to get more messages.
+
+The TX ring is used to process messages directly from user-space memory; the
+kernel processes all messages contained in the ring using a single sendmsg()
+call.
+
+Usage overview
+--------------
+
+In order to use memory mapped netlink I/O, user-space needs three main changes:
+
+- ring setup
+- conversion of the RX path to get messages from the ring instead of recvmsg()
+- conversion of the TX path to construct messages into the ring
+
+Ring setup is done using setsockopt() to provide the ring parameters to the
+kernel, then a call to mmap() to map the ring into the process's address space:
+
+- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
+- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
+- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
+
+Usage of either ring is optional, but even if only the RX ring is used the
+mapping still needs to be writable in order to update the frame status after
+processing.
+
+Conversion of the reception path involves calling poll() on the file
+descriptor; once the socket is readable, the frames from the ring are
+processed in order until no more messages are available, as indicated by
+a status word in the frame header.
+
+On the kernel side, in order to make use of memory mapped I/O on receive, the
+originating netlink subsystem needs to support memory mapped I/O, otherwise
+it will use an allocated socket buffer as usual and the contents will be
+copied to the ring on transmission, nullifying most of the performance gains.
+Dumps of kernel databases automatically support memory mapped I/O.
+
+Conversion of the transmit path involves changing message construction to
+use memory from the TX ring instead of (usually) a buffer declared on the
+stack and setting up the frame header appropriately. Optionally poll() can
+be used to wait for free frames in the TX ring.
+
+Structures and definitions for using memory mapped I/O are contained in
+<linux/netlink.h>.
+
+RX and TX rings
+---------------
+
+Each ring contains a number of contiguous memory blocks, containing frames of
+fixed size dependent on the parameters used for ring setup.
+
+Ring:	[ block 0 ]
+		[ frame 0 ]
+		[ frame 1 ]
+	[ block 1 ]
+		[ frame 2 ]
+		[ frame 3 ]
+	...
+	[ block n ]
+		[ frame 2 * n ]
+		[ frame 2 * n + 1 ]
+
+The blocks are only visible to the kernel; from the point of view of user-space,
+the ring just contains the frames in a contiguous memory zone.
+
+The ring parameters used for setting up the ring are defined as follows:
+
+struct nl_mmap_req {
+	unsigned int	nm_block_size;
+	unsigned int	nm_block_nr;
+	unsigned int	nm_frame_size;
+	unsigned int	nm_frame_nr;
+};
+
+Frames are grouped into blocks, where each block is a contiguous region of
+memory and holds nm_block_size / nm_frame_size frames. The total number of
+frames in the ring is nm_frame_nr. The following invariants hold:
+
+- frames_per_block = nm_block_size / nm_frame_size
+
+- nm_frame_nr = frames_per_block * nm_block_nr
+
+Some parameters are constrained, specifically:
+
+- nm_block_size must be a multiple of the architecture's memory page size.
+  The getpagesize() function can be used to get the page size.
+
+- nm_frame_size must be equal to or larger than NL_MMAP_HDRLEN, IOW a frame
+  must be able to hold at least the frame header
+
+- nm_frame_size must be smaller than or equal to nm_block_size
+
+- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
+
+- nm_frame_nr must equal the actual number of frames as specified above.
+
+When the kernel can't allocate physically contiguous memory for a ring block,
+it will fall back to use physically discontiguous memory. This might affect
+performance negatively; in order to avoid this, the nm_block_size parameter
+should be chosen to be as small as possible for the required frame size and
+the number of blocks should be increased instead.
+
+Ring frames
+-----------
+
+Each frame contains a frame header, consisting of a synchronization word and
+some meta-data, and the message itself.
+
+Frame:	[ header message ]
+
+The frame header is defined as follows:
+
+struct nl_mmap_hdr {
+	unsigned int	nm_status;
+	unsigned int	nm_len;
+	__u32		nm_group;
+	/* credentials */
+	__u32		nm_pid;
+	__u32		nm_uid;
+	__u32		nm_gid;
+};
+
+- nm_status is used for synchronizing processing between the kernel and user-
+  space and specifies ownership of the frame as well as the operation to perform
+
+- nm_len contains the length of the message contained in the data area
+
+- nm_group specifies the destination multicast group of the message
+
+- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
+  process. These values correspond to the data available using SOCK_PASSCRED in
+  the SCM_CREDENTIALS cmsg.
+
+The possible values in the status word are:
+
+- NL_MMAP_STATUS_UNUSED:
+	RX ring:	frame belongs to the kernel and contains no message
+			for user-space. Appropriate action is to invoke poll()
+			to wait for new messages.
+
+	TX ring:	frame belongs to user-space and can be used for
+			message construction.
+
+- NL_MMAP_STATUS_RESERVED:
+	RX ring only:	frame is currently used by the kernel for message
+			construction and contains no valid message yet.
+			Appropriate action is to invoke poll() to wait for
+			new messages.
+
+- NL_MMAP_STATUS_VALID:
+	RX ring:	frame contains a valid message. Appropriate action is
+			to process the message and release the frame back to
+			the kernel by setting the status to
+			NL_MMAP_STATUS_UNUSED or queue the frame by setting the
+			status to NL_MMAP_STATUS_SKIP.
+
+	TX ring:	the frame contains a valid message from user-space to
+			be processed by the kernel. After completing processing
+			the kernel will release the frame back to user-space by
+			setting the status to NL_MMAP_STATUS_UNUSED.
+
+- NL_MMAP_STATUS_COPY:
+	RX ring only:	a message is ready to be processed but could not be
+			stored in the ring, either because it exceeded the
+			frame size or because the originating subsystem does
+			not support memory mapped I/O. Appropriate action is
+			to invoke recvmsg() to receive the message and release
+			the frame back to the kernel by setting the status to
+			NL_MMAP_STATUS_UNUSED.
+
+- NL_MMAP_STATUS_SKIP:
+	RX ring only:	user-space queued the message for later processing, but
+			processed some messages following it in the ring. The
+			kernel should skip this frame when looking for unused
+			frames.
+
+The data area of a frame begins at an offset of NL_MMAP_HDRLEN relative to the
+frame header.
+
+TX limitations
+--------------
+
+Kernel processing usually involves validation of the message received by
+user-space, then processing its contents. The kernel must ensure that
+userspace is not able to modify the message contents after they have been
+validated. In order to do so, the message is copied from the ring frame
+to an allocated buffer if either of these conditions is false:
+
+- only a single mapping of the ring exists
+- the file descriptor is not shared between processes
+
+This means that for threaded programs, the kernel will fall back to copying.
+
+Example
+-------
+
+Ring setup:
+
+	unsigned int block_size = 16 * getpagesize();
+	struct nl_mmap_req req = {
+		.nm_block_size		= block_size,
+		.nm_block_nr		= 64,
+		.nm_frame_size		= 16384,
+		.nm_frame_nr		= 64 * block_size / 16384,
+	};
+	unsigned int ring_size;
+	void *rx_ring, *tx_ring;
+
+	/* Configure ring parameters */
+	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
+		exit(1);
+	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
+		exit(1);
+
+	/* Calculate size of each individual ring */
+	ring_size = req.nm_block_nr * req.nm_block_size;
+
+	/* Map RX/TX rings. The TX ring is located after the RX ring */
+	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
+		       MAP_SHARED, fd, 0);
+	if ((long)rx_ring == -1L)
+		exit(1);
+	tx_ring = rx_ring + ring_size;
+
+Message reception:
+
+This example assumes some ring parameters of the ring setup are available.
+
+	unsigned int frame_offset = 0;
+	struct nl_mmap_hdr *hdr;
+	struct nlmsghdr *nlh;
+	unsigned char buf[16384];
+	ssize_t len;
+
+	while (1) {
+		struct pollfd pfds[1];
+
+		pfds[0].fd = fd;
+		pfds[0].events = POLLIN | POLLERR;
+		pfds[0].revents = 0;
+
+		if (poll(pfds, 1, -1) < 0 && errno != EINTR)
+			exit(1);
+
+		/* Check for errors. Error handling omitted */
+		if (pfds[0].revents & POLLERR)
+			<handle error>
+
+		/* If no new messages, poll again */
+		if (!(pfds[0].revents & POLLIN))
+			continue;
+
+		/* Process all frames */
+		while (1) {
+			/* Get next frame header */
+			hdr = rx_ring + frame_offset;
+
+			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
+				/* Regular memory mapped frame */
+				nlh = (void *)hdr + NL_MMAP_HDRLEN;
+				len = hdr->nm_len;
+
+				/* Release empty message immediately. May happen
+				 * on error during message construction.
+				 */
+				if (len == 0)
+					goto release;
+			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
+				/* Frame queued to socket receive queue */
+				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
+				if (len <= 0)
+					break;
+				nlh = (struct nlmsghdr *)buf;
+			} else
+				/* No more messages to process, continue polling */
+				break;
+
+			process_msg(nlh);
+release:
+			/* Release frame back to the kernel */
+			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+
+			/* Advance frame offset to next frame */
+			frame_offset = (frame_offset + frame_size) % ring_size;
+		}
+	}
+
+Message transmission:
+
+This example assumes some ring parameters of the ring setup are available.
+A single message is constructed and transmitted; to send multiple messages
+at once, they would be constructed in consecutive frames before a final call
+to sendto().
+
+	unsigned int frame_offset = 0;
+	struct nl_mmap_hdr *hdr;
+	struct nlmsghdr *nlh;
+	struct sockaddr_nl addr = {
+		.nl_family	= AF_NETLINK,
+	};
+
+	hdr = tx_ring + frame_offset;
+	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
+		/* No frame available. Use poll() to avoid. */
+		exit(1);
+
+	nlh = (void *)hdr + NL_MMAP_HDRLEN;
+
+	/* Build message */
+	build_message(nlh);
+
+	/* Fill frame header: length and status need to be set */
+	hdr->nm_len	= nlh->nlmsg_len;
+	hdr->nm_status	= NL_MMAP_STATUS_VALID;
+
+	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
+		exit(1);
+
+	/* Advance frame offset to next frame */
+	frame_offset = (frame_offset + frame_size) % ring_size;
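The transmission example in the new documentation exits when no TX frame is
free; the text notes that poll() can be used to wait instead, but shows no
code for it. A minimal sketch of that pattern, assuming the fd, tx_ring,
frame_offset and frame_size variables set up as in the examples above:

	struct pollfd pfd = {
		.fd	= fd,
		.events	= POLLOUT,
	};
	struct nl_mmap_hdr *hdr;

	hdr = tx_ring + frame_offset;
	while (hdr->nm_status != NL_MMAP_STATUS_UNUSED) {
		/* The kernel releases processed TX frames back to
		 * NL_MMAP_STATUS_UNUSED; POLLOUT signals a free frame. */
		if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
			exit(1);
	}
	/* hdr is now free for message construction as shown above */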
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index ecbb8e495912..cadb7402d7a7 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -29,10 +29,13 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
 extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
 
 extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
-extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group,
-			  int echo, gfp_t flags);
-extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
-extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags);
+extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+					   u32 dst_portid, gfp_t gfp_mask);
+extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
+			  unsigned int group, int echo, gfp_t flags);
+extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
+extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net,
+			     u32 portid, int flags);
 
 extern void nfnl_lock(__u8 subsys_id);
 extern void nfnl_unlock(__u8 subsys_id);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e0f746b7b95c..6358da5eeee8 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -15,11 +15,18 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 	return (struct nlmsghdr *)skb->data;
 }
 
+enum netlink_skb_flags {
+	NETLINK_SKB_MMAPED	= 0x1,	/* Packet data is mmaped */
+	NETLINK_SKB_TX		= 0x2,	/* Packet was sent by userspace */
+	NETLINK_SKB_DELIVERED	= 0x4,	/* Packet was delivered */
+};
+
 struct netlink_skb_parms {
 	struct scm_creds	creds;		/* Skb credentials	*/
 	__u32			portid;
 	__u32			dst_group;
-	struct sock		*ssk;
+	__u32			flags;
+	struct sock		*sk;
 };
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
@@ -57,6 +64,8 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
+extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
+					 u32 dst_portid, gfp_t gfp_mask);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 			     __u32 group, gfp_t allocation);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f5bed7b31954..2e0ced1af3b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -651,6 +651,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
 }
 
+extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
+static inline struct sk_buff *alloc_skb_head(gfp_t priority)
+{
+	return __alloc_skb_head(priority, -1);
+}
+
 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
 extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 extern struct sk_buff *skb_clone(struct sk_buff *skb,
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index caca0c4d6b4b..644d9c223d24 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -184,7 +184,7 @@ extern int nf_conntrack_hash_check_insert(struct nf_conn *ct);
 extern void nf_ct_delete_from_lists(struct nf_conn *ct);
 extern void nf_ct_dying_timeout(struct nf_conn *ct);
 
-extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
+extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report);
 
 extern bool nf_ct_get_tuplepr(const struct sk_buff *skb,
 			      unsigned int nhoff, u_int16_t l3num,
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index cbbae7621e22..3f3aecbc8632 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -88,7 +88,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
 		       const struct nf_conntrack_tuple *tuple);
 
 void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
-				u32 pid, int report);
+				u32 portid, int report);
 static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 {
 	nf_ct_unlink_expect_report(exp, 0, 0);
@@ -106,7 +106,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t,
 		       u_int8_t, const __be16 *, const __be16 *);
 void nf_ct_expect_put(struct nf_conntrack_expect *exp);
 int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
-				u32 pid, int report);
+				u32 portid, int report);
 static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect)
 {
 	return nf_ct_expect_related_report(expect, 0, 0);
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 32a354f67ba4..1a85940f8ab7 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI__LINUX_NETLINK_H
 #define _UAPI__LINUX_NETLINK_H
 
+#include <linux/kernel.h>
 #include <linux/socket.h> /* for __kernel_sa_family_t */
 #include <linux/types.h>
 
@@ -105,11 +106,42 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO		3
 #define NETLINK_BROADCAST_ERROR	4
 #define NETLINK_NO_ENOBUFS	5
+#define NETLINK_RX_RING		6
+#define NETLINK_TX_RING		7
 
 struct nl_pktinfo {
 	__u32	group;
 };
 
+struct nl_mmap_req {
+	unsigned int	nm_block_size;
+	unsigned int	nm_block_nr;
+	unsigned int	nm_frame_size;
+	unsigned int	nm_frame_nr;
+};
+
+struct nl_mmap_hdr {
+	unsigned int	nm_status;
+	unsigned int	nm_len;
+	__u32		nm_group;
+	/* credentials */
+	__u32		nm_pid;
+	__u32		nm_uid;
+	__u32		nm_gid;
+};
+
+enum nl_mmap_status {
+	NL_MMAP_STATUS_UNUSED,
+	NL_MMAP_STATUS_RESERVED,
+	NL_MMAP_STATUS_VALID,
+	NL_MMAP_STATUS_COPY,
+	NL_MMAP_STATUS_SKIP,
+};
+
+#define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
+#define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
+#define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
 enum {
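The new UAPI above is everything user-space needs to compute valid ring
parameters. A small sketch deriving a nl_mmap_req from a desired maximum
message size, following the invariants documented in netlink_mmap.txt
(max_msg_size and the block count are illustrative choices, not values from
the patch):

	#include <linux/netlink.h>
	#include <unistd.h>

	unsigned int max_msg_size = 8192;	/* assumed payload bound */
	unsigned int block_size = 16 * getpagesize();
	struct nl_mmap_req req;

	/* frame = aligned header + message; nm_frame_size <= nm_block_size */
	req.nm_frame_size = NL_MMAP_MSG_ALIGN(NL_MMAP_HDRLEN + max_msg_size);
	req.nm_block_size = block_size;
	req.nm_block_nr   = 64;
	/* invariant: nm_frame_nr = frames_per_block * nm_block_nr */
	req.nm_frame_nr   = (block_size / req.nm_frame_size) * req.nm_block_nr;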
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index 88009a31cd06..4e31db4eea41 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -25,9 +25,18 @@ struct netlink_diag_msg {
 	__u32	ndiag_cookie[2];
 };
 
+struct netlink_diag_ring {
+	__u32	ndr_block_size;
+	__u32	ndr_block_nr;
+	__u32	ndr_frame_size;
+	__u32	ndr_frame_nr;
+};
+
 enum {
 	NETLINK_DIAG_MEMINFO,
 	NETLINK_DIAG_GROUPS,
+	NETLINK_DIAG_RX_RING,
+	NETLINK_DIAG_TX_RING,
 
 	__NETLINK_DIAG_MAX,
 };
@@ -38,5 +47,6 @@ enum {
 
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
+#define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
 
 #endif
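The diag additions can be queried from user-space through the standard
sock_diag interface; setting NDIAG_SHOW_RING_CFG in ndiag_show requests the
new ring attributes. A hedged sketch of such a request, assuming
<linux/sock_diag.h> and <linux/netlink_diag.h> and with error handling
trimmed:

	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct netlink_diag_req req;
	} msg = {
		.nlh = {
			.nlmsg_len	= sizeof(msg),
			.nlmsg_type	= SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags	= NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family	= AF_NETLINK,
			.sdiag_protocol	= NDIAG_PROTO_ALL,
			.ndiag_show	= NDIAG_SHOW_RING_CFG,
		},
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);

	if (fd < 0 || sendto(fd, &msg, sizeof(msg), 0,
			     (struct sockaddr *)&addr, sizeof(addr)) < 0)
		exit(1);
	/* Replies carry NETLINK_DIAG_RX_RING/NETLINK_DIAG_TX_RING attributes,
	 * each holding a struct netlink_diag_ring as filled in by
	 * sk_diag_put_ring() in net/netlink/diag.c below. */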
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..1a2221630e6a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,15 @@ menuconfig NET
 
 if NET
 
+config NETLINK_MMAP
+	bool "Netlink: mmaped IO"
+	help
+	  This option enables support for memory mapped netlink IO. This
+	  reduces overhead by avoiding copying data between kernel- and
+	  userspace.
+
+	  If unsure, say N.
+
 config WANT_COMPAT_NETLINK_MESSAGES
 	bool
 	help
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a92d9e7d10f7..898cf5c566f9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -179,6 +179,33 @@ out:
  *
  */
 
+struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
+{
+	struct sk_buff *skb;
+
+	/* Get the HEAD */
+	skb = kmem_cache_alloc_node(skbuff_head_cache,
+				    gfp_mask & ~__GFP_DMA, node);
+	if (!skb)
+		goto out;
+
+	/*
+	 * Only clear those fields we need to clear, not those that we will
+	 * actually initialise below. Hence, don't put any more fields after
+	 * the tail pointer in struct sk_buff!
+	 */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->data = NULL;
+	skb->truesize = sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+out:
+	return skb;
+}
+
 /**
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
@@ -584,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb)
 static void skb_release_all(struct sk_buff *skb)
 {
 	skb_release_head_state(skb);
-	skb_release_data(skb);
+	if (likely(skb->data))
+		skb_release_data(skb);
 }
 
 /**
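__alloc_skb_head() returns an sk_buff with skb->data left NULL, so the mmap
code can later point it at a ring frame instead of a kmalloc'd buffer (the
skb_release_all() change above keeps freeing such heads safe). A kernel-side
sketch of the pattern, mirroring what netlink_ring_setup_skb() does further
down in this patch; frame_data and frame_size stand in for a real ring frame:

	struct sk_buff *skb;

	skb = alloc_skb_head(GFP_KERNEL);	/* head only, no data buffer */
	if (skb == NULL)
		return -ENOBUFS;

	/* attach caller-owned memory, e.g. the data area of a ring frame */
	skb->head = frame_data;
	skb->data = frame_data;
	skb_reset_tail_pointer(skb);
	skb->end  = skb->tail + frame_size;
	skb->len  = 0;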
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8620408af574..5f648751fce2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -324,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	}
 
 	err = sk_diag_fill(sk, rep, req,
-			   sk_user_ns(NETLINK_CB(in_skb).ssk),
+			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
@@ -630,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 		return 0;
 
 	return inet_csk_diag_fill(sk, skb, r,
-				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
+				  sk_user_ns(NETLINK_CB(cb->skb).sk),
 				  NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -805,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 			}
 
 			err = inet_diag_fill_req(skb, sk, req,
-					       sk_user_ns(NETLINK_CB(cb->skb).ssk),
+					       sk_user_ns(NETLINK_CB(cb->skb).sk),
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, cb->nlh);
 			if (err < 0) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 369a781851ad..7927db0a9279 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 		return 0;
 
 	return inet_sk_diag_fill(sk, NULL, skb, req,
-			sk_user_ns(NETLINK_CB(cb->skb).ssk),
+			sk_user_ns(NETLINK_CB(cb->skb).sk),
 			NETLINK_CB(cb->skb).portid,
 			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -71,7 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 		goto out;
 
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
-			   sk_user_ns(NETLINK_CB(in_skb).ssk),
+			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 007e8c43d19a..54ddc2f8e7c9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1260,7 +1260,7 @@ void nf_ct_iterate_cleanup(struct net *net,
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
 
 struct __nf_ct_flush_report {
-	u32 pid;
+	u32 portid;
 	int report;
 };
 
@@ -1275,7 +1275,7 @@ static int kill_report(struct nf_conn *i, void *data)
 
 	/* If we fail to deliver the event, death_by_timeout() will retry */
 	if (nf_conntrack_event_report(IPCT_DESTROY, i,
-				      fr->pid, fr->report) < 0)
+				      fr->portid, fr->report) < 0)
 		return 1;
 
 	/* Avoid the delivery of the destroy event in death_by_timeout(). */
@@ -1298,10 +1298,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size)
 }
 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
 
-void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
+void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
 {
 	struct __nf_ct_flush_report fr = {
-		.pid    = pid,
+		.portid = portid,
 		.report = report,
 	};
 	nf_ct_iterate_cleanup(net, kill_report, &fr);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8c10e3db3d9b..0adfdcc68bae 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -40,7 +40,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
 /* nf_conntrack_expect helper functions */
 void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
-				u32 pid, int report)
+				u32 portid, int report)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct net *net = nf_ct_exp_net(exp);
@@ -54,7 +54,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 	hlist_del(&exp->lnode);
 	master_help->expecting[exp->class]--;
 
-	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
+	nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
 	nf_ct_expect_put(exp);
 
 	NF_CT_STAT_INC(net, expect_delete);
@@ -412,7 +412,7 @@ out:
 }
 
 int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
-				u32 pid, int report)
+				u32 portid, int report)
 {
 	int ret;
 
@@ -425,7 +425,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
 	if (ret < 0)
 		goto out;
 	spin_unlock_bh(&nf_conntrack_lock);
-	nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
+	nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
 	return ret;
 out:
 	spin_unlock_bh(&nf_conntrack_lock);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index bc4c499adb13..572d87dc116f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -112,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
 
-int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+				    u32 dst_portid, gfp_t gfp_mask)
+{
+	return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
+
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags)
 {
-	return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags);
+	return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_send);
 
-int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
 {
-	return netlink_set_err(net->nfnl, pid, group, error);
+	return netlink_set_err(net->nfnl, portid, group, error);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_set_err);
 
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
+		      int flags)
 {
-	return netlink_unicast(net->nfnl, skb, pid, flags);
+	return netlink_unicast(net->nfnl, skb, portid, flags);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_unicast);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 1a0be2af1dd8..d4199eb9b338 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -318,7 +318,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
 }
 
 static struct sk_buff *
-nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
+nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size)
 {
 	struct sk_buff *skb;
 	unsigned int n;
@@ -327,13 +327,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
 	 * message.  WARNING: has to be <= 128k due to slab restrictions */
 
 	n = max(inst_size, pkt_size);
-	skb = alloc_skb(n, GFP_ATOMIC);
+	skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC);
 	if (!skb) {
 		if (n > pkt_size) {
 			/* try to allocate only as much as we need for current
 			 * packet */
 
-			skb = alloc_skb(pkt_size, GFP_ATOMIC);
+			skb = nfnetlink_alloc_skb(&init_net, pkt_size,
+						  peer_portid, GFP_ATOMIC);
 			if (!skb)
 				pr_err("nfnetlink_log: can't even alloc %u bytes\n",
 				       pkt_size);
@@ -696,7 +697,8 @@ nfulnl_log_packet(u_int8_t pf,
 	}
 
 	if (!inst->skb) {
-		inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size);
+		inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz,
+					     size);
 		if (!inst->skb)
 			goto alloc_failure;
 	}
@@ -824,7 +826,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 
 			inst = instance_create(net, group_num,
 					       NETLINK_CB(skb).portid,
-					       sk_user_ns(NETLINK_CB(skb).ssk));
+					       sk_user_ns(NETLINK_CB(skb).sk));
 			if (IS_ERR(inst)) {
 				ret = PTR_ERR(inst);
 				goto out;
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 5e280b3e154f..ef3cdb4bfeea 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -339,7 +339,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	if (queue->flags & NFQA_CFG_F_CONNTRACK)
 		ct = nfqnl_ct_get(entskb, &size, &ctinfo);
 
-	skb = alloc_skb(size, GFP_ATOMIC);
+	skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid,
+				  GFP_ATOMIC);
 	if (!skb)
 		return NULL;
 
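As the two nfnetlink conversions above show, a subsystem opts in to mmaped RX
simply by allocating its message skb against the destination socket:
netlink_alloc_skb() (added below in af_netlink.c) constructs the skb inside
the peer's RX ring when one is mapped and transparently falls back to a
regular allocation otherwise. A sketch of the calling pattern, where size and
peer_portid stand for whatever the subsystem already tracks:

	struct sk_buff *skb;

	/* allocates from the peer's RX ring when it has one mapped,
	 * falls back to a regular skb otherwise */
	skb = nfnetlink_alloc_skb(&init_net, size, peer_portid, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;
	/* fill the message as usual; deliver via nfnetlink_unicast() */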
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ce2e0064e7f6..2a3e9ba814c4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -3,6 +3,7 @@
 *
 * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+* 				Patrick McHardy <kaber@trash.net>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
@@ -55,6 +56,8 @@
 #include <linux/types.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -68,6 +71,10 @@ struct listeners {
 	unsigned long		masks[0];
 };
 
+/* state bits */
+#define NETLINK_CONGESTED	0x0
+
+/* flags */
 #define NETLINK_KERNEL_SOCKET	0x1
 #define NETLINK_RECV_PKTINFO	0x2
 #define NETLINK_BROADCAST_SEND_ERROR	0x4
@@ -84,6 +91,7 @@ EXPORT_SYMBOL_GPL(nl_table);
 static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 
 static int netlink_dump(struct sock *sk);
+static void netlink_skb_destructor(struct sk_buff *skb);
 
 DEFINE_RWLOCK(nl_table_lock);
 EXPORT_SYMBOL_GPL(nl_table_lock);
@@ -103,6 +111,599 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
 	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
 }
 
+static void netlink_overrun(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
+		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
+			sk->sk_err = ENOBUFS;
+			sk->sk_error_report(sk);
+		}
+	}
+	atomic_inc(&sk->sk_drops);
+}
+
+static void netlink_rcv_wake(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (skb_queue_empty(&sk->sk_receive_queue))
+		clear_bit(NETLINK_CONGESTED, &nlk->state);
+	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
+		wake_up_interruptible(&nlk->wait);
+}
+
+#ifdef CONFIG_NETLINK_MMAP
+static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
+{
+	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
+}
+
+static bool netlink_rx_is_mmaped(struct sock *sk)
+{
+	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
+}
+
+static bool netlink_tx_is_mmaped(struct sock *sk)
+{
+	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
+}
+
+static __pure struct page *pgvec_to_page(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	else
+		return virt_to_page(addr);
+}
+
+static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
+{
+	unsigned int i;
+
+	for (i = 0; i < len; i++) {
+		if (pg_vec[i] != NULL) {
+			if (is_vmalloc_addr(pg_vec[i]))
+				vfree(pg_vec[i]);
+			else
+				free_pages((unsigned long)pg_vec[i], order);
+		}
+	}
+	kfree(pg_vec);
+}
+
+static void *alloc_one_pg_vec_page(unsigned long order)
+{
+	void *buffer;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
+			  __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (void *)__get_free_pages(gfp_flags, order);
+	if (buffer != NULL)
+		return buffer;
+
+	buffer = vzalloc((1 << order) * PAGE_SIZE);
+	if (buffer != NULL)
+		return buffer;
+
+	gfp_flags &= ~__GFP_NORETRY;
+	return (void *)__get_free_pages(gfp_flags, order);
+}
+
+static void **alloc_pg_vec(struct netlink_sock *nlk,
+			   struct nl_mmap_req *req, unsigned int order)
+{
+	unsigned int block_nr = req->nm_block_nr;
+	unsigned int i;
+	void **pg_vec, *ptr;
+
+	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
+	if (pg_vec == NULL)
+		return NULL;
+
+	for (i = 0; i < block_nr; i++) {
+		pg_vec[i] = ptr = alloc_one_pg_vec_page(order);
+		if (pg_vec[i] == NULL)
+			goto err1;
+	}
+
+	return pg_vec;
+err1:
+	free_pg_vec(pg_vec, order, block_nr);
+	return NULL;
+}
+
+static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
+			    bool closing, bool tx_ring)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct sk_buff_head *queue;
+	void **pg_vec = NULL;
+	unsigned int order = 0;
+	int err;
+
+	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	if (!closing) {
+		if (atomic_read(&nlk->mapped))
+			return -EBUSY;
+		if (atomic_read(&ring->pending))
+			return -EBUSY;
+	}
+
+	if (req->nm_block_nr) {
+		if (ring->pg_vec != NULL)
+			return -EBUSY;
+
+		if ((int)req->nm_block_size <= 0)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE))
+			return -EINVAL;
+		if (req->nm_frame_size < NL_MMAP_HDRLEN)
+			return -EINVAL;
+		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
+			return -EINVAL;
+
+		ring->frames_per_block = req->nm_block_size /
+					 req->nm_frame_size;
+		if (ring->frames_per_block == 0)
+			return -EINVAL;
+		if (ring->frames_per_block * req->nm_block_nr !=
+		    req->nm_frame_nr)
+			return -EINVAL;
+
+		order = get_order(req->nm_block_size);
+		pg_vec = alloc_pg_vec(nlk, req, order);
+		if (pg_vec == NULL)
+			return -ENOMEM;
+	} else {
+		if (req->nm_frame_nr)
+			return -EINVAL;
+	}
+
+	err = -EBUSY;
+	mutex_lock(&nlk->pg_vec_lock);
+	if (closing || atomic_read(&nlk->mapped) == 0) {
+		err = 0;
+		spin_lock_bh(&queue->lock);
+
+		ring->frame_max		= req->nm_frame_nr - 1;
+		ring->head		= 0;
+		ring->frame_size	= req->nm_frame_size;
+		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
+
+		swap(ring->pg_vec_len, req->nm_block_nr);
+		swap(ring->pg_vec_order, order);
+		swap(ring->pg_vec, pg_vec);
+
+		__skb_queue_purge(queue);
+		spin_unlock_bh(&queue->lock);
+
+		WARN_ON(atomic_read(&nlk->mapped));
+	}
+	mutex_unlock(&nlk->pg_vec_lock);
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+	return err;
+}
+
+static void netlink_mm_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_inc(&nlk_sk(sk)->mapped);
+}
+
+static void netlink_mm_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct socket *sock = file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk)
+		atomic_dec(&nlk_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct netlink_mmap_ops = {
+	.open	= netlink_mm_open,
+	.close	= netlink_mm_close,
+};
+
+static int netlink_mmap(struct file *file, struct socket *sock,
+			struct vm_area_struct *vma)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	unsigned long start, size, expected;
+	unsigned int i;
+	int err = -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	expected = 0;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
+	}
+
+	if (expected == 0)
+		goto out;
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected)
+		goto out;
+
+	start = vma->vm_start;
+	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+		if (ring->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < ring->pg_vec_len; i++) {
+			struct page *page;
+			void *kaddr = ring->pg_vec[i];
+			unsigned int pg_num;
+
+			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
+				page = pgvec_to_page(kaddr);
+				err = vm_insert_page(vma, start, page);
+				if (err < 0)
+					goto out;
+				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	atomic_inc(&nlk->mapped);
+	vma->vm_ops = &netlink_mmap_ops;
+	err = 0;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;
+}
+
+static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr)
+{
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	struct page *p_start, *p_end;
+
+	/* First page is flushed through netlink_{get,set}_status */
+	p_start = pgvec_to_page(hdr + PAGE_SIZE);
+	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + hdr->nm_len - 1);
+	while (p_start <= p_end) {
+		flush_dcache_page(p_start);
+		p_start++;
+	}
+#endif
+}
+
+static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
+{
+	smp_rmb();
+	flush_dcache_page(pgvec_to_page(hdr));
+	return hdr->nm_status;
+}
+
+static void netlink_set_status(struct nl_mmap_hdr *hdr,
+			       enum nl_mmap_status status)
+{
+	hdr->nm_status = status;
+	flush_dcache_page(pgvec_to_page(hdr));
+	smp_wmb();
+}
+
+static struct nl_mmap_hdr *
+__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
+{
+	unsigned int pg_vec_pos, frame_off;
+
+	pg_vec_pos = pos / ring->frames_per_block;
+	frame_off  = pos % ring->frames_per_block;
+
+	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
+}
+
+static struct nl_mmap_hdr *
+netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
+		     enum nl_mmap_status status)
+{
+	struct nl_mmap_hdr *hdr;
+
+	hdr = __netlink_lookup_frame(ring, pos);
+	if (netlink_get_status(hdr) != status)
+		return NULL;
+
+	return hdr;
+}
+
+static struct nl_mmap_hdr *
+netlink_current_frame(const struct netlink_ring *ring,
+		      enum nl_mmap_status status)
+{
+	return netlink_lookup_frame(ring, ring->head, status);
+}
+
+static struct nl_mmap_hdr *
+netlink_previous_frame(const struct netlink_ring *ring,
+		       enum nl_mmap_status status)
+{
+	unsigned int prev;
+
+	prev = ring->head ? ring->head - 1 : ring->frame_max;
+	return netlink_lookup_frame(ring, prev, status);
+}
+
+static void netlink_increment_head(struct netlink_ring *ring)
+{
+	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
+}
+
+static void netlink_forward_ring(struct netlink_ring *ring)
+{
+	unsigned int head = ring->head, pos = head;
+	const struct nl_mmap_hdr *hdr;
+
+	do {
+		hdr = __netlink_lookup_frame(ring, pos);
+		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
+			break;
+		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
+			break;
+		netlink_increment_head(ring);
+	} while (ring->head != head);
+}
+
+static bool netlink_dump_space(struct netlink_sock *nlk)
+{
+	struct netlink_ring *ring = &nlk->rx_ring;
+	struct nl_mmap_hdr *hdr;
+	unsigned int n;
+
+	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+	if (hdr == NULL)
+		return false;
+
+	n = ring->head + ring->frame_max / 2;
+	if (n > ring->frame_max)
+		n -= ring->frame_max;
+
+	hdr = __netlink_lookup_frame(ring, n);
+
+	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
+}
+
+static unsigned int netlink_poll(struct file *file, struct socket *sock,
+				 poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int mask;
+	int err;
+
+	if (nlk->rx_ring.pg_vec != NULL) {
+		/* Memory mapped sockets don't call recvmsg(), so flow control
+		 * for dumps is performed here. A dump is allowed to continue
+		 * if at least half the ring is unused.
+		 */
+		while (nlk->cb != NULL && netlink_dump_space(nlk)) {
+			err = netlink_dump(sk);
+			if (err < 0) {
+				sk->sk_err = err;
+				sk->sk_error_report(sk);
+				break;
+			}
+		}
+		netlink_rcv_wake(sk);
+	}
+
+	mask = datagram_poll(file, sock, wait);
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	if (nlk->rx_ring.pg_vec) {
+		netlink_forward_ring(&nlk->rx_ring);
+		if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
+			mask |= POLLIN | POLLRDNORM;
+	}
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	if (nlk->tx_ring.pg_vec) {
+		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
+			mask |= POLLOUT | POLLWRNORM;
+	}
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+
+	return mask;
+}
+
+static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
+{
+	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
+}
+
+static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
+				   struct netlink_ring *ring,
+				   struct nl_mmap_hdr *hdr)
+{
+	unsigned int size;
+	void *data;
+
+	size = ring->frame_size - NL_MMAP_HDRLEN;
+	data = (void *)hdr + NL_MMAP_HDRLEN;
+
+	skb->head	= data;
+	skb->data	= data;
+	skb_reset_tail_pointer(skb);
+	skb->end	= skb->tail + size;
+	skb->len	= 0;
+
+	skb->destructor	= netlink_skb_destructor;
+	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
+	NETLINK_CB(skb).sk = sk;
+}
+
+static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
+				u32 dst_portid, u32 dst_group,
+				struct sock_iocb *siocb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring;
+	struct nl_mmap_hdr *hdr;
+	struct sk_buff *skb;
+	unsigned int maxlen;
+	bool excl = true;
+	int err = 0, len = 0;
+
+	/* Netlink messages are validated by the receiver before processing.
+	 * In order to avoid userspace changing the contents of the message
+	 * after validation, the socket and the ring may only be used by a
+	 * single process, otherwise we fall back to copying.
+	 */
+	if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 ||
+	    atomic_read(&nlk->mapped) > 1)
+		excl = false;
+
+	mutex_lock(&nlk->pg_vec_lock);
+
+	ring   = &nlk->tx_ring;
+	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+
+	do {
+		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
+		if (hdr == NULL) {
+			if (!(msg->msg_flags & MSG_DONTWAIT) &&
+			    atomic_read(&nlk->tx_ring.pending))
+				schedule();
+			continue;
+		}
+		if (hdr->nm_len > maxlen) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		netlink_frame_flush_dcache(hdr);
+
+		if (likely(dst_portid == 0 && dst_group == 0 && excl)) {
+			skb = alloc_skb_head(GFP_KERNEL);
+			if (skb == NULL) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			sock_hold(sk);
+			netlink_ring_setup_skb(skb, sk, ring, hdr);
+			NETLINK_CB(skb).flags |= NETLINK_SKB_TX;
+			__skb_put(skb, hdr->nm_len);
+			netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+			atomic_inc(&ring->pending);
+		} else {
+			skb = alloc_skb(hdr->nm_len, GFP_KERNEL);
+			if (skb == NULL) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			__skb_put(skb, hdr->nm_len);
+			memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len);
+			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+		}
+
+		netlink_increment_head(ring);
+
+		NETLINK_CB(skb).portid	  = nlk->portid;
+		NETLINK_CB(skb).dst_group = dst_group;
+		NETLINK_CB(skb).creds	  = siocb->scm->creds;
+
+		err = security_netlink_send(sk, skb);
+		if (err) {
+			kfree_skb(skb);
+			goto out;
+		}
+
+		if (unlikely(dst_group)) {
+			atomic_inc(&skb->users);
+			netlink_broadcast(sk, skb, dst_portid, dst_group,
+					  GFP_KERNEL);
+		}
+		err = netlink_unicast(sk, skb, dst_portid,
+				      msg->msg_flags & MSG_DONTWAIT);
+		if (err < 0)
+			goto out;
+		len += err;
+
+	} while (hdr != NULL ||
+		 (!(msg->msg_flags & MSG_DONTWAIT) &&
+		  atomic_read(&nlk->tx_ring.pending)));
+
+	if (len > 0)
+		err = len;
+out:
+	mutex_unlock(&nlk->pg_vec_lock);
+	return err;
+}
+
+static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct nl_mmap_hdr *hdr;
+
+	hdr = netlink_mmap_hdr(skb);
+	hdr->nm_len	= skb->len;
+	hdr->nm_group	= NETLINK_CB(skb).dst_group;
+	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
+	hdr->nm_uid	= NETLINK_CB(skb).creds.uid;
+	hdr->nm_gid	= NETLINK_CB(skb).creds.gid;
+	netlink_frame_flush_dcache(hdr);
+	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+
+	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
+	kfree_skb(skb);
+}
+
+static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ring *ring = &nlk->rx_ring;
+	struct nl_mmap_hdr *hdr;
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+	if (hdr == NULL) {
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		kfree_skb(skb);
+		netlink_overrun(sk);
+		return;
+	}
+	netlink_increment_head(ring);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+	hdr->nm_len	= skb->len;
+	hdr->nm_group	= NETLINK_CB(skb).dst_group;
+	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
+	hdr->nm_uid	= NETLINK_CB(skb).creds.uid;
+	hdr->nm_gid	= NETLINK_CB(skb).creds.gid;
+	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
+}
+
+#else /* CONFIG_NETLINK_MMAP */
+#define netlink_skb_is_mmaped(skb)	false
+#define netlink_rx_is_mmaped(sk)	false
+#define netlink_tx_is_mmaped(sk)	false
+#define netlink_mmap			sock_no_mmap
+#define netlink_poll			datagram_poll
+#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb)	0
+#endif /* CONFIG_NETLINK_MMAP */
+
 static void netlink_destroy_callback(struct netlink_callback *cb)
 {
 	kfree_skb(cb->skb);
@@ -115,6 +716,53 @@ static void netlink_consume_callback(struct netlink_callback *cb)
 	kfree(cb);
 }
 
+static void netlink_skb_destructor(struct sk_buff *skb)
+{
+#ifdef CONFIG_NETLINK_MMAP
+	struct nl_mmap_hdr *hdr;
+	struct netlink_ring *ring;
+	struct sock *sk;
+
+	/* If a packet from the kernel to userspace was freed because of an
+	 * error without being delivered to userspace, the kernel must reset
+	 * the status. In the direction userspace to kernel, the status is
+	 * always reset here after the packet was processed and freed.
+	 */
+	if (netlink_skb_is_mmaped(skb)) {
+		hdr = netlink_mmap_hdr(skb);
+		sk = NETLINK_CB(skb).sk;
+
+		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+			ring = &nlk_sk(sk)->tx_ring;
+		} else {
+			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+				hdr->nm_len = 0;
+				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+			}
+			ring = &nlk_sk(sk)->rx_ring;
+		}
+
+		WARN_ON(atomic_read(&ring->pending) == 0);
+		atomic_dec(&ring->pending);
+		sock_put(sk);
+
+		skb->data = NULL;
+	}
+#endif
+	if (skb->sk != NULL)
+		sock_rfree(skb);
+}
+
+static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+	WARN_ON(skb->sk != NULL);
+	skb->sk = sk;
+	skb->destructor = netlink_skb_destructor;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+}
+
 static void netlink_sock_destruct(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -128,6 +776,18 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
+#ifdef CONFIG_NETLINK_MMAP
+	if (1) {
+		struct nl_mmap_req req;
+
+		memset(&req, 0, sizeof(req));
+		if (nlk->rx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, false);
+		memset(&req, 0, sizeof(req));
+		if (nlk->tx_ring.pg_vec)
+			netlink_set_ring(sk, &req, true, true);
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -391,6 +1051,9 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
+#ifdef CONFIG_NETLINK_MMAP
+	mutex_init(&nlk->pg_vec_lock);
+#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -722,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
 	return 0;
 }
 
-static void netlink_overrun(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
-		if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
-			sk->sk_err = ENOBUFS;
-			sk->sk_error_report(sk);
-		}
-	}
-	atomic_inc(&sk->sk_drops);
-}
-
 static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
 {
 	struct sock *sock;
@@ -787,8 +1437,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 
 	nlk = nlk_sk(sk);
 
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    test_bit(0, &nlk->state)) {
+	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+	    !netlink_skb_is_mmaped(skb)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -802,7 +1453,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		add_wait_queue(&nlk->wait, &wait);
 
 		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-		     test_bit(0, &nlk->state)) &&
+		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
 		    !sock_flag(sk, SOCK_DEAD))
 			*timeo = schedule_timeout(*timeo);
 
@@ -816,7 +1467,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		}
 		return 1;
 	}
-	skb_set_owner_r(skb, sk);
+	netlink_skb_set_owner_r(skb, sk);
 	return 0;
 }
 
@@ -824,7 +1475,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
 	int len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+#ifdef CONFIG_NETLINK_MMAP
+	if (netlink_skb_is_mmaped(skb))
+		netlink_queue_mmaped_skb(sk, skb);
+	else if (netlink_rx_is_mmaped(sk))
+		netlink_ring_set_copied(sk, skb);
+	else
+#endif /* CONFIG_NETLINK_MMAP */
+		skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk, len);
 	return len;
 }
@@ -847,7 +1505,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 {
 	int delta;
 
-	skb_orphan(skb);
+	WARN_ON(skb->sk != NULL);
+	if (netlink_skb_is_mmaped(skb))
+		return skb;
 
 	delta = skb->end - skb->tail;
 	if (delta * 2 < skb->truesize)
@@ -867,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	return skb;
 }
 
-static void netlink_rcv_wake(struct sock *sk)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-
-	if (skb_queue_empty(&sk->sk_receive_queue))
-		clear_bit(0, &nlk->state);
-	if (!test_bit(0, &nlk->state))
-		wake_up_interruptible(&nlk->wait);
-}
-
 static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
 				  struct sock *ssk)
 {
@@ -886,8 +1536,8 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
 	ret = -ECONNREFUSED;
 	if (nlk->netlink_rcv != NULL) {
 		ret = skb->len;
-		skb_set_owner_r(skb, sk);
-		NETLINK_CB(skb).ssk = ssk;
+		netlink_skb_set_owner_r(skb, sk);
+		NETLINK_CB(skb).sk = ssk;
 		nlk->netlink_rcv(skb);
 		consume_skb(skb);
 	} else {
@@ -933,6 +1583,69 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
+struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
+				  u32 dst_portid, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NETLINK_MMAP
+	struct sock *sk = NULL;
+	struct sk_buff *skb;
+	struct netlink_ring *ring;
+	struct nl_mmap_hdr *hdr;
+	unsigned int maxlen;
+
+	sk = netlink_getsockbyportid(ssk, dst_portid);
+	if (IS_ERR(sk))
+		goto out;
+
+	ring = &nlk_sk(sk)->rx_ring;
+	/* fast-path without atomic ops for common case: non-mmaped receiver */
+	if (ring->pg_vec == NULL)
+		goto out_put;
+
+	skb = alloc_skb_head(gfp_mask);
+	if (skb == NULL)
+		goto err1;
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	/* check again under lock */
+	if (ring->pg_vec == NULL)
+		goto out_free;
+
+	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+	if (maxlen < size)
+		goto out_free;
+
+	netlink_forward_ring(ring);
+	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+	if (hdr == NULL)
+		goto err2;
+	netlink_ring_setup_skb(skb, sk, ring, hdr);
+	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+	atomic_inc(&ring->pending);
+	netlink_increment_head(ring);
+
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+	return skb;
+
+err2:
+	kfree_skb(skb);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+	netlink_overrun(sk);
+err1:
+	sock_put(sk);
+	return NULL;
+
+out_free:
+	kfree_skb(skb);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+out_put:
+	sock_put(sk);
+out:
+#endif
+	return alloc_skb(size, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
 	int res = 0;
@@ -957,8 +1670,8 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
 	struct netlink_sock *nlk = nlk_sk(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-	    !test_bit(0, &nlk->state)) {
-		skb_set_owner_r(skb, sk);
+	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
+		netlink_skb_set_owner_r(skb, sk);
 		__netlink_sendskb(sk, skb);
 		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
 	}
@@ -1193,7 +1906,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optlen >= sizeof(int) &&
+	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
+	    optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -1235,13 +1949,32 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	case NETLINK_NO_ENOBUFS:
 		if (val) {
 			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
-			clear_bit(0, &nlk->state);
+			clear_bit(NETLINK_CONGESTED, &nlk->state);
 			wake_up_interruptible(&nlk->wait);
 		} else {
 			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
 		}
 		err = 0;
 		break;
+#ifdef CONFIG_NETLINK_MMAP
+	case NETLINK_RX_RING:
+	case NETLINK_TX_RING: {
+		struct nl_mmap_req req;
+
+		/* Rings might consume more memory than queue limits, require
+		 * CAP_NET_ADMIN.
+		 */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+		err = netlink_set_ring(sk, &req, false,
+				       optname == NETLINK_TX_RING);
+		break;
+	}
+#endif /* CONFIG_NETLINK_MMAP */
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -1352,6 +2085,13 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 	}
 
+	if (netlink_tx_is_mmaped(sk) &&
+	    msg->msg_iov->iov_base == NULL) {
+		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
+					   siocb);
+		goto out;
+	}
+
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -1684,9 +2424,13 @@ static int netlink_dump(struct sock *sk)
 
 	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
 
-	skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL);
+	if (!netlink_rx_is_mmaped(sk) &&
+	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		goto errout_skb;
+	skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL);
 	if (!skb)
 		goto errout_skb;
+	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
 
@@ -1741,6 +2485,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	if (cb == NULL)
 		return -ENOBUFS;
 
+	/* Memory mapped dump requests need to be copied to avoid looping
+	 * on the pending state in netlink_mmap_sendmsg() while the CB holds
+	 * a reference to the skb.
+	 */
+	if (netlink_skb_is_mmaped(skb)) {
+		skb = skb_copy(skb, GFP_KERNEL);
+		if (skb == NULL) {
+			kfree(cb);
+			return -ENOBUFS;
+		}
+	} else
+		atomic_inc(&skb->users);
+
 	cb->dump = control->dump;
 	cb->done = control->done;
 	cb->nlh = nlh;
@@ -1801,7 +2558,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
 	if (err)
 		payload += nlmsg_len(nlh);
 
-	skb = nlmsg_new(payload, GFP_KERNEL);
+	skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
+				NETLINK_CB(in_skb).portid, GFP_KERNEL);
 	if (!skb) {
 		struct sock *sk;
 
@@ -2067,7 +2825,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		datagram_poll,
+	.poll =		netlink_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -2075,7 +2833,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
 };
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index d9acb2a1d855..ed8522265f4e 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -6,6 +6,20 @@
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
+struct netlink_ring {
+	void			**pg_vec;
+	unsigned int		head;
+	unsigned int		frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+
+	unsigned int		pg_vec_order;
+	unsigned int		pg_vec_pages;
+	unsigned int		pg_vec_len;
+
+	atomic_t		pending;
+};
+
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
 	struct sock		sk;
@@ -24,6 +38,12 @@ struct netlink_sock {
 	void			(*netlink_rcv)(struct sk_buff *skb);
 	void			(*netlink_bind)(int group);
 	struct module		*module;
+#ifdef CONFIG_NETLINK_MMAP
+	struct mutex		pg_vec_lock;
+	struct netlink_ring	rx_ring;
+	struct netlink_ring	tx_ring;
+	atomic_t		mapped;
+#endif /* CONFIG_NETLINK_MMAP */
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 5ffb1d1cf402..4e4aa471cd05 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -7,6 +7,34 @@
 
 #include "af_netlink.h"
 
+static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
+			    struct sk_buff *nlskb)
+{
+	struct netlink_diag_ring ndr;
+
+	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
+	ndr.ndr_block_nr   = ring->pg_vec_len;
+	ndr.ndr_frame_size = ring->frame_size;
+	ndr.ndr_frame_nr   = ring->frame_max + 1;
+
+	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
+}
+
+static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int ret;
+
+	mutex_lock(&nlk->pg_vec_lock);
+	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
+	if (!ret)
+		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
+				       nlskb);
+	mutex_unlock(&nlk->pg_vec_lock);
+
+	return ret;
+}
+
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -51,6 +79,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
+	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
+	    sk_diag_put_rings_cfg(sk, skb))
+		goto out_nlmsg_trim;
+
 	return nlmsg_end(skb, nlh);
 
 out_nlmsg_trim:
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index aa36a8c8b33b..7881e2fccbc2 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -393,7 +393,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 			return -EOPNOTSUPP;
 
 		if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
-		    sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns)
+		    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
 			return -EOPNOTSUPP;
 	}