From 36cd0e696a832a00247fca522034703566ac8885 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 16 Nov 2020 14:28:06 -0800
Subject: bpf, sockmap: Ensure SO_RCVBUF memory is observed on ingress redirect

Fix sockmap sk_skb programs so that they observe sk_rcvbuf limits. This
allows users to tune SO_RCVBUF and sockmap will honor them.

We can refactor the if (charge) case out in later patches, but keep this
fix to the point.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Suggested-by: Jakub Sitnicki
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/160556568657.73229.8404601585878439060.stgit@john-XPS-13-9370
---
 net/core/skmsg.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 654182ecf87b..fe44280c033e 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
 	struct scatterlist *sge = sk_msg_elem(msg, i);
 	u32 len = sge->length;
 
-	if (charge)
-		sk_mem_uncharge(sk, len);
-	if (!msg->skb)
+	/* When the skb owns the memory we free it from the consume_skb path. */
+	if (!msg->skb) {
+		if (charge)
+			sk_mem_uncharge(sk, len);
 		put_page(sg_page(sge));
+	}
 	memset(sge, 0, sizeof(*sge));
 	return len;
 }
@@ -403,6 +405,9 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	int copied = 0, num_sge;
 	struct sk_msg *msg;
 
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+		return -EAGAIN;
+
 	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
 	if (unlikely(!msg))
 		return -EAGAIN;
@@ -418,7 +423,14 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 		return num_sge;
 	}
 
-	sk_mem_charge(sk, skb->len);
+	/* This will transition ownership of the data from the socket where
+	 * the BPF program was run initiating the redirect to the socket
+	 * we will eventually receive this data on. The data will be released
+	 * from consume_skb() in __tcp_bpf_recvmsg() after it's been copied
+	 * into user buffers.
+	 */
+	skb_set_owner_r(skb, sk);
+
 	copied = skb->len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
--
cgit v1.2.3-58-ga151
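
With this fix in place, receive-buffer tuning behaves the same for sockmap
sockets as for regular TCP sockets. The sketch below is illustrative only
and not part of the patch; it assumes fd is a connected TCP socket that
has already been added to a sockmap:

  /* Illustrative sketch, not from the patch: shrink SO_RCVBUF so the
   * sk_rmem_alloc > sk_rcvbuf check added above provides back-pressure
   * on ingress redirects instead of being ignored.
   */
  #include <stdio.h>
  #include <sys/socket.h>

  static int tune_rcvbuf(int fd, int bytes)
  {
  	socklen_t optlen = sizeof(bytes);

  	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, optlen) < 0) {
  		perror("setsockopt(SO_RCVBUF)");
  		return -1;
  	}
  	/* The kernel doubles the requested value to leave room for
  	 * bookkeeping overhead; read back the effective size. */
  	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, &optlen) < 0) {
  		perror("getsockopt(SO_RCVBUF)");
  		return -1;
  	}
  	printf("effective rcvbuf: %d bytes\n", bytes);
  	return 0;
  }
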
From 70796fb751f1d34cc650e640572a174faf009cd4 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 16 Nov 2020 14:28:26 -0800
Subject: bpf, sockmap: Use truesize with sk_rmem_schedule()

We use skb->len with sk_rmem_schedule(), which is not correct. Instead,
use truesize to align with the socket and TCP stack usage of
sk_rmem_schedule().

Suggested-by: Daniel Borkmann
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/160556570616.73229.17003722112077507863.stgit@john-XPS-13-9370
---
 net/core/skmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index fe44280c033e..d09426ce4af3 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -411,7 +411,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
 	if (unlikely(!msg))
 		return -EAGAIN;
-	if (!sk_rmem_schedule(sk, skb, skb->len)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		kfree(msg);
 		return -EAGAIN;
 	}
--
cgit v1.2.3-58-ga151
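
The distinction matters because truesize reflects what the allocator
actually consumed (payload plus struct and buffer slack), not just payload
bytes. The toy user-space model below uses invented numbers and is not
kernel code; it shows how charging only skb->len admits more packets than
the receive budget can really back:

  /* Toy model, not kernel code: why charging skb->len under-accounts.
   * The sizes are illustrative stand-ins for a typical TCP segment.
   */
  #include <stdio.h>

  struct toy_skb {
  	unsigned int len;      /* payload bytes */
  	unsigned int truesize; /* payload + metadata actually allocated */
  };

  int main(void)
  {
  	struct toy_skb skb = { .len = 1448, .truesize = 2304 };
  	unsigned int rcvbuf = 4096, charged_len = 0, charged_true = 0;
  	int n_len = 0, n_true = 0;

  	/* Admit skbs until the budget is spent: charging len admits
  	 * more packets than the buffer is really backed by. */
  	while (charged_len + skb.len <= rcvbuf) {
  		charged_len += skb.len;
  		n_len++;
  	}
  	while (charged_true + skb.truesize <= rcvbuf) {
  		charged_true += skb.truesize;
  		n_true++;
  	}
  	printf("charging len admits %d skbs, truesize admits %d\n",
  	       n_len, n_true);
  	return 0;
  }
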
From 6fa9201a898983da731fca068bb4b5c941537588 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 16 Nov 2020 14:28:46 -0800
Subject: bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self

If a socket redirects to itself and it is under memory pressure, it is
possible to get the socket stuck so that recv() returns EAGAIN and the
socket can not advance for some time. This happens because when
redirecting an skb to the same socket we received the skb on, we first
check if it is OK to enqueue the skb on the receiving socket by checking
memory limits. But if the skb is itself the object holding the memory
needed to enqueue the skb, we will keep retrying from the kernel side
and always fail with EAGAIN. Then userspace will get a recv() EAGAIN
error if there are no skbs in the psock ingress queue. This will continue
until either some skbs get kfree'd, causing the memory pressure to reduce
far enough that we can enqueue the pending packet, or the socket is
destroyed. In some cases it's possible to get a socket stuck for a
noticeable amount of time if the socket is only receiving skbs from
sk_skb verdict programs. To reproduce, I make the socket memory limits
ridiculously low so sockets are always under memory pressure. More often,
though, under memory pressure it looks like a spurious EAGAIN error on
the user-space side, causing userspace to retry, and typically enough has
moved on the memory side by then that it works.

To fix, skip the memory checks and skb_orphan if we are receiving on the
same sock the skb is already assigned to. For SK_PASS cases this is easy:
it's always the same socket, so we can just omit the orphan/set_owner
pair. For backlog cases we need to check skb->sk and decide if the orphan
and set_owner pair are needed.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/160556572660.73229.12566203819812939627.stgit@john-XPS-13-9370
---
 net/core/skmsg.c | 72 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d09426ce4af3..9aed5a2c7c5b 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,38 +399,38 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+						  struct sk_buff *skb)
 {
-	struct sock *sk = psock->sk;
-	int copied = 0, num_sge;
 	struct sk_msg *msg;
 
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
-		return -EAGAIN;
+		return NULL;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return NULL;
 
 	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
 	if (unlikely(!msg))
-		return -EAGAIN;
-	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		kfree(msg);
-		return -EAGAIN;
-	}
+		return NULL;
 
 	sk_msg_init(msg);
-	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	return msg;
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+					struct sk_psock *psock,
+					struct sock *sk,
+					struct sk_msg *msg)
+{
+	int num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	int copied;
+
 	if (unlikely(num_sge < 0)) {
 		kfree(msg);
 		return num_sge;
 	}
 
-	/* This will transition ownership of the data from the socket where
-	 * the BPF program was run initiating the redirect to the socket
-	 * we will eventually receive this data on. The data will be released
-	 * from consume_skb() in __tcp_bpf_recvmsg() after it's been copied
-	 * into user buffers.
-	 */
-	skb_set_owner_r(skb, sk);
-
 	copied = skb->len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
@@ -442,6 +442,40 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	return copied;
 }
 
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sk;
+	struct sk_msg *msg;
+
+	msg = sk_psock_create_ingress_msg(sk, skb);
+	if (!msg)
+		return -EAGAIN;
+
+	/* This will transition ownership of the data from the socket where
+	 * the BPF program was run initiating the redirect to the socket
+	 * we will eventually receive this data on. The data will be released
+	 * from consume_skb() in __tcp_bpf_recvmsg() after it's been copied
+	 * into user buffers.
+	 */
+	skb_set_owner_r(skb, sk);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	struct sock *sk = psock->sk;
+
+	if (unlikely(!msg))
+		return -EAGAIN;
+	sk_msg_init(msg);
+	return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
@@ -801,7 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		 * retrying later from workqueue.
 		 */
 		if (skb_queue_empty(&psock->ingress_skb)) {
-			err = sk_psock_skb_ingress(psock, skb);
+			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
 			skb_queue_tail(&psock->ingress_skb, skb);
--
cgit v1.2.3-58-ga151
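
For context, the self-redirect case arises naturally with small sockmaps.
The sketch below is a hypothetical sk_skb verdict program; the map name,
program name, and keying scheme are invented for illustration and are not
taken from the patch. With only two sockets in the map, the hash-style key
regularly selects the very socket the skb arrived on:

  // SPDX-License-Identifier: GPL-2.0
  /* Hypothetical sk_skb verdict program: with a tiny sockmap, a
   * load-balancing key can pick the receiving socket itself, which is
   * the self-redirect case the patch above must handle without a
   * spurious EAGAIN.
   */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct {
  	__uint(type, BPF_MAP_TYPE_SOCKMAP);
  	__uint(max_entries, 2);
  	__type(key, __u32);
  	__type(value, __u64);
  } sock_map SEC(".maps");

  SEC("sk_skb/stream_verdict")
  int prog_verdict(struct __sk_buff *skb)
  {
  	/* Invented load-balance key; with two sockets this frequently
  	 * lands on the socket the skb was received on. */
  	__u32 key = skb->remote_port & 1;

  	return bpf_sk_redirect_map(skb, &sock_map, key, BPF_F_INGRESS);
  }

  char _license[] SEC("license") = "GPL";
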
From 2443ca66676d50a4eb3305c236bccd84a9828ce2 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 16 Nov 2020 14:29:08 -0800
Subject: bpf, sockmap: Handle memory acct if skb_verdict prog redirects to self

If the skb_verdict prog knowingly redirects an skb to itself, fix your
BPF program: this is not optimal and an abuse of the API, please use
SK_PASS. That said, there may be cases, such as socket load balancing,
where picking the socket is hash based or otherwise picks, in some rare
cases, the same socket the skb was received on. If this happens we don't
want to confuse userspace by giving them an EAGAIN error if we can avoid
it.

So we need to avoid double accounting in these cases. At the moment, even
if the skb has already been charged against the socket's rcvbuf and
forward alloc, we check it again and do set_owner_r(), causing it to be
orphaned and recharged. For one, this is useless work, but more
importantly we can end up in a case where the skb could be put on the
ingress queue, but because we are under memory pressure we return EAGAIN.
The trouble here is the skb has already been accounted for, so any rcvbuf
checks include the memory associated with the packet already. This rolls
up and can result in unnecessary EAGAIN errors in userspace read() calls.

Fix by doing an unlikely check and skipping the checks if skb->sk == sk.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/160556574804.73229.11328201020039674147.stgit@john-XPS-13-9370
---
 net/core/skmsg.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9aed5a2c7c5b..514bc9f6f8ae 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -442,11 +442,19 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 	return copied;
 }
 
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb);
+
 static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 {
 	struct sock *sk = psock->sk;
 	struct sk_msg *msg;
 
+	/* If we are receiving on the same sock skb->sk is already assigned,
+	 * skip memory accounting and owner transition seeing it already set
+	 * correctly.
+	 */
+	if (unlikely(skb->sk == sk))
+		return sk_psock_skb_ingress_self(psock, skb);
 	msg = sk_psock_create_ingress_msg(sk, skb);
 	if (!msg)
 		return -EAGAIN;
--
cgit v1.2.3-58-ga151
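
From user space, the failure mode fixed here showed up as EAGAIN from
recv() on a socket that actually had data pending in the kernel. A
non-blocking read loop like the sketch below, with invented fd, buffer,
and retry bound, is where applications would have observed the spurious
errors:

  /* Illustrative consumer loop, not from the patch: before these fixes,
   * a self-redirecting socket under memory pressure could return EAGAIN
   * here indefinitely even though data was pending.
   */
  #include <errno.h>
  #include <sys/types.h>
  #include <sys/socket.h>
  #include <unistd.h>

  static ssize_t read_with_retry(int fd, char *buf, size_t len, int max_tries)
  {
  	for (int i = 0; i < max_tries; i++) {
  		ssize_t n = recv(fd, buf, len, MSG_DONTWAIT);

  		if (n >= 0)
  			return n;
  		if (errno != EAGAIN && errno != EWOULDBLOCK)
  			return -1;	/* real error */
  		usleep(1000);		/* back off and retry */
  	}
  	errno = EAGAIN;
  	return -1;
  }
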
From 4363023d2668e621b0743db351a9555d6e6ea57e Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 16 Nov 2020 14:29:28 -0800
Subject: bpf, sockmap: Avoid failures from skb_to_sgvec when skb has frag_list

When an skb has a frag_list it's possible for skb_to_sgvec() to fail.
This happens when the scatterlist has fewer elements to store pages than
would be needed for the initial skb plus any of its frags. This case
appears rare, but is possible when running an RX parser/verdict program
exposed to the internet. Currently, when this happens we throw an error,
break the pipe, and kfree the msg. This effectively breaks the
application or forces it to do a retry.

Let's catch this case and handle it by doing an skb_linearize() on any
skb we receive with frags. At this point skb_to_sgvec() should not fail,
because the failing conditions would require frags to be in place.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/160556576837.73229.14800682790808797635.stgit@john-XPS-13-9370
---
 net/core/skmsg.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 514bc9f6f8ae..25cdbb20f3a0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -423,9 +423,16 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 					struct sock *sk,
 					struct sk_msg *msg)
 {
-	int num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
-	int copied;
+	int num_sge, copied;
 
+	/* skb linearize may fail with ENOMEM, but let's simply try again
+	 * later if this happens. Under memory pressure we don't want to
+	 * drop the skb. We need to linearize the skb so that the mapping
+	 * in skb_to_sgvec can not error.
+	 */
+	if (skb_linearize(skb))
+		return -EAGAIN;
+	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
 	if (unlikely(num_sge < 0)) {
 		kfree(msg);
 		return num_sge;
 	}
--
cgit v1.2.3-58-ga151
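
The underlying limit is the fixed number of scatterlist slots available in
an sk_msg (MAX_MSG_FRAGS in the kernel); an skb whose frag_list chains
more page fragments than that cannot be mapped by skb_to_sgvec(). The toy
model below is not kernel code and the fragment counts are invented; it
just illustrates the slot exhaustion that linearizing avoids:

  /* Toy model, not kernel code: slot exhaustion that makes
   * skb_to_sgvec() fail, and why linearizing avoids it.
   */
  #include <stdio.h>

  #define TOY_MAX_MSG_FRAGS 16	/* stand-in for the kernel constant */

  int main(void)
  {
  	int head_frags = 3;	/* page frags on the skb itself */
  	int list_frags = 17;	/* frags contributed by the frag_list */
  	int needed = head_frags + list_frags;

  	if (needed > TOY_MAX_MSG_FRAGS)
  		printf("skb_to_sgvec() would fail: %d slots needed, %d available;\n"
  		       "skb_linearize() collapses the data to a single slot\n",
  		       needed, TOY_MAX_MSG_FRAGS);
  	return 0;
  }
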