summaryrefslogtreecommitdiff
path: root/net/core/sock_reuseport.c
diff options
context:
space:
mode:
authorCraig Gallek <kraig@google.com>2016-01-04 17:41:47 -0500
committerDavid S. Miller <davem@davemloft.net>2016-01-04 22:49:59 -0500
commit538950a1b7527a0a52ccd9337e3fcd304f027f13 (patch)
tree2ecd86127a55719e61ea9a37aeb1cc7be8022d0f /net/core/sock_reuseport.c
parente32ea7e747271a0abcd37e265005e97cc81d9df5 (diff)
soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF
Expose socket options for setting a classic or extended BPF program for use when selecting sockets in an SO_REUSEPORT group. These options can be used on the first socket to belong to a group before bind or on any socket in the group after bind. This change includes refactoring of the existing sk_filter code to allow reuse of the existing BPF filter validation checks. Signed-off-by: Craig Gallek <kraig@google.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core/sock_reuseport.c')
-rw-r--r--net/core/sock_reuseport.c88
1 files changed, 83 insertions, 5 deletions
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 963c8d5f3027..ae0969c0fc2e 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -1,10 +1,12 @@
/*
* To speed up listener socket lookup, create an array to store all sockets
* listening on the same port. This allows a decision to be made after finding
- * the first socket.
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
*/
#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
@@ -22,6 +24,7 @@ static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
reuse->max_socks = max_socks;
+ RCU_INIT_POINTER(reuse->prog, NULL);
return reuse;
}
@@ -67,6 +70,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->prog = reuse->prog;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
@@ -75,6 +79,10 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
kfree_rcu(reuse, rcu);
return more_reuse;
}
@@ -116,6 +124,16 @@ int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
}
EXPORT_SYMBOL(reuseport_add_sock);
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ if (reuse->prog)
+ bpf_prog_destroy(reuse->prog);
+ kfree(reuse);
+}
+
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
@@ -131,7 +149,7 @@ void reuseport_detach_sock(struct sock *sk)
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
if (reuse->num_socks == 0)
- kfree_rcu(reuse, rcu);
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
@@ -139,15 +157,53 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
+static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ consume_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
- * @hash: Use this hash to select.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
* Returns a socket that should receive the packet (or NULL on error).
*/
-struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
{
struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
struct sock *sk2 = NULL;
u16 socks;
@@ -158,12 +214,16 @@ struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
if (!reuse)
goto out;
+ prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ if (prog && skb)
+ sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ else
+ sk2 = reuse->socks[reciprocal_scale(hash, socks)];
}
out:
@@ -171,3 +231,21 @@ out:
return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
+
+struct bpf_prog *
+reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ return old_prog;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);