summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/net/inet6_hashtables.h3
-rw-r--r--include/net/inet_hashtables.h3
-rw-r--r--include/net/sock.h46
-rw-r--r--include/uapi/linux/bpf.h25
-rw-r--r--net/core/filter.c35
-rw-r--r--net/core/sock.c12
-rw-r--r--net/ipv4/ip_input.c3
-rw-r--r--net/ipv4/udp.c6
-rw-r--r--net/ipv6/ip6_input.c3
-rw-r--r--net/ipv6/udp.c9
-rw-r--r--net/sched/act_bpf.c3
-rw-r--r--tools/include/uapi/linux/bpf.h25
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sk_assign.c309
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_assign.c204
14 files changed, 662 insertions, 24 deletions
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index fe96bf247aac..81b965953036 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -85,9 +85,8 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
int iif, int sdif,
bool *refcounted)
{
- struct sock *sk = skb_steal_sock(skb);
+ struct sock *sk = skb_steal_sock(skb, refcounted);
- *refcounted = true;
if (sk)
return sk;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index d0019d3395cf..ad64ba6a057f 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,9 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
const int sdif,
bool *refcounted)
{
- struct sock *sk = skb_steal_sock(skb);
+ struct sock *sk = skb_steal_sock(skb, refcounted);
const struct iphdr *iph = ip_hdr(skb);
- *refcounted = true;
if (sk)
return sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index b5cca7bae69b..6d84784d33fa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1659,6 +1659,7 @@ void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
+void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif
@@ -2526,16 +2527,14 @@ void sock_net_set(struct sock *sk, struct net *net)
write_pnet(&sk->sk_net, net);
}
-static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+static inline bool
+skb_sk_is_prefetched(struct sk_buff *skb)
{
- if (skb->sk) {
- struct sock *sk = skb->sk;
-
- skb->destructor = NULL;
- skb->sk = NULL;
- return sk;
- }
- return NULL;
+#ifdef CONFIG_INET
+ return skb->destructor == sock_pfree;
+#else
+ return false;
+#endif /* CONFIG_INET */
}
/* This helper checks if a socket is a full socket,
@@ -2546,6 +2545,35 @@ static inline bool sk_fullsock(const struct sock *sk)
return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}
+static inline bool
+sk_is_refcounted(struct sock *sk)
+{
+ /* Only full sockets have sk->sk_flags. */
+ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
+}
+
+/**
+ * skb_steal_sock
+ * @skb to steal the socket from
+ * @refcounted is set to true if the socket is reference-counted
+ */
+static inline struct sock *
+skb_steal_sock(struct sk_buff *skb, bool *refcounted)
+{
+ if (skb->sk) {
+ struct sock *sk = skb->sk;
+
+ *refcounted = true;
+ if (skb_sk_is_prefetched(skb))
+ *refcounted = sk_is_refcounted(sk);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return sk;
+ }
+ *refcounted = false;
+ return NULL;
+}
+
/* Checks if this SKB belongs to an HW offloaded socket
* and whether any SW fallbacks are required based on dev.
* Check decrypted mark in case skb_orphan() cleared socket.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f1fbc36f58d3..9f786a5a44ac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2983,6 +2983,28 @@ union bpf_attr {
* **bpf_get_current_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * Description
+ * Assign the *sk* to the *skb*. When combined with appropriate
+ * routing configuration to receive the packet towards the socket,
+ * will cause *skb* to be delivered to the specified socket.
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
+ * interfere with successful delivery to the socket.
+ *
+ * This operation is only valid from TC ingress path.
+ *
+ * The *flags* argument must be zero.
+ * Return
+ * 0 on success, or a negative errno in case of failure.
+ *
+ * * **-EINVAL** Unsupported flags specified.
+ * * **-ENOENT** Socket is unavailable for assignment.
+ * * **-ENETUNREACH** Socket is unreachable (wrong netns).
+ * * **-EOPNOTSUPP** Unsupported operation, for example a
+ * call from outside of TC ingress.
+ * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3108,7 +3130,8 @@ union bpf_attr {
FN(get_ns_current_pid_tgid), \
FN(xdp_output), \
FN(get_netns_cookie), \
- FN(get_current_ancestor_cgroup_id),
+ FN(get_current_ancestor_cgroup_id), \
+ FN(sk_assign),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index bb4a196c8809..7628b947dbc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5401,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5918,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -6249,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
#endif
default:
return bpf_base_func_proto(func_id);
diff --git a/net/core/sock.c b/net/core/sock.c
index 0fc8937a7ff4..da32d9b6d09f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2071,6 +2071,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index aa438c6758a7..b0c244af1e4d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2633fc231593..b4035021bbd3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2288,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
/*
* Validate the packet.
@@ -2313,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2322,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 7b089d0ac8cd..e96304d8a4a7 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5dc439a391fe..7d4151747340 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -843,6 +843,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -879,7 +880,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -888,12 +889,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 46f47e58b3be..54d5652cfe6c 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f1fbc36f58d3..9f786a5a44ac 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2983,6 +2983,28 @@ union bpf_attr {
* **bpf_get_current_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * Description
+ * Assign the *sk* to the *skb*. When combined with appropriate
+ * routing configuration to receive the packet towards the socket,
+ * will cause *skb* to be delivered to the specified socket.
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
+ * interfere with successful delivery to the socket.
+ *
+ * This operation is only valid from TC ingress path.
+ *
+ * The *flags* argument must be zero.
+ * Return
+ * 0 on success, or a negative errno in case of failure.
+ *
+ * * **-EINVAL** Unsupported flags specified.
+ * * **-ENOENT** Socket is unavailable for assignment.
+ * * **-ENETUNREACH** Socket is unreachable (wrong netns).
+ * * **-EOPNOTSUPP** Unsupported operation, for example a
+ * call from outside of TC ingress.
+ * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3108,7 +3130,8 @@ union bpf_attr {
FN(get_ns_current_pid_tgid), \
FN(xdp_output), \
FN(get_netns_cookie), \
- FN(get_current_ancestor_cgroup_id),
+ FN(get_current_ancestor_cgroup_id), \
+ FN(sk_assign),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
new file mode 100644
index 000000000000..d572e1a2c297
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+// Copyright (c) 2020 Isovalent, Inc.
+/*
+ * Test that the socket assign program is able to redirect traffic towards a
+ * socket, regardless of whether the port or address destination of the traffic
+ * matches the port.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "test_progs.h"
+
+#define BIND_PORT 1234
+#define CONNECT_PORT 4321
+#define TEST_DADDR (0xC0A80203)
+#define NS_SELF "/proc/self/ns/net"
+
+static const struct timeval timeo_sec = { .tv_sec = 3 };
+static const size_t timeo_optlen = sizeof(timeo_sec);
+static int stop, duration;
+
+static bool
+configure_stack(void)
+{
+ char tc_cmd[BUFSIZ];
+
+ /* Move to a new networking namespace */
+ if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+ return false;
+
+ /* Configure necessary links, routes */
+ if (CHECK_FAIL(system("ip link set dev lo up")))
+ return false;
+ if (CHECK_FAIL(system("ip route add local default dev lo")))
+ return false;
+ if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
+ return false;
+
+ /* Load qdisc, BPF program */
+ if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+ return false;
+ sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf",
+ "direct-action object-file ./test_sk_assign.o",
+ "section classifier/sk_assign_test",
+ (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "");
+ if (CHECK(system(tc_cmd), "BPF load failed;",
+ "run with -vv for more info\n"))
+ return false;
+
+ return true;
+}
+
+static int
+start_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+ int fd;
+
+ fd = socket(addr->sa_family, type, 0);
+ if (CHECK_FAIL(fd == -1))
+ goto out;
+ if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+ timeo_optlen)))
+ goto close_out;
+ if (CHECK_FAIL(bind(fd, addr, len) == -1))
+ goto close_out;
+ if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
+ goto close_out;
+
+ goto out;
+close_out:
+ close(fd);
+ fd = -1;
+out:
+ return fd;
+}
+
+static int
+connect_to_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+ int fd = -1;
+
+ fd = socket(addr->sa_family, type, 0);
+ if (CHECK_FAIL(fd == -1))
+ goto out;
+ if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
+ timeo_optlen)))
+ goto close_out;
+ if (CHECK_FAIL(connect(fd, addr, len)))
+ goto close_out;
+
+ goto out;
+close_out:
+ close(fd);
+ fd = -1;
+out:
+ return fd;
+}
+
+static in_port_t
+get_port(int fd)
+{
+ struct sockaddr_storage ss;
+ socklen_t slen = sizeof(ss);
+ in_port_t port = 0;
+
+ if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&ss, &slen)))
+ return port;
+
+ switch (ss.ss_family) {
+ case AF_INET:
+ port = ((struct sockaddr_in *)&ss)->sin_port;
+ break;
+ case AF_INET6:
+ port = ((struct sockaddr_in6 *)&ss)->sin6_port;
+ break;
+ default:
+ CHECK(1, "Invalid address family", "%d\n", ss.ss_family);
+ }
+ return port;
+}
+
+static ssize_t
+rcv_msg(int srv_client, int type)
+{
+ struct sockaddr_storage ss;
+ char buf[BUFSIZ];
+ socklen_t slen;
+
+ if (type == SOCK_STREAM)
+ return read(srv_client, &buf, sizeof(buf));
+ else
+ return recvfrom(srv_client, &buf, sizeof(buf), 0,
+ (struct sockaddr *)&ss, &slen);
+}
+
+static int
+run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type)
+{
+ int client = -1, srv_client = -1;
+ char buf[] = "testing";
+ in_port_t port;
+ int ret = 1;
+
+ client = connect_to_server(addr, len, type);
+ if (client == -1) {
+ perror("Cannot connect to server");
+ goto out;
+ }
+
+ if (type == SOCK_STREAM) {
+ srv_client = accept(server_fd, NULL, NULL);
+ if (CHECK_FAIL(srv_client == -1)) {
+ perror("Can't accept connection");
+ goto out;
+ }
+ } else {
+ srv_client = server_fd;
+ }
+ if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
+ perror("Can't write on client");
+ goto out;
+ }
+ if (CHECK_FAIL(rcv_msg(srv_client, type) != sizeof(buf))) {
+ perror("Can't read on server");
+ goto out;
+ }
+
+ port = get_port(srv_client);
+ if (CHECK_FAIL(!port))
+ goto out;
+ /* SOCK_STREAM is connected via accept(), so the server's local address
+ * will be the CONNECT_PORT rather than the BIND port that corresponds
+ * to the listen socket. SOCK_DGRAM on the other hand is connectionless
+ * so we can't really do the same check there; the server doesn't ever
+ * create a socket with CONNECT_PORT.
+ */
+ if (type == SOCK_STREAM &&
+ CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u",
+ CONNECT_PORT, ntohs(port)))
+ goto out;
+ else if (type == SOCK_DGRAM &&
+ CHECK(port != htons(BIND_PORT), "Expected",
+ "port %u but got %u", BIND_PORT, ntohs(port)))
+ goto out;
+
+ ret = 0;
+out:
+ close(client);
+ if (srv_client != server_fd)
+ close(srv_client);
+ if (ret)
+ WRITE_ONCE(stop, 1);
+ return ret;
+}
+
+static void
+prepare_addr(struct sockaddr *addr, int family, __u16 port, bool rewrite_addr)
+{
+ struct sockaddr_in *addr4;
+ struct sockaddr_in6 *addr6;
+
+ switch (family) {
+ case AF_INET:
+ addr4 = (struct sockaddr_in *)addr;
+ memset(addr4, 0, sizeof(*addr4));
+ addr4->sin_family = family;
+ addr4->sin_port = htons(port);
+ if (rewrite_addr)
+ addr4->sin_addr.s_addr = htonl(TEST_DADDR);
+ else
+ addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ addr6 = (struct sockaddr_in6 *)addr;
+ memset(addr6, 0, sizeof(*addr6));
+ addr6->sin6_family = family;
+ addr6->sin6_port = htons(port);
+ addr6->sin6_addr = in6addr_loopback;
+ if (rewrite_addr)
+ addr6->sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
+ break;
+ default:
+ fprintf(stderr, "Invalid family %d", family);
+ }
+}
+
+struct test_sk_cfg {
+ const char *name;
+ int family;
+ struct sockaddr *addr;
+ socklen_t len;
+ int type;
+ bool rewrite_addr;
+};
+
+#define TEST(NAME, FAMILY, TYPE, REWRITE) \
+{ \
+ .name = NAME, \
+ .family = FAMILY, \
+ .addr = (FAMILY == AF_INET) ? (struct sockaddr *)&addr4 \
+ : (struct sockaddr *)&addr6, \
+ .len = (FAMILY == AF_INET) ? sizeof(addr4) : sizeof(addr6), \
+ .type = TYPE, \
+ .rewrite_addr = REWRITE, \
+}
+
+void test_sk_assign(void)
+{
+ struct sockaddr_in addr4;
+ struct sockaddr_in6 addr6;
+ struct test_sk_cfg tests[] = {
+ TEST("ipv4 tcp port redir", AF_INET, SOCK_STREAM, false),
+ TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true),
+ TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false),
+ TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true),
+ TEST("ipv4 udp port redir", AF_INET, SOCK_DGRAM, false),
+ TEST("ipv4 udp addr redir", AF_INET, SOCK_DGRAM, true),
+ TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false),
+ TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true),
+ };
+ int server = -1;
+ int self_net;
+
+ self_net = open(NS_SELF, O_RDONLY);
+ if (CHECK_FAIL(self_net < 0)) {
+ perror("Unable to open "NS_SELF);
+ return;
+ }
+
+ if (!configure_stack()) {
+ perror("configure_stack");
+ goto cleanup;
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) {
+ struct test_sk_cfg *test = &tests[i];
+ const struct sockaddr *addr;
+
+ if (!test__start_subtest(test->name))
+ continue;
+ prepare_addr(test->addr, test->family, BIND_PORT, false);
+ addr = (const struct sockaddr *)test->addr;
+ server = start_server(addr, test->len, test->type);
+ if (server == -1)
+ goto cleanup;
+
+ /* connect to unbound ports */
+ prepare_addr(test->addr, test->family, CONNECT_PORT,
+ test->rewrite_addr);
+ if (run_test(server, addr, test->len, test->type))
+ goto close;
+
+ close(server);
+ server = -1;
+ }
+
+close:
+ close(server);
+cleanup:
+ if (CHECK_FAIL(setns(self_net, CLONE_NEWNET)))
+ perror("Failed to setns("NS_SELF")");
+ close(self_net);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
new file mode 100644
index 000000000000..8f530843b4da
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Cloudflare Ltd.
+// Copyright (c) 2020 Isovalent, Inc.
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static inline struct bpf_sock_tuple *
+get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct bpf_sock_tuple *result;
+ struct ethhdr *eth;
+ __u64 tuple_len;
+ __u8 proto = 0;
+ __u64 ihl_len;
+
+ eth = (struct ethhdr *)(data);
+ if (eth + 1 > data_end)
+ return NULL;
+
+ if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(data + sizeof(*eth));
+
+ if (iph + 1 > data_end)
+ return NULL;
+ if (iph->ihl != 5)
+ /* Options are not supported */
+ return NULL;
+ ihl_len = iph->ihl * 4;
+ proto = iph->protocol;
+ *ipv4 = true;
+ result = (struct bpf_sock_tuple *)&iph->saddr;
+ } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + sizeof(*eth));
+
+ if (ip6h + 1 > data_end)
+ return NULL;
+ ihl_len = sizeof(*ip6h);
+ proto = ip6h->nexthdr;
+ *ipv4 = false;
+ result = (struct bpf_sock_tuple *)&ip6h->saddr;
+ } else {
+ return (struct bpf_sock_tuple *)data;
+ }
+
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
+ return NULL;
+
+ *tcp = (proto == IPPROTO_TCP);
+ return result;
+}
+
+static inline int
+handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
+{
+ struct bpf_sock_tuple ln = {0};
+ struct bpf_sock *sk;
+ size_t tuple_len;
+ int ret;
+
+ tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+ if ((void *)tuple + tuple_len > (void *)(long)skb->data_end)
+ return TC_ACT_SHOT;
+
+ sk = bpf_sk_lookup_udp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ if (sk)
+ goto assign;
+
+ if (ipv4) {
+ if (tuple->ipv4.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ ln.ipv4.daddr = bpf_htonl(0x7f000001);
+ ln.ipv4.dport = bpf_htons(1234);
+
+ sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4),
+ BPF_F_CURRENT_NETNS, 0);
+ } else {
+ if (tuple->ipv6.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ /* Upper parts of daddr are already zero. */
+ ln.ipv6.daddr[3] = bpf_htonl(0x1);
+ ln.ipv6.dport = bpf_htons(1234);
+
+ sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6),
+ BPF_F_CURRENT_NETNS, 0);
+ }
+
+ /* workaround: We can't do a single socket lookup here, because then
+ * the compiler will likely spill tuple_len to the stack. This makes it
+ * lose all bounds information in the verifier, which then rejects the
+ * call as unsafe.
+ */
+ if (!sk)
+ return TC_ACT_SHOT;
+
+assign:
+ ret = bpf_sk_assign(skb, sk, 0);
+ bpf_sk_release(sk);
+ return ret;
+}
+
+static inline int
+handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
+{
+ struct bpf_sock_tuple ln = {0};
+ struct bpf_sock *sk;
+ size_t tuple_len;
+ int ret;
+
+ tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+ if ((void *)tuple + tuple_len > (void *)(long)skb->data_end)
+ return TC_ACT_SHOT;
+
+ sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ if (sk) {
+ if (sk->state != BPF_TCP_LISTEN)
+ goto assign;
+ bpf_sk_release(sk);
+ }
+
+ if (ipv4) {
+ if (tuple->ipv4.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ ln.ipv4.daddr = bpf_htonl(0x7f000001);
+ ln.ipv4.dport = bpf_htons(1234);
+
+ sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
+ BPF_F_CURRENT_NETNS, 0);
+ } else {
+ if (tuple->ipv6.dport != bpf_htons(4321))
+ return TC_ACT_OK;
+
+ /* Upper parts of daddr are already zero. */
+ ln.ipv6.daddr[3] = bpf_htonl(0x1);
+ ln.ipv6.dport = bpf_htons(1234);
+
+ sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
+ BPF_F_CURRENT_NETNS, 0);
+ }
+
+ /* workaround: We can't do a single socket lookup here, because then
+ * the compiler will likely spill tuple_len to the stack. This makes it
+ * lose all bounds information in the verifier, which then rejects the
+ * call as unsafe.
+ */
+ if (!sk)
+ return TC_ACT_SHOT;
+
+ if (sk->state != BPF_TCP_LISTEN) {
+ bpf_sk_release(sk);
+ return TC_ACT_SHOT;
+ }
+
+assign:
+ ret = bpf_sk_assign(skb, sk, 0);
+ bpf_sk_release(sk);
+ return ret;
+}
+
+SEC("classifier/sk_assign_test")
+int bpf_sk_assign_test(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple *tuple, ln = {0};
+ bool ipv4 = false;
+ bool tcp = false;
+ int tuple_len;
+ int ret = 0;
+
+ tuple = get_tuple(skb, &ipv4, &tcp);
+ if (!tuple)
+ return TC_ACT_SHOT;
+
+ /* Note that the verifier socket return type for bpf_skc_lookup_tcp()
+ * differs from bpf_sk_lookup_udp(), so even though the C-level type is
+ * the same here, if we try to share the implementations they will
+ * fail to verify because we're crossing pointer types.
+ */
+ if (tcp)
+ ret = handle_tcp(skb, tuple, ipv4);
+ else
+ ret = handle_udp(skb, tuple, ipv4);
+
+ return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
+}