summaryrefslogtreecommitdiff
path: root/net/core/sock_map.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-15 18:42:13 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-15 18:42:13 -0700
commit9ff9b0d392ea08090cd1780fb196f36dbb586529 (patch)
tree276a3a5c4525b84dee64eda30b423fc31bf94850 /net/core/sock_map.c
parent840e5bb326bbcb16ce82dd2416d2769de4839aea (diff)
parent105faa8742437c28815b2a3eb8314ebc5fd9288c (diff)
Merge tag 'net-next-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: - Add redirect_neigh() BPF packet redirect helper, allowing to limit stack traversal in common container configs and improving TCP back-pressure. Daniel reports ~10Gbps => ~15Gbps single stream TCP performance gain. - Expand netlink policy support and improve policy export to user space. (Ge)netlink core performs request validation according to declared policies. Expand the expressiveness of those policies (min/max length and bitmasks). Allow dumping policies for particular commands. This is used for feature discovery by user space (instead of kernel version parsing or trial and error). - Support IGMPv3/MLDv2 multicast listener discovery protocols in bridge. - Allow more than 255 IPv4 multicast interfaces. - Add support for Type of Service (ToS) reflection in SYN/SYN-ACK packets of TCPv6. - In Multi-patch TCP (MPTCP) support concurrent transmission of data on multiple subflows in a load balancing scenario. Enhance advertising addresses via the RM_ADDR/ADD_ADDR options. - Support SMC-Dv2 version of SMC, which enables multi-subnet deployments. - Allow more calls to same peer in RxRPC. - Support two new Controller Area Network (CAN) protocols - CAN-FD and ISO 15765-2:2016. - Add xfrm/IPsec compat layer, solving the 32bit user space on 64bit kernel problem. - Add TC actions for implementing MPLS L2 VPNs. - Improve nexthop code - e.g. handle various corner cases when nexthop objects are removed from groups better, skip unnecessary notifications and make it easier to offload nexthops into HW by converting to a blocking notifier. - Support adding and consuming TCP header options by BPF programs, opening the doors for easy experimental and deployment-specific TCP option use. - Reorganize TCP congestion control (CC) initialization to simplify life of TCP CC implemented in BPF. - Add support for shipping BPF programs with the kernel and loading them early on boot via the User Mode Driver mechanism, hence reusing all the user space infra we have. - Support sleepable BPF programs, initially targeting LSM and tracing. - Add bpf_d_path() helper for returning full path for given 'struct path'. - Make bpf_tail_call compatible with bpf-to-bpf calls. - Allow BPF programs to call map_update_elem on sockmaps. - Add BPF Type Format (BTF) support for type and enum discovery, as well as support for using BTF within the kernel itself (current use is for pretty printing structures). - Support listing and getting information about bpf_links via the bpf syscall. - Enhance kernel interfaces around NIC firmware update. Allow specifying overwrite mask to control if settings etc. are reset during update; report expected max time operation may take to users; support firmware activation without machine reboot incl. limits of how much impact reset may have (e.g. dropping link or not). - Extend ethtool configuration interface to report IEEE-standard counters, to limit the need for per-vendor logic in user space. - Adopt or extend devlink use for debug, monitoring, fw update in many drivers (dsa loop, ice, ionic, sja1105, qed, mlxsw, mv88e6xxx, dpaa2-eth). - In mlxsw expose critical and emergency SFP module temperature alarms. Refactor port buffer handling to make the defaults more suitable and support setting these values explicitly via the DCBNL interface. - Add XDP support for Intel's igb driver. - Support offloading TC flower classification and filtering rules to mscc_ocelot switches. - Add PTP support for Marvell Octeontx2 and PP2.2 hardware, as well as fixed interval period pulse generator and one-step timestamping in dpaa-eth. - Add support for various auth offloads in WiFi APs, e.g. SAE (WPA3) offload. - Add Lynx PHY/PCS MDIO module, and convert various drivers which have this HW to use it. Convert mvpp2 to split PCS. - Support Marvell Prestera 98DX3255 24-port switch ASICs, as well as 7-port Mediatek MT7531 IP. - Add initial support for QCA6390 and IPQ6018 in ath11k WiFi driver, and wcn3680 support in wcn36xx. - Improve performance for packets which don't require much offloads on recent Mellanox NICs by 20% by making multiple packets share a descriptor entry. - Move chelsio inline crypto drivers (for TLS and IPsec) from the crypto subtree to drivers/net. Move MDIO drivers out of the phy directory. - Clean up a lot of W=1 warnings, reportedly the actively developed subsections of networking drivers should now build W=1 warning free. - Make sure drivers don't use in_interrupt() to dynamically adapt their code. Convert tasklets to use new tasklet_setup API (sadly this conversion is not yet complete). * tag 'net-next-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2583 commits) Revert "bpfilter: Fix build error with CONFIG_BPFILTER_UMH" net, sockmap: Don't call bpf_prog_put() on NULL pointer bpf, selftest: Fix flaky tcp_hdr_options test when adding addr to lo bpf, sockmap: Add locking annotations to iterator netfilter: nftables: allow re-computing sctp CRC-32C in 'payload' statements net: fix pos incrementment in ipv6_route_seq_next net/smc: fix invalid return code in smcd_new_buf_create() net/smc: fix valid DMBE buffer sizes net/smc: fix use-after-free of delayed events bpfilter: Fix build error with CONFIG_BPFILTER_UMH cxgb4/ch_ipsec: Replace the module name to ch_ipsec from chcr net: sched: Fix suspicious RCU usage while accessing tcf_tunnel_info bpf: Fix register equivalence tracking. rxrpc: Fix loss of final ack on shutdown rxrpc: Fix bundle counting for exclusive connections netfilter: restore NF_INET_NUMHOOKS ibmveth: Identify ingress large send packets. ibmveth: Switch order of ibmveth_helper calls. cxgb4: handle 4-tuple PEDIT to NAT mode translation selftests: Add VRF route leaking tests ...
Diffstat (limited to 'net/core/sock_map.c')
-rw-r--r--net/core/sock_map.c441
1 files changed, 361 insertions, 80 deletions
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 119f52a99dc1..ddc899e83313 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#include <linux/bpf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
@@ -147,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock,
static void sock_map_del_link(struct sock *sk,
struct sk_psock *psock, void *link_raw)
{
+ bool strp_stop = false, verdict_stop = false;
struct sk_psock_link *link, *tmp;
- bool strp_stop = false;
spin_lock_bh(&psock->link_lock);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -158,14 +159,19 @@ static void sock_map_del_link(struct sock *sk,
map);
if (psock->parser.enabled && stab->progs.skb_parser)
strp_stop = true;
+ if (psock->parser.enabled && stab->progs.skb_verdict)
+ verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
}
spin_unlock_bh(&psock->link_lock);
- if (strp_stop) {
+ if (strp_stop || verdict_stop) {
write_lock_bh(&sk->sk_callback_lock);
- sk_psock_stop_strp(sk, psock);
+ if (strp_stop)
+ sk_psock_stop_strp(sk, psock);
+ else
+ sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@@ -184,8 +190,6 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
struct proto *prot;
- sock_owned_by_me(sk);
-
switch (sk->sk_type) {
case SOCK_STREAM:
prot = tcp_bpf_get_proto(sk, psock);
@@ -231,20 +235,21 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
struct sk_psock *psock;
- bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
- skb_parser = READ_ONCE(progs->skb_parser);
- skb_progs = skb_parser && skb_verdict;
- if (skb_progs) {
+ if (skb_verdict) {
skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
if (IS_ERR(skb_verdict))
return PTR_ERR(skb_verdict);
+ }
+
+ skb_parser = READ_ONCE(progs->skb_parser);
+ if (skb_parser) {
skb_parser = bpf_prog_inc_not_zero(skb_parser);
if (IS_ERR(skb_parser)) {
- bpf_prog_put(skb_verdict);
- return PTR_ERR(skb_parser);
+ ret = PTR_ERR(skb_parser);
+ goto out_put_skb_verdict;
}
}
@@ -253,7 +258,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out;
+ goto out_put_skb_parser;
}
}
@@ -265,15 +270,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
+ (skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
}
} else {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock) {
- ret = -ENOMEM;
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
goto out_progs;
}
}
@@ -286,28 +292,32 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
- if (skb_progs && !psock->parser.enabled) {
+ if (skb_parser && skb_verdict && !psock->parser.enabled) {
ret = sk_psock_init_strp(sk, psock);
- if (ret) {
- write_unlock_bh(&sk->sk_callback_lock);
- goto out_drop;
- }
+ if (ret)
+ goto out_unlock_drop;
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
+ } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ sk_psock_start_verdict(sk,psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
+out_unlock_drop:
+ write_unlock_bh(&sk->sk_callback_lock);
out_drop:
sk_psock_put(sk, psock);
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
-out:
- if (skb_progs) {
- bpf_prog_put(skb_verdict);
+out_put_skb_parser:
+ if (skb_parser)
bpf_prog_put(skb_parser);
- }
+out_put_skb_verdict:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
return ret;
}
@@ -322,8 +332,8 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
if (!psock) {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock)
- return -ENOMEM;
+ if (IS_ERR(psock))
+ return PTR_ERR(psock);
}
ret = sock_map_init_proto(sk, psock);
@@ -384,7 +394,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key)
struct sock *sk;
sk = __sock_map_lookup_elem(map, *(u32 *)key);
- if (!sk || !sk_fullsock(sk))
+ if (!sk)
return NULL;
if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
@@ -402,7 +412,7 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
if (!sk)
return ERR_PTR(-ENOENT);
- sock_gen_cookie(sk);
+ __sock_gen_cookie(sk);
return &sk->sk_cookie;
}
@@ -478,8 +488,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -563,10 +571,12 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return false;
}
-static int sock_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
- u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
@@ -595,14 +605,41 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
sock_map_sk_acquire(sk);
if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
else
- ret = sock_map_update_common(map, idx, sk, flags);
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
return ret;
}
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (unlikely(!sk || !sk_fullsock(sk)))
+ return -EINVAL;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ return ret;
+}
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
struct bpf_map *, map, void *, key, u64, flags)
{
@@ -681,8 +718,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_map_seq_info {
+ struct bpf_map *map;
+ struct sock *sk;
+ u32 index;
+};
+
+struct bpf_iter__sockmap {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(void *, key);
+ __bpf_md_ptr(struct sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key,
+ struct sock *sk)
+
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+ if (unlikely(info->index >= info->map->max_entries))
+ return NULL;
+
+ info->sk = __sock_map_lookup_elem(info->map, info->index);
+
+ /* can't return sk directly, since that might be NULL */
+ return info;
+}
+
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_map_seq_stop */
+ rcu_read_lock();
+ return sock_map_seq_lookup_elem(info);
+}
+
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ ++*pos;
+ ++info->index;
+
+ return sock_map_seq_lookup_elem(info);
+}
+
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !v);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+ ctx.sk = info->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_map_seq_show(seq, NULL);
+
+ /* pairs with sock_map_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_map_seq_ops = {
+ .start = sock_map_seq_start,
+ .next = sock_map_seq_next,
+ .stop = sock_map_seq_stop,
+ .show = sock_map_seq_show,
+};
+
+static int sock_map_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ info->map = aux->map;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+ .seq_ops = &sock_map_seq_ops,
+ .init_seq_private = sock_map_init_seq_private,
+ .seq_priv_size = sizeof(struct sock_map_seq_info),
+};
+
static int sock_map_btf_id;
const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
@@ -694,6 +839,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_stab",
.map_btf_id = &sock_map_btf_id,
+ .iter_seq_info = &sock_map_iter_seq_info,
};
struct bpf_shtab_elem {
@@ -855,8 +1001,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -915,45 +1059,6 @@ out_free:
return ret;
}
-static int sock_hash_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
-{
- struct socket *sock;
- struct sock *sk;
- int ret;
- u64 ufd;
-
- if (map->value_size == sizeof(u64))
- ufd = *(u64 *)value;
- else
- ufd = *(u32 *)value;
- if (ufd > S32_MAX)
- return -EINVAL;
-
- sock = sockfd_lookup(ufd, &ret);
- if (!sock)
- return ret;
- sk = sock->sk;
- if (!sk) {
- ret = -EINVAL;
- goto out;
- }
- if (!sock_map_sk_is_suitable(sk)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- sock_map_sk_acquire(sk);
- if (!sock_map_sk_state_allowed(sk))
- ret = -EOPNOTSUPP;
- else
- ret = sock_hash_update_common(map, key, sk, flags);
- sock_map_sk_release(sk);
-out:
- fput(sock->file);
- return ret;
-}
-
static int sock_hash_get_next_key(struct bpf_map *map, void *key,
void *key_next)
{
@@ -971,7 +1076,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
if (!elem)
goto find_first_elem;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
@@ -983,7 +1088,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
find_first_elem:
for (; i < htab->buckets_num; i++) {
head = &sock_hash_select_bucket(htab, i)->head;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
@@ -1119,7 +1224,7 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
if (!sk)
return ERR_PTR(-ENOENT);
- sock_gen_cookie(sk);
+ __sock_gen_cookie(sk);
return &sk->sk_cookie;
}
@@ -1128,7 +1233,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key)
struct sock *sk;
sk = __sock_hash_lookup_elem(map, key);
- if (!sk || !sk_fullsock(sk))
+ if (!sk)
return NULL;
if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
@@ -1217,12 +1322,128 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_hash_seq_info {
+ struct bpf_map *map;
+ struct bpf_shtab *htab;
+ u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+ struct bpf_shtab_elem *prev_elem)
+{
+ const struct bpf_shtab *htab = info->htab;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+
+ /* no more elements, continue in the next bucket */
+ info->bucket_id++;
+ }
+
+ for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+ bucket = &htab->buckets[info->bucket_id];
+ node = rcu_dereference(hlist_first_rcu(&bucket->head));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+ }
+
+ return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_hash_seq_stop */
+ rcu_read_lock();
+ return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ ++*pos;
+ return sock_hash_seq_find_next(info, v);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_shtab_elem *elem = v;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !elem);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ ctx.sk = elem->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_hash_seq_show(seq, NULL);
+
+ /* pairs with sock_hash_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+ .start = sock_hash_seq_start,
+ .next = sock_hash_seq_next,
+ .stop = sock_hash_seq_stop,
+ .show = sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ info->map = aux->map;
+ info->htab = container_of(aux->map, struct bpf_shtab, map);
+ return 0;
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+ .seq_ops = &sock_hash_seq_ops,
+ .init_seq_private = sock_hash_init_seq_private,
+ .seq_priv_size = sizeof(struct sock_hash_seq_info),
+};
+
static int sock_hash_map_btf_id;
const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_hash_alloc,
.map_free = sock_hash_free,
.map_get_next_key = sock_hash_get_next_key,
- .map_update_elem = sock_hash_update_elem,
+ .map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_hash_delete_elem,
.map_lookup_elem = sock_hash_lookup,
.map_lookup_elem_sys_only = sock_hash_lookup_sys,
@@ -1230,6 +1451,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_shtab",
.map_btf_id = &sock_hash_map_btf_id,
+ .iter_seq_info = &sock_hash_iter_seq_info,
};
static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
@@ -1340,3 +1562,62 @@ void sock_map_close(struct sock *sk, long timeout)
release_sock(sk);
saved_close(sk, timeout);
}
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto put_map;
+
+ if (prog->aux->max_rdonly_access > map->key_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+ .target = "sockmap",
+ .attach_target = sock_map_iter_attach_target,
+ .detach_target = sock_map_iter_detach_target,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__sockmap, key),
+ PTR_TO_RDONLY_BUF_OR_NULL },
+ { offsetof(struct bpf_iter__sockmap, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+ sock_map_iter_reg.ctx_arg_info[1].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);