diff options
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r-- | net/ipv4/udp.c | 291 |
1 files changed, 241 insertions, 50 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 514ebd4aff74..fd3dae081f3a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2883,7 +2883,8 @@ EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { - lock_sock(sk); + if (!has_current_bpf_ctx()) + lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() @@ -2896,7 +2897,8 @@ int udp_abort(struct sock *sk, int err) __udp_disconnect(sk, 0); out: - release_sock(sk); + if (!has_current_bpf_ctx()) + release_sock(sk); return 0; } @@ -2941,9 +2943,30 @@ EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo, - struct net *net) +static unsigned short seq_file_family(const struct seq_file *seq); +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + +#ifdef CONFIG_BPF_SYSCALL +static const struct seq_operations bpf_iter_udp_seq_ops; +#endif +static struct udp_table *udp_get_table_seq(struct seq_file *seq, + struct net *net) { + const struct udp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + if (seq->op == &bpf_iter_udp_seq_ops) + return net->ipv4.udp_table; +#endif + + afinfo = pde_data(file_inode(seq->file)); return afinfo->udp_table ? : net->ipv4.udp_table; } @@ -2951,16 +2974,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; struct sock *sk; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - - udptable = udp_get_table_afinfo(afinfo, net); + udptable = udp_get_table_seq(seq, net); for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { @@ -2971,10 +2988,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); @@ -2988,22 +3002,14 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || - (afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family))); + } while (sk && !seq_sk_match(seq, sk)); if (!sk) { - udptable = udp_get_table_afinfo(afinfo, net); + udptable = udp_get_table_seq(seq, net); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3049,15 +3055,9 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; - struct udp_seq_afinfo *afinfo; struct udp_table *udptable; - if (state->bpf_seq_afinfo) - afinfo = state->bpf_seq_afinfo; - else - afinfo = pde_data(file_inode(seq->file)); - - udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq)); + udptable = udp_get_table_seq(seq, seq_file_net(seq)); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3110,6 +3110,143 @@ struct bpf_iter__udp { int bucket __aligned(8); }; +struct bpf_udp_iter_state { + struct udp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + int offset; + struct sock **batch; + bool st_bucket_done; +}; + +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, + unsigned int new_batch_sz); +static struct sock *bpf_iter_udp_batch(struct seq_file *seq) +{ + struct bpf_udp_iter_state *iter = seq->private; + struct udp_iter_state *state = &iter->state; + struct net *net = seq_file_net(seq); + struct udp_table *udptable; + unsigned int batch_sks = 0; + bool resized = false; + struct sock *sk; + + /* The current batch is done, so advance the bucket. */ + if (iter->st_bucket_done) { + state->bucket++; + iter->offset = 0; + } + + udptable = udp_get_table_seq(seq, net); + +again: + /* New batch for the next bucket. + * Iterate over the hash table to find a bucket with sockets matching + * the iterator attributes, and return the first matching socket from + * the bucket. The remaining matched sockets from the bucket are batched + * before releasing the bucket lock. This allows BPF programs that are + * called in seq_show to acquire the bucket lock if needed. + */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + batch_sks = 0; + + for (; state->bucket <= udptable->mask; state->bucket++) { + struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; + + if (hlist_empty(&hslot2->head)) { + iter->offset = 0; + continue; + } + + spin_lock_bh(&hslot2->lock); + udp_portaddr_for_each_entry(sk, &hslot2->head) { + if (seq_sk_match(seq, sk)) { + /* Resume from the last iterated socket at the + * offset in the bucket before iterator was stopped. + */ + if (iter->offset) { + --iter->offset; + continue; + } + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + batch_sks++; + } + } + spin_unlock_bh(&hslot2->lock); + + if (iter->end_sk) + break; + + /* Reset the current bucket's offset before moving to the next bucket. */ + iter->offset = 0; + } + + /* All done: no batch made. */ + if (!iter->end_sk) + return NULL; + + if (iter->end_sk == batch_sks) { + /* Batching is done for the current bucket; return the first + * socket to be iterated from the batch. + */ + iter->st_bucket_done = true; + goto done; + } + if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) { + resized = true; + /* After allocating a larger batch, retry one more time to grab + * the whole bucket. + */ + state->bucket--; + goto again; + } +done: + return iter->batch[0]; +} + +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_udp_iter_state *iter = seq->private; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so unref the iter->cur_sk. + */ + if (iter->cur_sk < iter->end_sk) { + sock_put(iter->batch[iter->cur_sk++]); + ++iter->offset; + } + + /* After updating iter->cur_sk, check if there are more sockets + * available in the current bucket batch. + */ + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + /* Prepare a new batch. */ + sk = bpf_iter_udp_batch(seq); + + ++*pos; + return sk; +} + +static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_udp_batch(seq); + + return SEQ_START_TOKEN; +} + static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { @@ -3130,18 +3267,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) struct bpf_prog *prog; struct sock *sk = v; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + lock_sock(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); + ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); + +unlock: + release_sock(sk); + return ret; +} + +static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -3152,17 +3308,35 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } - udp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_udp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_udp_seq_ops = { - .start = udp_seq_start, - .next = udp_seq_next, + .start = bpf_iter_udp_seq_start, + .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct udp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + /* BPF iterator: bpf programs to filter sockets. */ + if (seq->op == &bpf_iter_udp_seq_ops) + return AF_UNSPEC; +#endif + + /* Proc fs iterator */ + afinfo = pde_data(file_inode(seq->file)); + return afinfo->family; +} + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3371,38 +3545,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) -static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, + unsigned int new_batch_sz) { - struct udp_iter_state *st = priv_data; - struct udp_seq_afinfo *afinfo; - int ret; + struct sock **new_batch; - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) + new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), + GFP_USER | __GFP_NOWARN); + if (!new_batch) return -ENOMEM; - afinfo->family = AF_UNSPEC; - afinfo->udp_table = NULL; - st->bpf_seq_afinfo = afinfo; + bpf_iter_udp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +#define INIT_BATCH_SZ 16 + +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_udp_iter_state *iter = priv_data; + int ret; + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) - kfree(afinfo); + return ret; + + ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); + if (ret) + bpf_iter_fini_seq_net(priv_data); + return ret; } static void bpf_iter_fini_udp(void *priv_data) { - struct udp_iter_state *st = priv_data; + struct bpf_udp_iter_state *iter = priv_data; - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, - .seq_priv_size = sizeof(struct udp_iter_state), + .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { @@ -3410,7 +3601,7 @@ static struct bpf_iter_reg udp_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; |