From 04292c695f82b6cf0d25dd5ae494f16ddbb621f6 Mon Sep 17 00:00:00 2001 From: Abhijeet Rastogi Date: Tue, 16 May 2023 20:08:49 -0700 Subject: ipvs: increase ip_vs_conn_tab_bits range for 64BIT Current range [8, 20] is set purely due to historical reasons because at the time, ~1M (2^20) was considered sufficient. With this change, 27 is the upper limit for 64-bit, 20 otherwise. Previous change regarding this limit is here. Link: https://lore.kernel.org/all/86eabeb9dd62aebf1e2533926fdd13fed48bab1f.1631289960.git.aclaudi@redhat.com/T/#u Signed-off-by: Abhijeet Rastogi Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/Kconfig | 27 ++++++++++++++------------- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 271da8447b29..2a3017b9c001 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -44,7 +44,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - range 8 20 + range 8 20 if !64BIT + range 8 27 if 64BIT default 12 help The IPVS connection hash table uses the chaining scheme to handle @@ -54,24 +55,24 @@ config IP_VS_TAB_BITS Note the table size must be power of 2. The table size will be the value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). + from 8 to 27 for 64BIT(20 otherwise), the default number is 12, + which means the table size is 4096. Don't input the number too + small, otherwise you will lose performance on it. You can adapt the + table size yourself, according to your virtual server application. + It is good to set the table size not far less than the number of + connections per second multiplying average lasting time of + connection in the table. For example, your virtual server gets 200 + connections per second, the connection lasts for 200 seconds in + average in the connection table, the table size should be not far + less than 200x200, it is good to set the table size 32768 (2**15). Another note that each connection occupies 128 bytes effectively and each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. You can overwrite this number setting conn_tab_bits module parameter - or by appending ip_vs.conn_tab_bits=? to the kernel command line - if IP VS was compiled built-in. + or by appending ip_vs.conn_tab_bits=? to the kernel command line if + IP VS was compiled built-in. comment "IPVS transport protocol load balancing support" diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 928e64653837..f4c55e65abd1 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1485,8 +1485,8 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ - if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { - pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 27) { + pr_info("conn_tab_bits not in [8, 27]. Using default value\n"); ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; -- cgit v1.2.3-58-ga151 From 4f325e26277b6a1381235008ca6fa97e6cc8f43b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 17 May 2023 15:37:31 +0300 Subject: ipvs: dynamically limit the connection hash table As we allow the hash table to be configured to rows above 2^20, we should limit it depending on the available memory to some sane values. Switch to kvmalloc allocation to better select the needed allocation type. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index f4c55e65abd1..9065da3cdd12 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -26,7 +26,6 @@ #include #include #include -#include #include /* for proc_net_* */ #include #include @@ -1482,13 +1481,21 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { size_t tab_array_size; + int max_avail; +#if BITS_PER_LONG > 32 + int max = 27; +#else + int max = 20; +#endif + int min = 8; int idx; - /* Compute size and mask */ - if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 27) { - pr_info("conn_tab_bits not in [8, 27]. Using default value\n"); - ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; - } + max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; + max_avail -= 2; /* ~4 in hash row */ + max_avail -= 1; /* IPVS up to 1/2 of mem */ + max_avail -= order_base_2(sizeof(struct ip_vs_conn)); + max = clamp(max, min, max_avail); + ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; @@ -1497,7 +1504,8 @@ int __init ip_vs_conn_init(void) */ tab_array_size = array_size(ip_vs_conn_tab_size, sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = vmalloc(tab_array_size); + ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab), GFP_KERNEL); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1506,7 +1514,7 @@ int __init ip_vs_conn_init(void) sizeof(struct ip_vs_conn), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); return -ENOMEM; } @@ -1534,5 +1542,5 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); } -- cgit v1.2.3-58-ga151 From de6843be3082d416eaf2a00b72dad95c784ca980 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Jun 2023 09:38:42 +0200 Subject: netfilter: nft_payload: rebuild vlan header when needed Skip rebuilding the vlan header when accessing destination and source mac address. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 3a3c7746e88f..8cb800989947 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -171,7 +171,8 @@ void nft_payload_eval(const struct nft_expr *expr, if (!skb_mac_header_was_set(skb)) goto err; - if (skb_vlan_tag_present(skb)) { + if (skb_vlan_tag_present(skb) && + priv->offset >= offsetof(struct ethhdr, h_proto)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; -- cgit v1.2.3-58-ga151 From 78aa23d0081b2029a5763c4f7d396bf2666c0c87 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Jun 2023 13:58:27 +0200 Subject: netfilter: ipset: remove rcu_read_lock_bh pair from ip_set_test Callers already hold rcu_read_lock. Prior to RCU conversion this used to be a read_lock_bh(), but now the bh-disable isn't needed anymore. Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 46ebee9400da..a77c75f6dfb9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -739,9 +739,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ -- cgit v1.2.3-58-ga151 From 96b2ef9b16cb302d0b47c5670d30a05963e0e1e3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Jun 2023 14:08:49 +0200 Subject: netfilter: nf_tables: permit update of set size Now that set->nelems is always updated permit update of the sets max size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2e24ea1d744c..89b1ac4e6d4a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1589,6 +1589,7 @@ struct nft_trans_set { u64 timeout; bool update; bool bound; + u32 size; }; #define nft_trans_set(trans) \ @@ -1603,6 +1604,8 @@ struct nft_trans_set { (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) +#define nft_trans_set_size(trans) \ + (((struct nft_trans_set *)trans->data)->size) struct nft_trans_chain { bool update; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0396fd8f4e71..dfd441ff1e3e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -483,6 +483,7 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_trans_set_update(trans) = true; nft_trans_set_gc_int(trans) = desc->gc_int; nft_trans_set_timeout(trans) = desc->timeout; + nft_trans_set_size(trans) = desc->size; } nft_trans_commit_list_add_tail(ctx->net, trans); @@ -9428,6 +9429,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); + + if (nft_trans_set_size(trans)) + WRITE_ONCE(set->size, nft_trans_set_size(trans)); } else { nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table -- cgit v1.2.3-58-ga151 From 4589725502871e77d06464f731f92fd9173e2be6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Jun 2023 22:59:30 +0200 Subject: netfilter: snat: evict closing tcp entries on reply tuple collision When all tried source tuples are in use, the connection request (skb) and the new conntrack will be dropped in nf_confirm() due to the non-recoverable clash. Make it so that the last 32 attempts are allowed to evict a colliding entry if this connection is already closing and the new sequence number has advanced past the old one. Such "all tuples taken" secenario can happen with tcp-rpc workloads where same dst:dport gets queried repeatedly. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 92 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ce829d434f13..fadbd4ed3dc0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -27,6 +27,9 @@ #include "nf_internals.h" +#define NF_NAT_MAX_ATTEMPTS 128 +#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) + static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); @@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) +{ + static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | + IPS_DYING; + static const unsigned long flags_needed = IPS_SRC_NAT; + enum tcp_conntrack old_state; + + old_state = READ_ONCE(ct->proto.tcp.state); + if (old_state < TCP_CONNTRACK_TIME_WAIT) + return false; + + if (flags & flags_refuse) + return false; + + return (flags & flags_needed) == flags_needed; +} + +/* reverse direction will send packets to new source, so + * make sure such packets are invalid. + */ +static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) +{ + return (__s32)(new->proto.tcp.seen[0].td_end - + old->proto.tcp.seen[0].td_end) > 0; +} + +static int +nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack, + unsigned int attempts_left) +{ + static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; + struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple reply; + unsigned long flags; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + nf_ct_invert_tuple(&reply, tuple); + + if (attempts_left > NF_NAT_HARDER_THRESH || + tuple->dst.protonum != IPPROTO_TCP || + ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); + + /* :ast few attempts to find a free tcp port. Destructive + * action: evict colliding if its in timewait state and the + * tcp sequence number has advanced past the one used by the + * old entry. + */ + net = nf_ct_net(ignored_conntrack); + zone = nf_ct_zone(ignored_conntrack); + + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(thash); + + if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) + goto out; + + if (WARN_ON_ONCE(ct == ignored_conntrack)) + goto out; + + flags = READ_ONCE(ct->status); + if (!nf_nat_may_kill(ct, flags)) + goto out; + + if (!nf_seq_has_advanced(ct, ignored_conntrack)) + goto out; + + /* Even if we can evict do not reuse if entry is offloaded. */ + if (nf_ct_kill(ct)) + taken = flags & flags_offload; +out: + nf_ct_put(ct); + return taken; +} + static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { @@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; - static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { case IPPROTO_ICMP: @@ -471,8 +555,8 @@ find_free_id: off = get_random_u16(); attempts = range_size; - if (attempts > max_attempts) - attempts = max_attempts; + if (attempts > NF_NAT_MAX_ATTEMPTS) + attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. @@ -483,7 +567,7 @@ find_free_id: another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); - if (!nf_nat_used_tuple(tuple, ct)) + if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } -- cgit v1.2.3-58-ga151 From 079cd633219d7298d087cd115c17682264244c18 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 15 Jun 2023 16:31:40 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for set elements with a timeout or attached stateful expressions like counters or quotas - reset them all at once. Respect a per element timeout value if present to reset the 'expires' value to. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 68 ++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e059dc2644df..8466c2a9938f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -105,6 +105,7 @@ enum nft_verdicts { * @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes) * @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes) * @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -140,6 +141,7 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYSETELEM, NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, + NFT_MSG_GETSETELEM_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dfd441ff1e3e..30fd62224df9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5229,7 +5229,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_ext *ext) + const struct nft_set_ext *ext, + bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; @@ -5242,7 +5243,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; @@ -5253,7 +5254,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5266,11 +5267,13 @@ nla_put_failure: static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + u64 timeout = 0; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) @@ -5293,7 +5296,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && - nft_set_elem_expr_dump(skb, set, ext)) + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5306,11 +5309,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + timeout = *nft_set_ext_timeout(ext); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + nf_jiffies64_to_msecs(timeout), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = READ_ONCE(set->timeout); + } if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { u64 expires, now = get_jiffies_64(); @@ -5325,6 +5332,9 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; + + if (reset) + *nft_set_ext_expiration(ext) = now + timeout; } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5348,6 +5358,7 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, @@ -5358,7 +5369,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set_dump_args *args; args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } struct nft_set_dump_ctx { @@ -5367,7 +5378,7 @@ struct nft_set_dump_ctx { }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, - const struct nft_set *set) + const struct nft_set *set, bool reset) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); @@ -5382,7 +5393,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, continue; elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem); + ret = nf_tables_fill_setelem(skb, set, &elem, reset); break; } @@ -5400,6 +5411,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; + bool reset = false; u32 portid, seq; int event; @@ -5447,8 +5459,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + args.cb = cb; args.skb = skb; + args.reset = reset; args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; @@ -5457,7 +5473,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) - args.iter.err = nft_set_catchall_dump(net, skb, set); + args.iter.err = nft_set_catchall_dump(net, skb, set, reset); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -5495,7 +5511,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; @@ -5516,7 +5533,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem, reset); if (err < 0) goto nla_put_failure; @@ -5622,7 +5639,7 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, } static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; @@ -5666,7 +5683,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, &elem, + reset); if (err < 0) goto err_fill_setelem; @@ -5690,6 +5708,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; + bool reset = false; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, @@ -5724,8 +5743,11 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); + err = nft_get_set_elem(&ctx, set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -5758,7 +5780,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem); + set, elem, false); if (err < 0) { kfree_skb(skb); goto err; @@ -8715,6 +8737,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETSETELEM_RESET] = { + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, -- cgit v1.2.3-58-ga151 From a412dbf40ff37515acca4bba666f5386aa37246e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Jun 2023 21:11:03 +0200 Subject: netfilter: nf_tables: limit allowed range via nla_policy These NLA_U32 types get stored in u8 fields, reject invalid values instead of silently casting to u8. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 2 +- net/netfilter/nft_byteorder.c | 6 +++--- net/netfilter/nft_ct.c | 2 +- net/netfilter/nft_dynset.c | 2 +- net/netfilter/nft_exthdr.c | 4 ++-- net/netfilter/nft_fwd_netdev.c | 2 +- net/netfilter/nft_hash.c | 2 +- net/netfilter/nft_meta.c | 2 +- net/netfilter/nft_range.c | 2 +- net/netfilter/nft_reject.c | 2 +- net/netfilter/nft_rt.c | 2 +- net/netfilter/nft_socket.c | 4 ++-- net/netfilter/nft_tproxy.c | 2 +- net/netfilter/nft_tunnel.c | 4 ++-- net/netfilter/nft_xfrm.c | 4 ++-- 15 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 84eae7cabc67..14e3c44ef959 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -86,7 +86,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, - [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index b66647a5a171..9a85e797ed58 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -88,9 +88,9 @@ void nft_byteorder_eval(const struct nft_expr *expr, static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b9c84499438b..38958e067aa8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -332,7 +332,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, - [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index bd19c7aec92e..4fb34d76dbea 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -148,7 +148,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, - [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 671474e59817..7f856ceb3a66 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -487,9 +487,9 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, - [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, - [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 7b9d4d1bd17c..a5268e6dd32f 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -40,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, - [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ee8d487b69c0..92d47e469204 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -59,7 +59,7 @@ static void nft_symhash_eval(const struct nft_expr *expr, static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, - [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e384e0de7a54..8fdc7318c03c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -458,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval); const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, - [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_META_SREG] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_meta_policy); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 0566d6aaf1e5..51ae64cd268f 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr, static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { [NFTA_RANGE_SREG] = { .type = NLA_U32 }, - [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f2addc844dd2..ed2e668474d6 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -18,7 +18,7 @@ #include const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 5990fdd7b3cc..35a2c28caa60 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -104,7 +104,7 @@ err: static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = { .type = NLA_U32 }, - [NFTA_RT_KEY] = { .type = NLA_U32 }, + [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_rt_get_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 85f8df87efda..84def74698b7 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -138,9 +138,9 @@ static void nft_socket_eval(const struct nft_expr *expr, } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { - [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, - [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_socket_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index ea83f661417e..ae15cd693f0e 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -183,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { - [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 }, + [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index b059aa541798..9f21953c7433 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -66,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { - [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, - [NFTA_TUNNEL_MODE] = { .type = NLA_U32 }, + [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index c88fd078a9ae..452f8587adda 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -16,9 +16,9 @@ #include static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { - [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, - [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; -- cgit v1.2.3-58-ga151