summaryrefslogtreecommitdiff
path: root/net/xfrm
diff options
context:
space:
mode:
authorEyal Birger <eyal.birger@gmail.com>2024-05-27 20:29:14 -0700
committerSteffen Klassert <steffen.klassert@secunet.com>2024-06-26 13:22:42 +0200
commitf531d13bdfe3f4f084aaa8acae2cb0f02295f5ae (patch)
tree4a90306144790ba365971577562c3f74a4eac04e /net/xfrm
parent5233a55a5254ea38dcdd8d836a0f9ee886c3df51 (diff)
xfrm: support sending NAT keepalives in ESP in UDP states
Add the ability to send out RFC-3948 NAT keepalives from the xfrm stack. To use, Userspace sets an XFRM_NAT_KEEPALIVE_INTERVAL integer property when creating XFRM outbound states which denotes the number of seconds between keepalive messages. Keepalive messages are sent from a per net delayed work which iterates over the xfrm states. The logic is guarded by the xfrm state spinlock due to the xfrm state walk iterator. Possible future enhancements: - Adding counters to keep track of sent keepalives. - deduplicate NAT keepalives between states sharing the same nat keepalive parameters. - provisioning hardware offloads for devices capable of implementing this. - revise xfrm state list to use an rcu list in order to avoid running this under spinlock. Suggested-by: Paul Wouters <paul.wouters@aiven.io> Tested-by: Paul Wouters <paul.wouters@aiven.io> Tested-by: Antony Antony <antony.antony@secunet.com> Signed-off-by: Eyal Birger <eyal.birger@gmail.com> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Diffstat (limited to 'net/xfrm')
-rw-r--r--net/xfrm/Makefile3
-rw-r--r--net/xfrm/xfrm_compat.c6
-rw-r--r--net/xfrm/xfrm_nat_keepalive.c292
-rw-r--r--net/xfrm/xfrm_policy.c8
-rw-r--r--net/xfrm/xfrm_state.c17
-rw-r--r--net/xfrm/xfrm_user.c15
6 files changed, 338 insertions, 3 deletions
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 547cec77ba03..512e0b2f8514 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -13,7 +13,8 @@ endif
obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
xfrm_input.o xfrm_output.o \
- xfrm_sysctl.o xfrm_replay.o xfrm_device.o
+ xfrm_sysctl.o xfrm_replay.o xfrm_device.o \
+ xfrm_nat_keepalive.o
obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index 703d4172c7d7..91357ccaf4af 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -131,6 +131,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
[XFRMA_IF_ID] = { .type = NLA_U32 },
[XFRMA_MTIMER_THRESH] = { .type = NLA_U32 },
[XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
+ [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
};
static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb,
@@ -280,9 +281,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
case XFRMA_IF_ID:
case XFRMA_MTIMER_THRESH:
case XFRMA_SA_DIR:
+ case XFRMA_NAT_KEEPALIVE_INTERVAL:
return xfrm_nla_cpy(dst, src, nla_len(src));
default:
- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR);
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL);
pr_warn_once("unsupported nla_type %d\n", src->nla_type);
return -EOPNOTSUPP;
}
@@ -437,7 +439,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
int err;
if (type > XFRMA_MAX) {
- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR);
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL);
NL_SET_ERR_MSG(extack, "Bad attribute");
return -EOPNOTSUPP;
}
diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c
new file mode 100644
index 000000000000..82f0a301683f
--- /dev/null
+++ b/net/xfrm/xfrm_nat_keepalive.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xfrm_nat_keepalive.c
+ *
+ * (c) 2024 Eyal Birger <eyal.birger@gmail.com>
+ */
+
+#include <net/inet_common.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+
+static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4);
+#if IS_ENABLED(CONFIG_IPV6)
+static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6);
+#endif
+
+struct nat_keepalive {
+ struct net *net;
+ u16 family;
+ xfrm_address_t saddr;
+ xfrm_address_t daddr;
+ __be16 encap_sport;
+ __be16 encap_dport;
+ __u32 smark;
+};
+
+static void nat_keepalive_init(struct nat_keepalive *ka, struct xfrm_state *x)
+{
+ ka->net = xs_net(x);
+ ka->family = x->props.family;
+ ka->saddr = x->props.saddr;
+ ka->daddr = x->id.daddr;
+ ka->encap_sport = x->encap->encap_sport;
+ ka->encap_dport = x->encap->encap_dport;
+ ka->smark = xfrm_smark_get(0, x);
+}
+
+static int nat_keepalive_send_ipv4(struct sk_buff *skb,
+ struct nat_keepalive *ka)
+{
+ struct net *net = ka->net;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct sock *sk;
+ __u8 tos = 0;
+ int err;
+
+ flowi4_init_output(&fl4, 0 /* oif */, skb->mark, tos,
+ RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
+ ka->daddr.a4, ka->saddr.a4, ka->encap_dport,
+ ka->encap_sport, sock_net_uid(net, NULL));
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ skb_dst_set(skb, &rt->dst);
+
+ sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4);
+ sock_net_set(sk, net);
+ err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos);
+ sock_net_set(sk, &init_net);
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int nat_keepalive_send_ipv6(struct sk_buff *skb,
+ struct nat_keepalive *ka,
+ struct udphdr *uh)
+{
+ struct net *net = ka->net;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ struct sock *sk;
+ __wsum csum;
+ int err;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = csum_ipv6_magic(&ka->saddr.in6, &ka->daddr.in6,
+ skb->len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_mark = skb->mark;
+ fl6.saddr = ka->saddr.in6;
+ fl6.daddr = ka->daddr.in6;
+ fl6.flowi6_proto = IPPROTO_UDP;
+ fl6.fl6_sport = ka->encap_sport;
+ fl6.fl6_dport = ka->encap_dport;
+
+ sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6);
+ sock_net_set(sk, net);
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ skb_dst_set(skb, dst);
+ err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0);
+ sock_net_set(sk, &init_net);
+ return err;
+}
+#endif
+
+static void nat_keepalive_send(struct nat_keepalive *ka)
+{
+ const int nat_ka_hdrs_len = max(sizeof(struct iphdr),
+ sizeof(struct ipv6hdr)) +
+ sizeof(struct udphdr);
+ const u8 nat_ka_payload = 0xFF;
+ int err = -EAFNOSUPPORT;
+ struct sk_buff *skb;
+ struct udphdr *uh;
+
+ skb = alloc_skb(nat_ka_hdrs_len + sizeof(nat_ka_payload), GFP_ATOMIC);
+ if (unlikely(!skb))
+ return;
+
+ skb_reserve(skb, nat_ka_hdrs_len);
+
+ skb_put_u8(skb, nat_ka_payload);
+
+ uh = skb_push(skb, sizeof(*uh));
+ uh->source = ka->encap_sport;
+ uh->dest = ka->encap_dport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+
+ skb->mark = ka->smark;
+
+ switch (ka->family) {
+ case AF_INET:
+ err = nat_keepalive_send_ipv4(skb, ka);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ err = nat_keepalive_send_ipv6(skb, ka, uh);
+ break;
+#endif
+ }
+ if (err)
+ kfree_skb(skb);
+}
+
+struct nat_keepalive_work_ctx {
+ time64_t next_run;
+ time64_t now;
+};
+
+static int nat_keepalive_work_single(struct xfrm_state *x, int count, void *ptr)
+{
+ struct nat_keepalive_work_ctx *ctx = ptr;
+ bool send_keepalive = false;
+ struct nat_keepalive ka;
+ time64_t next_run;
+ u32 interval;
+ int delta;
+
+ interval = x->nat_keepalive_interval;
+ if (!interval)
+ return 0;
+
+ spin_lock(&x->lock);
+
+ delta = (int)(ctx->now - x->lastused);
+ if (delta < interval) {
+ x->nat_keepalive_expiration = ctx->now + interval - delta;
+ next_run = x->nat_keepalive_expiration;
+ } else if (x->nat_keepalive_expiration > ctx->now) {
+ next_run = x->nat_keepalive_expiration;
+ } else {
+ next_run = ctx->now + interval;
+ nat_keepalive_init(&ka, x);
+ send_keepalive = true;
+ }
+
+ spin_unlock(&x->lock);
+
+ if (send_keepalive)
+ nat_keepalive_send(&ka);
+
+ if (!ctx->next_run || next_run < ctx->next_run)
+ ctx->next_run = next_run;
+ return 0;
+}
+
+static void nat_keepalive_work(struct work_struct *work)
+{
+ struct nat_keepalive_work_ctx ctx;
+ struct xfrm_state_walk walk;
+ struct net *net;
+
+ ctx.next_run = 0;
+ ctx.now = ktime_get_real_seconds();
+
+ net = container_of(work, struct net, xfrm.nat_keepalive_work.work);
+ xfrm_state_walk_init(&walk, IPPROTO_ESP, NULL);
+ xfrm_state_walk(net, &walk, nat_keepalive_work_single, &ctx);
+ xfrm_state_walk_done(&walk, net);
+ if (ctx.next_run)
+ schedule_delayed_work(&net->xfrm.nat_keepalive_work,
+ (ctx.next_run - ctx.now) * HZ);
+}
+
+static int nat_keepalive_sk_init(struct sock * __percpu *socks,
+ unsigned short family)
+{
+ struct sock *sk;
+ int err, i;
+
+ for_each_possible_cpu(i) {
+ err = inet_ctl_sock_create(&sk, family, SOCK_RAW, IPPROTO_UDP,
+ &init_net);
+ if (err < 0)
+ goto err;
+
+ *per_cpu_ptr(socks, i) = sk;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+ return err;
+}
+
+static void nat_keepalive_sk_fini(struct sock * __percpu *socks)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+}
+
+void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)
+{
+ struct net *net;
+
+ if (!x->nat_keepalive_interval)
+ return;
+
+ net = xs_net(x);
+ schedule_delayed_work(&net->xfrm.nat_keepalive_work, 0);
+}
+
+int __net_init xfrm_nat_keepalive_net_init(struct net *net)
+{
+ INIT_DELAYED_WORK(&net->xfrm.nat_keepalive_work, nat_keepalive_work);
+ return 0;
+}
+
+int xfrm_nat_keepalive_net_fini(struct net *net)
+{
+ cancel_delayed_work_sync(&net->xfrm.nat_keepalive_work);
+ return 0;
+}
+
+int xfrm_nat_keepalive_init(unsigned short family)
+{
+ int err = -EAFNOSUPPORT;
+
+ switch (family) {
+ case AF_INET:
+ err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv4, PF_INET);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv6, PF_INET6);
+ break;
+#endif
+ }
+
+ if (err)
+ pr_err("xfrm nat keepalive init: failed to init err:%d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_init);
+
+void xfrm_nat_keepalive_fini(unsigned short family)
+{
+ switch (family) {
+ case AF_INET:
+ nat_keepalive_sk_fini(&nat_keepalive_sk_ipv4);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ nat_keepalive_sk_fini(&nat_keepalive_sk_ipv6);
+ break;
+#endif
+ }
+}
+EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_fini);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 475b904fe68b..6603d3bd171f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4289,8 +4289,14 @@ static int __net_init xfrm_net_init(struct net *net)
if (rv < 0)
goto out_sysctl;
+ rv = xfrm_nat_keepalive_net_init(net);
+ if (rv < 0)
+ goto out_nat_keepalive;
+
return 0;
+out_nat_keepalive:
+ xfrm_sysctl_fini(net);
out_sysctl:
xfrm_policy_fini(net);
out_policy:
@@ -4303,6 +4309,7 @@ out_statistics:
static void __net_exit xfrm_net_exit(struct net *net)
{
+ xfrm_nat_keepalive_net_fini(net);
xfrm_sysctl_fini(net);
xfrm_policy_fini(net);
xfrm_state_fini(net);
@@ -4364,6 +4371,7 @@ void __init xfrm_init(void)
#endif
register_xfrm_state_bpf();
+ xfrm_nat_keepalive_init(AF_INET);
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 649bb739df0d..abadc857cd45 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -715,6 +715,7 @@ int __xfrm_state_delete(struct xfrm_state *x)
if (x->id.spi)
hlist_del_rcu(&x->byspi);
net->xfrm.state_num--;
+ xfrm_nat_keepalive_state_updated(x);
spin_unlock(&net->xfrm.xfrm_state_lock);
if (x->encap_sk)
@@ -1453,6 +1454,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
net->xfrm.state_num++;
xfrm_hash_grow_check(net, x->bydst.next != NULL);
+ xfrm_nat_keepalive_state_updated(x);
}
/* net->xfrm.xfrm_state_lock is held */
@@ -2871,6 +2873,21 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
goto error;
}
+ if (x->nat_keepalive_interval) {
+ if (x->dir != XFRM_SA_DIR_OUT) {
+ NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs");
+ err = -EINVAL;
+ goto error;
+ }
+
+ if (!x->encap || x->encap->encap_type != UDP_ENCAP_ESPINUDP) {
+ NL_SET_ERR_MSG(extack,
+ "NAT keepalive is only supported for UDP encapsulation");
+ err = -EINVAL;
+ goto error;
+ }
+ }
+
error:
return err;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index e83c687bd64e..a552cfa623ea 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -833,6 +833,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (attrs[XFRMA_SA_DIR])
x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);
+ if (attrs[XFRMA_NAT_KEEPALIVE_INTERVAL])
+ x->nat_keepalive_interval =
+ nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]);
+
err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
if (err)
goto error;
@@ -1288,6 +1292,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
}
if (x->dir)
ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
+
+ if (x->nat_keepalive_interval) {
+ ret = nla_put_u32(skb, XFRMA_NAT_KEEPALIVE_INTERVAL,
+ x->nat_keepalive_interval);
+ if (ret)
+ goto out;
+ }
out:
return ret;
}
@@ -3165,6 +3176,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_IF_ID] = { .type = NLA_U32 },
[XFRMA_MTIMER_THRESH] = { .type = NLA_U32 },
[XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
+ [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(xfrma_policy);
@@ -3474,6 +3486,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
if (x->dir)
l += nla_total_size(sizeof(x->dir));
+ if (x->nat_keepalive_interval)
+ l += nla_total_size(sizeof(x->nat_keepalive_interval));
+
return l;
}