From b0355dbbf13c0052931dd14c38c789efed64d3de Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Thu, 5 Jan 2023 21:28:12 +0000 Subject: Fix XFRM-I support for nested ESP tunnels This change adds support for nested IPsec tunnels by ensuring that XFRM-I verifies existing policies before decapsulating a subsequent policies. Addtionally, this clears the secpath entries after policies are verified, ensuring that previous tunnels with no-longer-valid do not pollute subsequent policy checks. This is necessary especially for nested tunnels, as the IP addresses, protocol and ports may all change, thus not matching the previous policies. In order to ensure that packets match the relevant inbound templates, the xfrm_policy_check should be done before handing off to the inner XFRM protocol to decrypt and decapsulate. Notably, raw ESP/AH packets did not perform policy checks inherently, whereas all other encapsulated packets (UDP, TCP encapsulated) do policy checks after calling xfrm_input handling in the respective encapsulation layer. Test: Verified with additional Android Kernel Unit tests Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 54 ++++++++++++++++++++++++++++++++++++++---- net/xfrm/xfrm_policy.c | 3 +++ 2 files changed, 53 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e9eb82c5457d..ed0976f8e42b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3742,6 +3742,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } + if (if_id) + secpath_reset(skb); + xfrm_pols_put(pols, npols); return 1; } -- cgit v1.2.3-58-ga151 From eb6c59b735aa6cca77cdbb59cc69d69a0d63d986 Mon Sep 17 00:00:00 2001 From: Anastasia Belova Date: Tue, 10 Jan 2023 12:14:50 +0300 Subject: xfrm: compat: change expression for switch in xfrm_xlate64 Compare XFRM_MSG_NEWSPDINFO (value from netlink configuration messages enum) with nlh_src->nlmsg_type instead of nlh_src->nlmsg_type - XFRM_MSG_BASE. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 4e9505064f58 ("net/xfrm/compat: Copy xfrm_spdattr_type_t atributes") Signed-off-by: Anastasia Belova Acked-by: Dmitry Safonov <0x7f454c46@gmail.com> Tested-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index a0f62fa02e06..12405aa5bce8 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -302,7 +302,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) nla_for_each_attr(nla, attrs, len, remaining) { int err; - switch (type) { + switch (nlh_src->nlmsg_type) { case XFRM_MSG_NEWSPDINFO: err = xfrm_nla_cpy(dst, nla, nla_len(nla)); break; -- cgit v1.2.3-58-ga151 From b6ee896385380aa621102e8ea402ba12db1cabff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jan 2023 13:02:49 +0000 Subject: xfrm/compat: prevent potential spectre v1 gadget in xfrm_xlate32_attr() int type = nla_type(nla); if (type > XFRMA_MAX) { return -EOPNOTSUPP; } @type is then used as an array index and can be used as a Spectre v1 gadget. if (nla_len(nla) < compat_policy[type].len) { array_index_nospec() can be used to prevent leaking content of kernel memory to malicious users. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Signed-off-by: Eric Dumazet Cc: Dmitry Safonov Cc: Steffen Klassert Reviewed-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 12405aa5bce8..8cbf45a8bcdc 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -5,6 +5,7 @@ * Based on code and translator idea by: Florian Westphal */ #include +#include #include #include @@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } + type = array_index_nospec(type, XFRMA_MAX + 1); if (nla_len(nla) < compat_policy[type].len) { NL_SET_ERR_MSG(extack, "Attribute bad length"); return -EOPNOTSUPP; -- cgit v1.2.3-58-ga151 From 195e4aac74ce67385f95432301da6db3c2cfc72a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jan 2023 11:21:29 +0000 Subject: xfrm: consistently use time64_t in xfrm_timer_handler() For some reason, blamed commit did the right thing in xfrm_policy_timer() but did not in xfrm_timer_handler() Fixes: 386c5680e2e8 ("xfrm: use time64_t for in-kernel timestamps") Signed-off-by: Eric Dumazet Cc: Arnd Bergmann Cc: Steffen Klassert Acked-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 89c731f4f0c7..5f03d1fbb98e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -577,7 +577,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { - long tmo = x->lft.hard_add_expires_seconds + + time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { @@ -594,7 +594,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) next = tmo; } if (x->lft.hard_use_expires_seconds) { - long tmo = x->lft.hard_use_expires_seconds + + time64_t tmo = x->lft.hard_use_expires_seconds + (x->curlft.use_time ? : now) - now; if (tmo <= 0) goto expired; @@ -604,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { - long tmo = x->lft.soft_add_expires_seconds + + time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; @@ -616,7 +616,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } } if (x->lft.soft_use_expires_seconds) { - long tmo = x->lft.soft_use_expires_seconds + + time64_t tmo = x->lft.soft_use_expires_seconds + (x->curlft.use_time ? : now) - now; if (tmo <= 0) warn = 1; -- cgit v1.2.3-58-ga151 From 0a9e5794b21e2d1303759ff8fe5f9215db7757ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jan 2023 11:21:30 +0000 Subject: xfrm: annotate data-race around use_time KCSAN reported multiple cpus can update use_time at the same time. Adds READ_ONCE()/WRITE_ONCE() annotations. Note that 32bit arches are not fully protected, but they will probably no longer be supported/used in 2106. BUG: KCSAN: data-race in __xfrm_policy_check / __xfrm_policy_check write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 0: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 1: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0000000063c62d6f -> 0x0000000063c62d70 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4185 Comm: kworker/1:2 Tainted: G W 6.2.0-rc4-syzkaller-00009-gd532dd102151-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: wg-crypt-wg0 wg_packet_tx_worker Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 11 +++++++---- net/xfrm/xfrm_state.c | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ed0976f8e42b..5c61ec04b839 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -3661,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. */ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3676,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. */ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5f03d1fbb98e..00afe831c71c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -595,7 +595,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -617,7 +617,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -1906,7 +1906,7 @@ out: hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1940,8 +1940,8 @@ int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { -- cgit v1.2.3-58-ga151 From 6028da3f125fec34425dbd5fec18e85d372b2af6 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 26 Jan 2023 11:33:50 -0500 Subject: xfrm: fix bug with DSCP copy to v6 from v4 tunnel When copying the DSCP bits for decap-dscp into IPv6 don't assume the outer encap is always IPv6. Instead, as with the inner IPv4 case, copy the DSCP bits from the correctly saved "tos" value in the control block. Fixes: 227620e29509 ("[IPSEC]: Separate inner/outer mode processing on input") Signed-off-by: Christian Hopps Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c06e54a10540..436d29640ac2 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -279,8 +279,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); -- cgit v1.2.3-58-ga151 From c1d2ecdf5e38e3489ce8328238b558b3b2866fe1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 2 Feb 2023 17:25:51 +0200 Subject: neigh: make sure used and confirmed times are valid Entries can linger in cache without timer for days, thanks to the gc_thresh1 limit. As result, without traffic, the confirmed time can be outdated and to appear to be in the future. Later, on traffic, NUD_STALE entries can switch to NUD_DELAY and start the timer which can see the invalid confirmed time and wrongly switch to NUD_REACHABLE state instead of NUD_PROBE. As result, timer is set many days in the future. This is more visible on 32-bit platforms, with higher HZ value. Why this is a problem? While we expect unused entries to expire, such entries stay in REACHABLE state for too long, locked in cache. They are not expired normally, only when cache is full. Problem and the wrong state change reported by Zhang Changzhong: 172.16.1.18 dev bond0 lladdr 0a:0e:0f:01:12:01 ref 1 used 350521/15994171/350520 probes 4 REACHABLE 350520 seconds have elapsed since this entry was last updated, but it is still in the REACHABLE state (base_reachable_time_ms is 30000), preventing lladdr from being updated through probe. Fix it by ensuring timer is started with valid used/confirmed times. Considering the valid time range is LONG_MAX jiffies, we try not to go too much in the past while we are in DELAY/PROBE state. There are also places that need used/updated times to be validated while timer is not running. Reported-by: Zhang Changzhong Signed-off-by: Julian Anastasov Tested-by: Zhang Changzhong Signed-off-by: David S. Miller --- net/core/neighbour.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f00a79fc301b..4edd2176e238 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); @@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work) goto next_elt; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; neigh_mark_dead(n); write_unlock(&n->lock); -- cgit v1.2.3-58-ga151 From 565b4824c39fa335cba2028a09d7beb7112f3c9a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 6 Feb 2023 10:41:51 +0100 Subject: devlink: change port event netdev notifier from per-net to global Currently only the network namespace of devlink instance is monitored for port events. If netdev is moved to a different namespace and then unregistered, NETDEV_PRE_UNINIT is missed which leads to trigger following WARN_ON in devl_port_unregister(). WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); Fix this by changing the netdev notifier from per-net to global so no event is missed. Fixes: 02a68a47eade ("net: devlink: track netdev with devlink_port assigned") Signed-off-by: Jiri Pirko Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230206094151.2557264-1-jiri@resnulli.us Signed-off-by: Paolo Abeni --- net/core/devlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 032d6d0a5ce6..909a10e4b0dd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9979,7 +9979,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, goto err_xa_alloc; devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + ret = register_netdevice_notifier(&devlink->netdevice_nb); if (ret) goto err_register_netdevice_notifier; @@ -10171,8 +10171,7 @@ void devlink_free(struct devlink *devlink) xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->ports); - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); + WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); xa_erase(&devlinks, devlink->index); @@ -10503,6 +10502,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this @@ -10512,6 +10513,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, netdev); break; case NETDEV_UNREGISTER: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. -- cgit v1.2.3-58-ga151 From 4ae5e1e97c44f4654516c1d41591a462ed62fa7b Mon Sep 17 00:00:00 2001 From: Devid Antonio Filoni Date: Fri, 25 Nov 2022 18:04:18 +0100 Subject: can: j1939: do not wait 250 ms if the same addr was already claimed The ISO 11783-5 standard, in "4.5.2 - Address claim requirements", states: d) No CF shall begin, or resume, transmission on the network until 250 ms after it has successfully claimed an address except when responding to a request for address-claimed. But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim prioritization" show that the CF begins the transmission after 250 ms from the first AC (address-claimed) message even if it sends another AC message during that time window to resolve the address contention with another CF. As stated in "4.4.2.3 - Address-claimed message": In order to successfully claim an address, the CF sending an address claimed message shall not receive a contending claim from another CF for at least 250 ms. As stated in "4.4.3.2 - NAME management (NM) message": 1) A commanding CF can d) request that a CF with a specified NAME transmit the address- claimed message with its current NAME. 2) A target CF shall d) send an address-claimed message in response to a request for a matching NAME Taking the above arguments into account, the 250 ms wait is requested only during network initialization. Do not restart the timer on AC message if both the NAME and the address match and so if the address has already been claimed (timer has expired) or the AC message has been sent to resolve the contention with another CF (timer is still running). Signed-off-by: Devid Antonio Filoni Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/20221125170418.34575-1-devid.filoni@egluetechnologies.com Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/j1939/address-claim.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c index f33c47327927..ca4ad6cdd5cb 100644 --- a/net/can/j1939/address-claim.c +++ b/net/can/j1939/address-claim.c @@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); + + if (ecu && ecu->addr == skcb->addr.sa) { + /* The ISO 11783-5 standard, in "4.5.2 - Address claim + * requirements", states: + * d) No CF shall begin, or resume, transmission on the + * network until 250 ms after it has successfully claimed + * an address except when responding to a request for + * address-claimed. + * + * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim + * prioritization" show that the CF begins the transmission + * after 250 ms from the first AC (address-claimed) message + * even if it sends another AC message during that time window + * to resolve the address contention with another CF. + * + * As stated in "4.4.2.3 - Address-claimed message": + * In order to successfully claim an address, the CF sending + * an address claimed message shall not receive a contending + * claim from another CF for at least 250 ms. + * + * As stated in "4.4.3.2 - NAME management (NM) message": + * 1) A commanding CF can + * d) request that a CF with a specified NAME transmit + * the address-claimed message with its current NAME. + * 2) A target CF shall + * d) send an address-claimed message in response to a + * request for a matching NAME + * + * Taking the above arguments into account, the 250 ms wait is + * requested only during network initialization. + * + * Do not restart the timer on AC message if both the NAME and + * the address match and so if the address has already been + * claimed (timer has expired) or the AC message has been sent + * to resolve the contention with another CF (timer is still + * running). + */ + goto out_ecu_put; + } + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); -- cgit v1.2.3-58-ga151 From 9cec2aaffe969f2a3e18b5ec105fc20bb908e475 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Feb 2023 16:18:32 +0300 Subject: net: sched: sch: Fix off by one in htb_activate_prios() The > needs be >= to prevent an out of bounds access. Fixes: de5ca4c3852f ("net: sched: sch: Bounds check priority") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/Y+D+KN18FQI2DKLq@kili Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cc28e41fb745..92f2975b6a82 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -433,7 +433,7 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (m) { unsigned int prio = ffz(~m); - if (WARN_ON_ONCE(prio > ARRAY_SIZE(p->inner.clprio))) + if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio))) break; m &= ~(1 << prio); -- cgit v1.2.3-58-ga151 From c11204c78d6966c5bda6dd05c3ac5cbb193f93e3 Mon Sep 17 00:00:00 2001 From: Kevin Yang Date: Tue, 7 Feb 2023 02:08:20 +0000 Subject: txhash: fix sk->sk_txrehash default This code fix a bug that sk->sk_txrehash gets its default enable value from sysctl_txrehash only when the socket is a TCP listener. We should have sysctl_txrehash to set the default sk->sk_txrehash, no matter TCP, nor listerner/connector. Tested by following packetdrill: 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 socket(..., SOCK_DGRAM, IPPROTO_UDP) = 4 // SO_TXREHASH == 74, default to sysctl_txrehash == 1 +0 getsockopt(3, SOL_SOCKET, 74, [1], [4]) = 0 +0 getsockopt(4, SOL_SOCKET, 74, [1], [4]) = 0 Fixes: 26859240e4ee ("txhash: Add socket option to control TX hash rethink behavior") Signed-off-by: Kevin Yang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 3 ++- net/ipv4/af_inet.c | 1 + net/ipv4/inet_connection_sock.c | 3 --- net/ipv6/af_inet6.c | 1 + 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f954d5893e79..6f27c24016fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1531,6 +1531,8 @@ set_sndbuf: ret = -EINVAL; break; } + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -3451,7 +3453,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; - sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6c0ec2789943..cf11f10927e1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -347,6 +347,7 @@ lookup_protocol: sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet->mc_loop = 1; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1f837579398..f2c43f67187d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1225,9 +1225,6 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); - if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) - sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fee9163382c2..847934763868 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,6 +222,7 @@ lookup_protocol: np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. -- cgit v1.2.3-58-ga151 From d4e85922e3e7ef2071f91f65e61629b60f3a9cf4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:13 +0100 Subject: mptcp: do not wait for bare sockets' timeout If the peer closes all the existing subflows for a given mptcp socket and later the application closes it, the current implementation let it survive until the timewait timeout expires. While the above is allowed by the protocol specification it consumes resources for almost no reason and additionally causes sporadic self-tests failures. Let's move the mptcp socket to the TCP_CLOSE state when there are no alive subflows at close time, so that the allocated resources will be freed immediately. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cd6cc67c2c5..bc6c1f62a690 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2897,6 +2897,7 @@ bool __mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; + int subflows_alive = 0; sk->sk_shutdown = SHUTDOWN_MASK; @@ -2922,6 +2923,8 @@ cleanup: struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + subflows_alive += ssk->sk_state != TCP_CLOSE; + /* since the close timeout takes precedence on the fail one, * cancel the latter */ @@ -2937,6 +2940,12 @@ cleanup: } sock_orphan(sk); + /* all the subflows are closed, only timeout can change the msk + * state, let's not keep resources busy for no reasons + */ + if (subflows_alive == 0) + inet_sk_state_store(sk, TCP_CLOSE); + sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (msk->token) -- cgit v1.2.3-58-ga151 From 21e43569685de4ad773fb060c11a15f3fd5e7ac4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:14 +0100 Subject: mptcp: fix locking for setsockopt corner-case We need to call the __mptcp_nmpc_socket(), and later subflow socket access under the msk socket lock, or e.g. a racing connect() could change the socket status under the hood, with unexpected results. Fixes: 54635bd04701 ("mptcp: add TCP_FASTOPEN_CONNECT socket option") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4b1e6ec1b36..7f2c3727ab23 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (struct sock *)msk; struct socket *sock; + int ret = -EINVAL; /* Limit to first subflow, before the connection establishment */ + lock_sock(sk); sock = __mptcp_nmpc_socket(msk); if (!sock) - return -EINVAL; + goto unlock; - return tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + +unlock: + release_sock(sk); + return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, -- cgit v1.2.3-58-ga151 From ad2171009d968104ccda9dc517f5a3ba891515db Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:15 +0100 Subject: mptcp: fix locking for in-kernel listener creation For consistency, in mptcp_pm_nl_create_listen_socket(), we need to call the __mptcp_nmpc_socket() under the msk socket lock. Note that as a side effect, mptcp_subflow_create_socket() needs a 'nested' lockdep annotation, as it will acquire the subflow (kernel) socket lock under the in-kernel listener msk socket lock. The current lack of locking is almost harmless, because the relevant socket is not exposed to the user space, but in future we will add more complexity to the mentioned helper, let's play safe. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++---- net/mptcp/subflow.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2ea7eae43bdb..10fe9771a852 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -998,8 +998,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, { int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct mptcp_sock *msk; struct socket *ssock; + struct sock *newsk; int backlog = 1024; int err; @@ -1008,11 +1008,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; - msk = mptcp_sk(entry->lsk->sk); - if (!msk) + newsk = entry->lsk->sk; + if (!newsk) return -EINVAL; - ssock = __mptcp_nmpc_socket(msk); + lock_sock(newsk); + ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + release_sock(newsk); if (!ssock) return -EINVAL; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ec54413fb31f..a3e5026bee5b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1679,7 +1679,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, if (err) return err; - lock_sock(sf->sk); + lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); -- cgit v1.2.3-58-ga151 From 1249db44a102d9d3541ed7798d4b01ffdcf03524 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:16 +0100 Subject: mptcp: be careful on subflow status propagation on errors Currently the subflow error report callback unconditionally propagates the fallback subflow status to the owning msk. If the msk is already orphaned, the above prevents the code from correctly tracking the msk moving to the TCP_CLOSE state and doing the appropriate cleanup. All the above causes increasing memory usage over time and sporadic self-tests failures. There is a great deal of infrastructure trying to propagate correctly the fallback subflow status to the owning mptcp socket, e.g. via mptcp_subflow_eof() and subflow_sched_work_if_closed(): in the error propagation path we need only to cope with unorphaned sockets. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/339 Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a3e5026bee5b..32904c76c6a1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1399,6 +1399,7 @@ void __mptcp_error_report(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); + int ssk_state; if (!err) continue; @@ -1409,7 +1410,14 @@ void __mptcp_error_report(struct sock *sk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; - inet_sk_state_store(sk, inet_sk_state_load(ssk)); + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. + */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ -- cgit v1.2.3-58-ga151 From f753a68980cf4b59a80fe677619da2b1804f526d Mon Sep 17 00:00:00 2001 From: Pietro Borrello Date: Tue, 7 Feb 2023 18:26:34 +0000 Subject: rds: rds_rm_zerocopy_callback() use list_first_entry() rds_rm_zerocopy_callback() uses list_entry() on the head of a list causing a type confusion. Use list_first_entry() to actually access the first element of the rs_zcookie_queue list. Fixes: 9426bbc6de99 ("rds: use list structure to track information for zerocopy completion notification") Reviewed-by: Willem de Bruijn Signed-off-by: Pietro Borrello Link: https://lore.kernel.org/r/20230202-rds-zerocopy-v3-1-83b0df974f9a@diag.uniroma1.it Signed-off-by: Paolo Abeni --- net/rds/message.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index b47e4f0a1639..c19c93561227 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ -- cgit v1.2.3-58-ga151