From b515d2637276a3810d6595e10ab02c13bfd0b63a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Apr 2021 11:27:59 +0200 Subject: xfrm: xfrm_state_mtu should return at least 1280 for ipv6 Jianwen reported that IPv6 Interoperability tests are failing in an IPsec case where one of the links between the IPsec peers has an MTU of 1280. The peer generates a packet larger than this MTU, the router replies with a "Packet too big" message indicating an MTU of 1280. When the peer tries to send another large packet, xfrm_state_mtu returns 1280 - ipsec_overhead, which causes ip6_setup_cork to fail with EINVAL. We can fix this by forcing xfrm_state_mtu to return IPV6_MIN_MTU when IPv6 is used. After going through IPsec, the packet will then be fragmented to obey the actual network's PMTU, just before leaving the host. Currently, TFC padding is capped to PMTU - overhead to avoid fragementation: after padding and encapsulation, we still fit within the PMTU. That behavior is preserved in this patch. Fixes: 91657eafb64b ("xfrm: take net hdr len into account for esp payload size calculation") Reported-by: Jianwen Ji Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4b834bbf95e0..ed9857b2875d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -673,7 +673,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 727d791ed5e6..9d1327b36bd3 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -708,7 +708,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 4496f7efa220..c25586156c6a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2518,7 +2518,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2549,7 +2549,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(xfrm_state_mtu); +EXPORT_SYMBOL_GPL(__xfrm_state_mtu); + +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + mtu = __xfrm_state_mtu(x, mtu); + + if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) + return IPV6_MIN_MTU; + + return mtu; +} int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit v1.2.3-58-ga151 From d2792e91de2bedf829828b091720f0f7920719ed Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Sat, 8 May 2021 10:27:07 +0800 Subject: net: openvswitch: Remove unnecessary skb_nfct() There is no need add 'if (skb_nfct(skb))' assignment, the nf_conntrack_put() would check it. Signed-off-by: Yejune Deng Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cadb6a29b285..1b5eae57bc90 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -967,8 +967,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb_nfct(skb)) - nf_conntrack_put(skb_nfct(skb)); + nf_conntrack_put(skb_nfct(skb)); nf_conntrack_get(&tmpl->ct_general); nf_ct_set(skb, tmpl, IP_CT_NEW); } @@ -1329,11 +1328,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) { - if (skb_nfct(skb)) { - nf_conntrack_put(skb_nfct(skb)); - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key, false); - } + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key, false); return 0; } -- cgit v1.2.3-58-ga151 From 48de7c0c1c9225b478c602184be8e00b92b35c61 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sat, 8 May 2021 18:03:05 +0800 Subject: neighbour: Remove redundant initialization of 'bucket' Integer variable 'bucket' is being initialized however this value is never read as 'bucket' is assigned zero in for statement. Remove the redundant assignment. Cleans up clang warning: net/core/neighbour.c:3144:6: warning: Value stored to 'bucket' during its initialization is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 98f20efbfadf..2b2f333bcdfe 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3141,7 +3141,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; - int bucket = state->bucket; + int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { -- cgit v1.2.3-58-ga151 From a100243d95a60d74ae9bb9df1f5f2192e9aed6a7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 8 May 2021 11:00:33 -0700 Subject: rtnetlink: avoid RCU read lock when holding RTNL When we call af_ops->set_link_af() we hold a RCU read lock as we retrieve af_ops from the RCU protected list, but this is unnecessary because we already hold RTNL lock, which is the writer lock for protecting rtnl_af_ops, so it is safer than RCU read lock. Similar for af_ops->validate_link_af(). This was not a problem until we begin to take mutex lock down the path of ->set_link_af() in __ipv6_dev_mc_dec() recently. We can just drop the RCU read lock there and assert RTNL lock. Reported-and-tested-by: syzbot+7d941e89dd48bcf42573@syzkaller.appspotmail.com Fixes: 63ed8de4be81 ("mld: add mc_lock for protecting per-interface mld data") Tested-by: Taehee Yoo Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 +++++++------------------- net/ipv4/devinet.c | 4 ++-- 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 714d5fa38546..04b4f0f2a3d2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -543,7 +543,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; - list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + ASSERT_RTNL(); + + list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } @@ -2274,27 +2276,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); af_ops = rtnl_af_lookup(nla_type(af)); - if (!af_ops) { - rcu_read_unlock(); + if (!af_ops) return -EAFNOSUPPORT; - } - if (!af_ops->set_link_af) { - rcu_read_unlock(); + if (!af_ops->set_link_af) return -EOPNOTSUPP; - } if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af); - if (err < 0) { - rcu_read_unlock(); + if (err < 0) return err; - } } - - rcu_read_unlock(); } } @@ -2868,17 +2861,12 @@ static int do_setlink(const struct sk_buff *skb, nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); - if (err < 0) { - rcu_read_unlock(); + if (err < 0) goto errout; - } - rcu_read_unlock(); status |= DO_SETLINK_NOTIFY; } } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2e35f68da40a..50deeff48c8b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1955,7 +1955,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rcu(dev)) + if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, @@ -1981,7 +1981,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { - struct in_device *in_dev = __in_dev_get_rcu(dev); + struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; -- cgit v1.2.3-58-ga151 From faa5f5da809b690542e1108ba66886574ac57d2c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 11 May 2021 13:54:49 -0700 Subject: net/sched: taprio: Drop unnecessary NULL check after container_of The rcu_head pointer passed to taprio_free_sched_cb is never NULL. That means that the result of container_of() operations on it is also never NULL, even though rcu_head is the first element of the structure embedding it. On top of that, it is misleading to perform a NULL check on the result of container_of() because the position of the contained element could change, which would make the check invalid. Remove the unnecessary NULL check. This change was made automatically with the following Coccinelle script. @@ type t; identifier v; statement s; @@ <+... ( t v = container_of(...); | v = container_of(...); ) ... when != v - if (\( !v \| v == NULL \) ) s ...+> Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 5c91df52b8c2..71e8a7a84841 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -114,9 +114,6 @@ static void taprio_free_sched_cb(struct rcu_head *head) struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; - if (!sched) - return; - list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); -- cgit v1.2.3-58-ga151 From d8654f4f9300e5e7cf8d5e7885978541cf61326b Mon Sep 17 00:00:00 2001 From: Jim Ma Date: Wed, 12 May 2021 17:00:11 +0800 Subject: tls splice: remove inappropriate flags checking for MSG_PEEK In function tls_sw_splice_read, before call tls_sw_advance_skb it checks likely(!(flags & MSG_PEEK)), while MSG_PEEK is used for recvmsg, splice supports SPLICE_F_NONBLOCK, SPLICE_F_MOVE, SPLICE_F_MORE, should remove this checking. Signed-off-by: Jim Ma Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1dcb34dfd56b..7b59ec9a24c5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2018,8 +2018,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_read_end; - if (likely(!(flags & MSG_PEEK))) - tls_sw_advance_skb(sk, skb, copied); + tls_sw_advance_skb(sk, skb, copied); splice_read_end: release_sock(sk); -- cgit v1.2.3-58-ga151 From 13511704f8d7591faf19fdb84f0902dff0535ccb Mon Sep 17 00:00:00 2001 From: Yannick Vignon Date: Tue, 11 May 2021 19:18:29 +0200 Subject: net: taprio offload: enforce qdisc to netdev queue mapping Even though the taprio qdisc is designed for multiqueue devices, all the queues still point to the same top-level taprio qdisc. This works and is probably required for software taprio, but at least with offload taprio, it has an undesirable side effect: because the whole qdisc is run when a packet has to be sent, it allows packets in a best-effort class to be processed in the context of a task sending higher priority traffic. If there are packets left in the qdisc after that first run, the NET_TX softirq is raised and gets executed immediately in the same process context. As with any other softirq, it runs up to 10 times and for up to 2ms, during which the calling process is waiting for the sendmsg call (or similar) to return. In my use case, that calling process is a real-time task scheduled to send a packet every 2ms, so the long sendmsg calls are leading to missed timeslots. By attaching each netdev queue to its own qdisc, as it is done with the "classic" mq qdisc, each traffic class can be processed independently without touching the other classes. A high-priority process can then send packets without getting stuck in the sendmsg call anymore. Signed-off-by: Yannick Vignon Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 85 ++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 71e8a7a84841..66fe2b82af9a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -435,6 +435,11 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child; int queue; + if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { + WARN_ONCE(1, "Trying to enqueue skb into the root of a taprio qdisc configured with full offload\n"); + return qdisc_drop(skb, sch, to_free); + } + queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; @@ -526,23 +531,7 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->peek(child); - if (!skb) - continue; - - return skb; - } + WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -651,27 +640,7 @@ done: static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - continue; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - return skb; - } + WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -1756,6 +1725,37 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return taprio_change(sch, opt, extack); } +static void taprio_attach(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + struct Qdisc *old; + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (ntx < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + } else { + old = dev_graft_qdisc(qdisc->dev_queue, sch); + qdisc_refcount_inc(sch); + } + if (old) + qdisc_put(old); + } + + /* access to the child qdiscs is not needed in offload mode */ + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + kfree(q->qdiscs); + q->qdiscs = NULL; + } +} + static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { @@ -1782,8 +1782,12 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + *old = dev_graft_qdisc(dev_queue, new); + } else { + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + } if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -2017,6 +2021,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, + .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, -- cgit v1.2.3-58-ga151 From 8380c81d5c4fced6f4397795a5ae65758272bbfd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 12 May 2021 23:43:24 +0200 Subject: net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT __napi_schedule_irqoff() is an optimized version of __napi_schedule() which can be used where it is known that interrupts are disabled, e.g. in interrupt-handlers, spin_lock_irq() sections or hrtimer callbacks. On PREEMPT_RT enabled kernels this assumptions is not true. Force- threaded interrupt handlers and spinlocks are not disabling interrupts and the NAPI hrtimer callback is forced into softirq context which runs with interrupts enabled as well. Chasing all usage sites of __napi_schedule_irqoff() is a whack-a-mole game so make __napi_schedule_irqoff() invoke __napi_schedule() for PREEMPT_RT kernels. The callers of ____napi_schedule() in the networking core have been audited and are correct on PREEMPT_RT kernels as well. Reported-by: Juri Lelli Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Thomas Gleixner Reviewed-by: Juri Lelli Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 222b1d322c96..febb23708184 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6501,11 +6501,18 @@ EXPORT_SYMBOL(napi_schedule_prep); * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * - * Variant of __napi_schedule() assuming hard irqs are masked + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { - ____napi_schedule(this_cpu_ptr(&softnet_data), n); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); -- cgit v1.2.3-58-ga151 From ce6f709775bdf9bc8dd852a8758e10a98f31f280 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:43 +0200 Subject: net: bridge: mcast: rename multicast router lists and timers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants, rename the affected variable to the IPv4 version first to avoid some renames in later commits. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 4 ++-- net/bridge/br_mdb.c | 6 +++--- net/bridge/br_multicast.c | 48 +++++++++++++++++++++++------------------------ net/bridge/br_private.h | 10 +++++----- 4 files changed, 34 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6e9b049ae521..eb9847ad40cf 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -276,7 +276,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, bool allow_mode_include = true; struct hlist_node *rp; - rp = rcu_dereference(hlist_first_rcu(&br->router_list)); + rp = rcu_dereference(hlist_first_rcu(&br->ip4_mc_router_list)); if (mdst) { p = rcu_dereference(mdst->ports); if (br_multicast_should_handle_mode(br, mdst->addr.proto) && @@ -290,7 +290,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; - rport = hlist_entry_safe(rp, struct net_bridge_port, rlist); + rport = hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); if ((unsigned long)lport > (unsigned long)rport) { port = lport; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 95fa4af0e8dd..d61def8c4647 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -23,14 +23,14 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_bridge_port *p; struct nlattr *nest, *port_nest; - if (!br->multicast_router || hlist_empty(&br->router_list)) + if (!br->multicast_router || hlist_empty(&br->ip4_mc_router_list)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; - hlist_for_each_entry_rcu(p, &br->router_list, rlist) { + hlist_for_each_entry_rcu(p, &br->ip4_mc_router_list, ip4_rlist) { if (!p) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); @@ -38,7 +38,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, - br_timer_value(&p->multicast_router_timer)) || + br_timer_value(&p->ip4_mc_router_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_router)) { nla_nest_cancel(skb, port_nest); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 226bb05c3b42..6fe93a30b575 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1357,13 +1357,13 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, static void br_multicast_router_expired(struct timer_list *t) { struct net_bridge_port *port = - from_timer(port, t, multicast_router_timer); + from_timer(port, t, ip4_mc_router_timer); struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); if (port->multicast_router == MDB_RTR_TYPE_DISABLED || port->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&port->multicast_router_timer)) + timer_pending(&port->ip4_mc_router_timer)) goto out; __del_port_router(port); @@ -1386,12 +1386,12 @@ static void br_mc_router_state_change(struct net_bridge *p, static void br_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, multicast_router_timer); + struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); spin_lock(&br->multicast_lock); if (br->multicast_router == MDB_RTR_TYPE_DISABLED || br->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&br->multicast_router_timer)) + timer_pending(&br->ip4_mc_router_timer)) goto out; br_mc_router_state_change(br, false); @@ -1613,7 +1613,7 @@ int br_multicast_add_port(struct net_bridge_port *port) port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; - timer_setup(&port->multicast_router_timer, + timer_setup(&port->ip4_mc_router_timer, br_multicast_router_expired, 0); timer_setup(&port->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); @@ -1649,7 +1649,7 @@ void br_multicast_del_port(struct net_bridge_port *port) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); - del_timer_sync(&port->multicast_router_timer); + del_timer_sync(&port->ip4_mc_router_timer); free_percpu(port->mcast_stats); } @@ -1674,7 +1674,7 @@ static void __br_multicast_enable_port(struct net_bridge_port *port) br_multicast_enable(&port->ip6_own_query); #endif if (port->multicast_router == MDB_RTR_TYPE_PERM && - hlist_unhashed(&port->rlist)) + hlist_unhashed(&port->ip4_rlist)) br_multicast_add_router(br, port); } @@ -1700,7 +1700,7 @@ void br_multicast_disable_port(struct net_bridge_port *port) __del_port_router(port); - del_timer(&port->multicast_router_timer); + del_timer(&port->ip4_mc_router_timer); del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) del_timer(&port->ip6_own_query.timer); @@ -2666,19 +2666,19 @@ static void br_multicast_add_router(struct net_bridge *br, struct net_bridge_port *p; struct hlist_node *slot = NULL; - if (!hlist_unhashed(&port->rlist)) + if (!hlist_unhashed(&port->ip4_rlist)) return; - hlist_for_each_entry(p, &br->router_list, rlist) { + hlist_for_each_entry(p, &br->ip4_mc_router_list, ip4_rlist) { if ((unsigned long) port >= (unsigned long) p) break; - slot = &p->rlist; + slot = &p->ip4_rlist; } if (slot) - hlist_add_behind_rcu(&port->rlist, slot); + hlist_add_behind_rcu(&port->ip4_rlist, slot); else - hlist_add_head_rcu(&port->rlist, &br->router_list); + hlist_add_head_rcu(&port->ip4_rlist, &br->ip4_mc_router_list); br_rtr_notify(br->dev, port, RTM_NEWMDB); br_port_mc_router_state_change(port, true); } @@ -2690,9 +2690,9 @@ static void br_multicast_mark_router(struct net_bridge *br, if (!port) { if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!timer_pending(&br->multicast_router_timer)) + if (!timer_pending(&br->ip4_mc_router_timer)) br_mc_router_state_change(br, true); - mod_timer(&br->multicast_router_timer, + mod_timer(&br->ip4_mc_router_timer, now + br->multicast_querier_interval); } return; @@ -2704,7 +2704,7 @@ static void br_multicast_mark_router(struct net_bridge *br, br_multicast_add_router(br, port); - mod_timer(&port->multicast_router_timer, + mod_timer(&port->ip4_mc_router_timer, now + br->multicast_querier_interval); } @@ -3316,7 +3316,7 @@ void br_multicast_init(struct net_bridge *br) br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); - timer_setup(&br->multicast_router_timer, + timer_setup(&br->ip4_mc_router_timer, br_multicast_local_router_expired, 0); timer_setup(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); @@ -3416,7 +3416,7 @@ void br_multicast_open(struct net_bridge *br) void br_multicast_stop(struct net_bridge *br) { - del_timer_sync(&br->multicast_router_timer); + del_timer_sync(&br->ip4_mc_router_timer); del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) @@ -3453,7 +3453,7 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); - del_timer(&br->multicast_router_timer); + del_timer(&br->ip4_mc_router_timer); br->multicast_router = val; err = 0; break; @@ -3472,9 +3472,9 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) static void __del_port_router(struct net_bridge_port *p) { - if (hlist_unhashed(&p->rlist)) + if (hlist_unhashed(&p->ip4_rlist)) return; - hlist_del_init_rcu(&p->rlist); + hlist_del_init_rcu(&p->ip4_rlist); br_rtr_notify(p->br->dev, p, RTM_DELMDB); br_port_mc_router_state_change(p, false); @@ -3493,7 +3493,7 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) if (p->multicast_router == val) { /* Refresh the temp router port timer */ if (p->multicast_router == MDB_RTR_TYPE_TEMP) - mod_timer(&p->multicast_router_timer, + mod_timer(&p->ip4_mc_router_timer, now + br->multicast_querier_interval); err = 0; goto unlock; @@ -3502,7 +3502,7 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) case MDB_RTR_TYPE_DISABLED: p->multicast_router = MDB_RTR_TYPE_DISABLED; __del_port_router(p); - del_timer(&p->multicast_router_timer); + del_timer(&p->ip4_mc_router_timer); break; case MDB_RTR_TYPE_TEMP_QUERY: p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; @@ -3510,7 +3510,7 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) break; case MDB_RTR_TYPE_PERM: p->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&p->multicast_router_timer); + del_timer(&p->ip4_mc_router_timer); br_multicast_add_router(br, p); break; case MDB_RTR_TYPE_TEMP: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7ce8a77cc6b6..26e91d253687 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -307,6 +307,8 @@ struct net_bridge_port { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; + struct timer_list ip4_mc_router_timer; + struct hlist_node ip4_rlist; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ @@ -314,9 +316,7 @@ struct net_bridge_port { u32 multicast_eht_hosts_cnt; unsigned char multicast_router; struct bridge_mcast_stats __percpu *mcast_stats; - struct timer_list multicast_router_timer; struct hlist_head mglist; - struct hlist_node rlist; #endif #ifdef CONFIG_SYSFS @@ -449,9 +449,9 @@ struct net_bridge { struct hlist_head mcast_gc_list; struct hlist_head mdb_list; - struct hlist_head router_list; - struct timer_list multicast_router_timer; + struct hlist_head ip4_mc_router_list; + struct timer_list ip4_mc_router_timer; struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; @@ -868,7 +868,7 @@ static inline bool br_multicast_is_router(struct net_bridge *br) { return br->multicast_router == 2 || (br->multicast_router == 1 && - timer_pending(&br->multicast_router_timer)); + timer_pending(&br->ip4_mc_router_timer)); } static inline bool -- cgit v1.2.3-58-ga151 From 44ebb081dc6934e43d3c7444f183d6426adeca21 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:44 +0200 Subject: net: bridge: mcast: add wrappers for router node retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants and to avoid IPv6 #ifdef clutter later add two wrapper functions for router node retrieval in the payload forwarding code. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 5 +++-- net/bridge/br_private.h | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index eb9847ad40cf..07856362538f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -276,7 +276,8 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, bool allow_mode_include = true; struct hlist_node *rp; - rp = rcu_dereference(hlist_first_rcu(&br->ip4_mc_router_list)); + rp = br_multicast_get_first_rport_node(br, skb); + if (mdst) { p = rcu_dereference(mdst->ports); if (br_multicast_should_handle_mode(br, mdst->addr.proto) && @@ -290,7 +291,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; - rport = hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); + rport = br_multicast_rport_from_node_skb(rp, skb); if ((unsigned long)lport > (unsigned long)rport) { port = lport; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 26e91d253687..d970ef78bf98 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -864,6 +864,16 @@ static inline bool br_group_is_l2(const struct br_ip *group) #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) +static inline struct hlist_node * +br_multicast_get_first_rport_node(struct net_bridge *b, struct sk_buff *skb) { + return rcu_dereference(hlist_first_rcu(&b->ip4_mc_router_list)); +} + +static inline struct net_bridge_port * +br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { + return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); +} + static inline bool br_multicast_is_router(struct net_bridge *br) { return br->multicast_router == 2 || -- cgit v1.2.3-58-ga151 From ff391c5d9871894c620f1e6ae2b18d7db572e49d Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:45 +0200 Subject: net: bridge: mcast: prepare mdb netlink for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants and to avoid IPv6 #ifdef clutter later add some inline functions for the protocol specific parts in the mdb router netlink code. Also the we need iterate over the port instead of router list to be able put one router port entry with both the IPv4 and IPv6 multicast router info later. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index d61def8c4647..482edb9aadc7 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,29 +16,58 @@ #include "br_private.h" +static bool br_rports_have_mc_router(struct net_bridge *br) +{ + return !hlist_empty(&br->ip4_mc_router_list); +} + +static bool +br_ip4_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +{ + *timer = br_timer_value(&port->ip4_mc_router_timer); + return !hlist_unhashed(&port->ip4_rlist); +} + +static bool +br_ip6_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +{ + *timer = 0; + return false; +} + static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p; + bool have_ip4_mc_rtr, have_ip6_mc_rtr; + unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; + struct net_bridge_port *p; + + if (!br->multicast_router) + return 0; - if (!br->multicast_router || hlist_empty(&br->ip4_mc_router_list)) + if (!br_rports_have_mc_router(br)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; - hlist_for_each_entry_rcu(p, &br->ip4_mc_router_list, ip4_rlist) { - if (!p) + list_for_each_entry_rcu(p, &br->port_list, list) { + have_ip4_mc_rtr = br_ip4_rports_get_timer(p, &ip4_timer); + have_ip6_mc_rtr = br_ip6_rports_get_timer(p, &ip6_timer); + + if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, - br_timer_value(&p->ip4_mc_router_timer)) || + max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_router)) { nla_nest_cancel(skb, port_nest); -- cgit v1.2.3-58-ga151 From b19232effd09c2cb5e11b1b74547406a3c9adc5a Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:46 +0200 Subject: net: bridge: mcast: prepare query reception for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants and as the br_multicast_mark_router() will be split for that remove the select querier wrapper and instead add ip4 and ip6 variants for br_multicast_query_received(). Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 53 ++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6fe93a30b575..7edbbc9941ea 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2615,22 +2615,6 @@ update: } #endif -static bool br_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - struct br_ip *saddr) -{ - switch (saddr->proto) { - case htons(ETH_P_IP): - return br_ip4_multicast_select_querier(br, port, saddr->src.ip4); -#if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return br_ip6_multicast_select_querier(br, port, &saddr->src.ip6); -#endif - } - - return false; -} - static void br_multicast_update_query_timer(struct net_bridge *br, struct bridge_mcast_other_query *query, @@ -2708,18 +2692,35 @@ static void br_multicast_mark_router(struct net_bridge *br, now + br->multicast_querier_interval); } -static void br_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, - struct bridge_mcast_other_query *query, - struct br_ip *saddr, - unsigned long max_delay) +static void +br_ip4_multicast_query_received(struct net_bridge *br, + struct net_bridge_port *port, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) +{ + if (!br_ip4_multicast_select_querier(br, port, saddr->src.ip4)) + return; + + br_multicast_update_query_timer(br, query, max_delay); + br_multicast_mark_router(br, port); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void +br_ip6_multicast_query_received(struct net_bridge *br, + struct net_bridge_port *port, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) { - if (!br_multicast_select_querier(br, port, saddr)) + if (!br_ip6_multicast_select_querier(br, port, &saddr->src.ip6)) return; br_multicast_update_query_timer(br, query, max_delay); br_multicast_mark_router(br, port); } +#endif static void br_ip4_multicast_query(struct net_bridge *br, struct net_bridge_port *port, @@ -2768,8 +2769,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; - br_multicast_query_received(br, port, &br->ip4_other_query, - &saddr, max_delay); + br_ip4_multicast_query_received(br, port, &br->ip4_other_query, + &saddr, max_delay); goto out; } @@ -2856,8 +2857,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; - br_multicast_query_received(br, port, &br->ip6_other_query, - &saddr, max_delay); + br_ip6_multicast_query_received(br, port, &br->ip6_other_query, + &saddr, max_delay); goto out; } else if (!group) { goto out; -- cgit v1.2.3-58-ga151 From 1a3065a26807b4cdd65d3b696ddb18385610f7da Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:47 +0200 Subject: net: bridge: mcast: prepare is-router function for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants make br_multicast_is_router() protocol family aware. Note that for now br_ip6_multicast_is_router() uses the currently still common ip4_mc_router_timer for now. It will be renamed to ip6_mc_router_timer later when the split is performed. While at it also renames the "1" and "2" constants in br_multicast_is_router() to the MDB_RTR_TYPE_TEMP_QUERY and MDB_RTR_TYPE_PERM enums. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- net/bridge/br_multicast.c | 5 +++-- net/bridge/br_private.h | 37 +++++++++++++++++++++++++++++++++---- 3 files changed, 37 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8875e953ac53..1f506309efa8 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -132,7 +132,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(br)) { + br_multicast_is_router(br, skb)) { local_rcv = true; br->dev->stats.multicast++; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 7edbbc9941ea..048b5b9e9c89 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1391,7 +1391,8 @@ static void br_multicast_local_router_expired(struct timer_list *t) spin_lock(&br->multicast_lock); if (br->multicast_router == MDB_RTR_TYPE_DISABLED || br->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&br->ip4_mc_router_timer)) + br_ip4_multicast_is_router(br) || + br_ip6_multicast_is_router(br)) goto out; br_mc_router_state_change(br, false); @@ -3622,7 +3623,7 @@ bool br_multicast_router(const struct net_device *dev) bool is_router; spin_lock_bh(&br->multicast_lock); - is_router = br_multicast_is_router(br); + is_router = br_multicast_is_router(br, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d970ef78bf98..f9a381fcff09 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -874,11 +874,40 @@ br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); } -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline bool br_ip4_multicast_is_router(struct net_bridge *br) { - return br->multicast_router == 2 || - (br->multicast_router == 1 && - timer_pending(&br->ip4_mc_router_timer)); + return timer_pending(&br->ip4_mc_router_timer); +} + +static inline bool br_ip6_multicast_is_router(struct net_bridge *br) +{ +#if IS_ENABLED(CONFIG_IPV6) + return timer_pending(&br->ip4_mc_router_timer); +#else + return false; +#endif +} + +static inline bool +br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) +{ + switch (br->multicast_router) { + case MDB_RTR_TYPE_PERM: + return true; + case MDB_RTR_TYPE_TEMP_QUERY: + if (skb) { + if (skb->protocol == htons(ETH_P_IP)) + return br_ip4_multicast_is_router(br); + else if (skb->protocol == htons(ETH_P_IPV6)) + return br_ip6_multicast_is_router(br); + } else { + return br_ip4_multicast_is_router(br) || + br_ip6_multicast_is_router(br); + } + fallthrough; + default: + return false; + } } static inline bool -- cgit v1.2.3-58-ga151 From ee5fb2223ee581676fe7e4e5a87481c419569454 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:48 +0200 Subject: net: bridge: mcast: prepare expiry functions for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants move the protocol specific timer access to an ip4 wrapper function. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 048b5b9e9c89..781599155d8a 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1354,16 +1354,16 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, } #endif -static void br_multicast_router_expired(struct timer_list *t) +static void br_multicast_router_expired(struct net_bridge_port *port, + struct timer_list *t, + struct hlist_node *rlist) { - struct net_bridge_port *port = - from_timer(port, t, ip4_mc_router_timer); struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); if (port->multicast_router == MDB_RTR_TYPE_DISABLED || port->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&port->ip4_mc_router_timer)) + timer_pending(t)) goto out; __del_port_router(port); @@ -1371,6 +1371,13 @@ out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_port *port = from_timer(port, t, ip4_mc_router_timer); + + br_multicast_router_expired(port, t, &port->ip4_rlist); +} + static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { @@ -1384,10 +1391,9 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -static void br_multicast_local_router_expired(struct timer_list *t) +static void br_multicast_local_router_expired(struct net_bridge *br, + struct timer_list *timer) { - struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); - spin_lock(&br->multicast_lock); if (br->multicast_router == MDB_RTR_TYPE_DISABLED || br->multicast_router == MDB_RTR_TYPE_PERM || @@ -1400,6 +1406,13 @@ out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); + + br_multicast_local_router_expired(br, t); +} + static void br_multicast_querier_expired(struct net_bridge *br, struct bridge_mcast_own_query *query) { @@ -1615,7 +1628,7 @@ int br_multicast_add_port(struct net_bridge_port *port) port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; timer_setup(&port->ip4_mc_router_timer, - br_multicast_router_expired, 0); + br_ip4_multicast_router_expired, 0); timer_setup(&port->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) @@ -3319,7 +3332,7 @@ void br_multicast_init(struct net_bridge *br) spin_lock_init(&br->multicast_lock); timer_setup(&br->ip4_mc_router_timer, - br_multicast_local_router_expired, 0); + br_ip4_multicast_local_router_expired, 0); timer_setup(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); timer_setup(&br->ip4_own_query.timer, -- cgit v1.2.3-58-ga151 From d9b8c4d8d937f58e618aa1e756162e80b385c701 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:49 +0200 Subject: net: bridge: mcast: prepare add-router function for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants move the protocol specific router list and timer access to ip4 wrapper functions. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 117 +++++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 781599155d8a..dc9546415520 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -51,8 +51,8 @@ static const struct rhashtable_params br_sg_port_rht_params = { static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); +static void br_ip4_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, @@ -1687,9 +1687,8 @@ static void __br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM && - hlist_unhashed(&port->ip4_rlist)) - br_multicast_add_router(br, port); + if (port->multicast_router == MDB_RTR_TYPE_PERM) + br_ip4_multicast_add_router(br, port); } void br_multicast_enable_port(struct net_bridge_port *port) @@ -2653,45 +2652,86 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -/* - * Add port to router_list +static struct net_bridge_port * +br_multicast_rport_from_node(struct net_bridge *br, + struct hlist_head *mc_router_list, + struct hlist_node *rlist) +{ + return hlist_entry(rlist, struct net_bridge_port, ip4_rlist); +} + +static struct hlist_node * +br_multicast_get_rport_slot(struct net_bridge *br, + struct net_bridge_port *port, + struct hlist_head *mc_router_list) + +{ + struct hlist_node *slot = NULL; + struct net_bridge_port *p; + struct hlist_node *rlist; + + hlist_for_each(rlist, mc_router_list) { + p = br_multicast_rport_from_node(br, mc_router_list, rlist); + + if ((unsigned long)port >= (unsigned long)p) + break; + + slot = rlist; + } + + return slot; +} + +/* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) + struct net_bridge_port *port, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { - struct net_bridge_port *p; - struct hlist_node *slot = NULL; + struct hlist_node *slot; - if (!hlist_unhashed(&port->ip4_rlist)) + if (!hlist_unhashed(rlist)) return; - hlist_for_each_entry(p, &br->ip4_mc_router_list, ip4_rlist) { - if ((unsigned long) port >= (unsigned long) p) - break; - slot = &p->ip4_rlist; - } + slot = br_multicast_get_rport_slot(br, port, mc_router_list); if (slot) - hlist_add_behind_rcu(&port->ip4_rlist, slot); + hlist_add_behind_rcu(rlist, slot); else - hlist_add_head_rcu(&port->ip4_rlist, &br->ip4_mc_router_list); + hlist_add_head_rcu(rlist, mc_router_list); + br_rtr_notify(br->dev, port, RTM_NEWMDB); br_port_mc_router_state_change(port, true); } +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip4_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + br_multicast_add_router(br, port, &port->ip4_rlist, + &br->ip4_mc_router_list); +} + static void br_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) + struct net_bridge_port *port, + struct timer_list *timer, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { unsigned long now = jiffies; if (!port) { if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!timer_pending(&br->ip4_mc_router_timer)) + if (!br_ip4_multicast_is_router(br) && + !br_ip6_multicast_is_router(br)) br_mc_router_state_change(br, true); - mod_timer(&br->ip4_mc_router_timer, - now + br->multicast_querier_interval); + mod_timer(timer, now + br->multicast_querier_interval); } return; } @@ -2700,10 +2740,23 @@ static void br_multicast_mark_router(struct net_bridge *br, port->multicast_router == MDB_RTR_TYPE_PERM) return; - br_multicast_add_router(br, port); + br_multicast_add_router(br, port, rlist, mc_router_list); + mod_timer(timer, now + br->multicast_querier_interval); +} + +static void br_ip4_multicast_mark_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + struct timer_list *timer = &br->ip4_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (port) { + timer = &port->ip4_mc_router_timer; + rlist = &port->ip4_rlist; + } - mod_timer(&port->ip4_mc_router_timer, - now + br->multicast_querier_interval); + br_multicast_mark_router(br, port, timer, rlist, + &br->ip4_mc_router_list); } static void @@ -2717,7 +2770,7 @@ br_ip4_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); } #if IS_ENABLED(CONFIG_IPV6) @@ -2732,7 +2785,7 @@ br_ip6_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); } #endif @@ -3102,7 +3155,7 @@ static void br_multicast_pim(struct net_bridge *br, pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); } static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, @@ -3113,7 +3166,7 @@ static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); return 0; } @@ -3181,7 +3234,7 @@ static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - br_multicast_mark_router(br, port); + br_ip4_multicast_mark_router(br, port); } static int br_multicast_ipv6_rcv(struct net_bridge *br, @@ -3526,11 +3579,11 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) case MDB_RTR_TYPE_PERM: p->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&p->ip4_mc_router_timer); - br_multicast_add_router(br, p); + br_ip4_multicast_add_router(br, p); break; case MDB_RTR_TYPE_TEMP: p->multicast_router = MDB_RTR_TYPE_TEMP; - br_multicast_mark_router(br, p); + br_ip4_multicast_mark_router(br, p); break; default: goto unlock; -- cgit v1.2.3-58-ga151 From ed2d35971a8066aa24ce65dd66c113c0506bb206 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:50 +0200 Subject: net: bridge: mcast: split router port del+notify for mcast router split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for the upcoming split of multicast router state into their IPv4 and IPv6 variants split router port deletion and notification into two functions. When we disable a port for instance later we want to only send one notification to switchdev and netlink for compatibility and want to avoid sending one for IPv4 and one for IPv6. For that the split is needed. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dc9546415520..30144f9d19d7 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -60,7 +60,8 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); -static void __del_port_router(struct net_bridge_port *p); +static void +br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, @@ -1354,11 +1355,26 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, } #endif +static bool br_multicast_rport_del(struct hlist_node *rlist) +{ + if (hlist_unhashed(rlist)) + return false; + + hlist_del_init_rcu(rlist); + return true; +} + +static bool br_ip4_multicast_rport_del(struct net_bridge_port *p) +{ + return br_multicast_rport_del(&p->ip4_rlist); +} + static void br_multicast_router_expired(struct net_bridge_port *port, struct timer_list *t, struct hlist_node *rlist) { struct net_bridge *br = port->br; + bool del; spin_lock(&br->multicast_lock); if (port->multicast_router == MDB_RTR_TYPE_DISABLED || @@ -1366,7 +1382,8 @@ static void br_multicast_router_expired(struct net_bridge_port *port, timer_pending(t)) goto out; - __del_port_router(port); + del = br_multicast_rport_del(rlist); + br_multicast_rport_del_notify(port, del); out: spin_unlock(&br->multicast_lock); } @@ -1705,19 +1722,20 @@ void br_multicast_disable_port(struct net_bridge_port *port) struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; + bool del = false; spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_find_del_pg(br, pg); - __del_port_router(port); - + del |= br_ip4_multicast_rport_del(port); del_timer(&port->ip4_mc_router_timer); del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) del_timer(&port->ip6_own_query.timer); #endif + br_multicast_rport_del_notify(port, del); spin_unlock(&br->multicast_lock); } @@ -3538,11 +3556,12 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) return err; } -static void __del_port_router(struct net_bridge_port *p) +static void +br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) { - if (hlist_unhashed(&p->ip4_rlist)) + if (!deleted) return; - hlist_del_init_rcu(&p->ip4_rlist); + br_rtr_notify(p->br->dev, p, RTM_DELMDB); br_port_mc_router_state_change(p, false); @@ -3556,6 +3575,7 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) struct net_bridge *br = p->br; unsigned long now = jiffies; int err = -EINVAL; + bool del = false; spin_lock(&br->multicast_lock); if (p->multicast_router == val) { @@ -3569,12 +3589,14 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) switch (val) { case MDB_RTR_TYPE_DISABLED: p->multicast_router = MDB_RTR_TYPE_DISABLED; - __del_port_router(p); + del |= br_ip4_multicast_rport_del(p); del_timer(&p->ip4_mc_router_timer); + br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_TEMP_QUERY: p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - __del_port_router(p); + del |= br_ip4_multicast_rport_del(p); + br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_PERM: p->multicast_router = MDB_RTR_TYPE_PERM; -- cgit v1.2.3-58-ga151 From a3c02e769efe66dce5e2c716862b60c8d44d191e Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:51 +0200 Subject: net: bridge: mcast: split multicast router state for IPv4 and IPv6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A multicast router for IPv4 does not imply that the same host also is a multicast router for IPv6 and vice versa. To reduce multicast traffic when a host is only a multicast router for one of these two protocol families, keep router state for IPv4 and IPv6 separately. Similar to how querier state is kept separately. For backwards compatibility for netlink and switchdev notifications these two will still only notify if a port switched from either no IPv4/IPv6 multicast router to any IPv4/IPv6 multicast router or the other way round. However a full netlink MDB router dump will now also include a multicast router timeout for both IPv4 and IPv6. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 10 ++++ net/bridge/br_multicast.c | 134 +++++++++++++++++++++++++++++++++++++++++++--- net/bridge/br_private.h | 14 ++++- 3 files changed, 151 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 482edb9aadc7..10c416c7bf47 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -18,7 +18,12 @@ static bool br_rports_have_mc_router(struct net_bridge *br) { +#if IS_ENABLED(CONFIG_IPV6) + return !hlist_empty(&br->ip4_mc_router_list) || + !hlist_empty(&br->ip6_mc_router_list); +#else return !hlist_empty(&br->ip4_mc_router_list); +#endif } static bool @@ -31,8 +36,13 @@ br_ip4_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) static bool br_ip6_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) { +#if IS_ENABLED(CONFIG_IPV6) + *timer = br_timer_value(&port->ip6_mc_router_timer); + return !hlist_unhashed(&port->ip6_rlist); +#else *timer = 0; return false; +#endif } static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 30144f9d19d7..f234c48036c8 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -63,6 +63,8 @@ static void br_multicast_port_group_rexmit(struct timer_list *t); static void br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); #if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, @@ -1369,6 +1371,15 @@ static bool br_ip4_multicast_rport_del(struct net_bridge_port *p) return br_multicast_rport_del(&p->ip4_rlist); } +static bool br_ip6_multicast_rport_del(struct net_bridge_port *p) +{ +#if IS_ENABLED(CONFIG_IPV6) + return br_multicast_rport_del(&p->ip6_rlist); +#else + return false; +#endif +} + static void br_multicast_router_expired(struct net_bridge_port *port, struct timer_list *t, struct hlist_node *rlist) @@ -1395,6 +1406,15 @@ static void br_ip4_multicast_router_expired(struct timer_list *t) br_multicast_router_expired(port, t, &port->ip4_rlist); } +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_port *port = from_timer(port, t, ip6_mc_router_timer); + + br_multicast_router_expired(port, t, &port->ip6_rlist); +} +#endif + static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { @@ -1430,6 +1450,15 @@ static void br_ip4_multicast_local_router_expired(struct timer_list *t) br_multicast_local_router_expired(br, t); } +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge *br = from_timer(br, t, ip6_mc_router_timer); + + br_multicast_local_router_expired(br, t); +} +#endif + static void br_multicast_querier_expired(struct net_bridge *br, struct bridge_mcast_own_query *query) { @@ -1649,6 +1678,8 @@ int br_multicast_add_port(struct net_bridge_port *port) timer_setup(&port->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) + timer_setup(&port->ip6_mc_router_timer, + br_ip6_multicast_router_expired, 0); timer_setup(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif @@ -1681,6 +1712,9 @@ void br_multicast_del_port(struct net_bridge_port *port) spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); del_timer_sync(&port->ip4_mc_router_timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&port->ip6_mc_router_timer); +#endif free_percpu(port->mcast_stats); } @@ -1704,8 +1738,10 @@ static void __br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM) + if (port->multicast_router == MDB_RTR_TYPE_PERM) { br_ip4_multicast_add_router(br, port); + br_ip6_multicast_add_router(br, port); + } } void br_multicast_enable_port(struct net_bridge_port *port) @@ -1732,7 +1768,9 @@ void br_multicast_disable_port(struct net_bridge_port *port) del |= br_ip4_multicast_rport_del(port); del_timer(&port->ip4_mc_router_timer); del_timer(&port->ip4_own_query.timer); + del |= br_ip6_multicast_rport_del(port); #if IS_ENABLED(CONFIG_IPV6) + del_timer(&port->ip6_mc_router_timer); del_timer(&port->ip6_own_query.timer); #endif br_multicast_rport_del_notify(port, del); @@ -2675,6 +2713,10 @@ br_multicast_rport_from_node(struct net_bridge *br, struct hlist_head *mc_router_list, struct hlist_node *rlist) { +#if IS_ENABLED(CONFIG_IPV6) + if (mc_router_list == &br->ip6_mc_router_list) + return hlist_entry(rlist, struct net_bridge_port, ip6_rlist); +#endif return hlist_entry(rlist, struct net_bridge_port, ip4_rlist); } @@ -2700,6 +2742,19 @@ br_multicast_get_rport_slot(struct net_bridge *br, return slot; } +static bool br_multicast_no_router_otherpf(struct net_bridge_port *port, + struct hlist_node *rnode) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (rnode != &port->ip6_rlist) + return hlist_unhashed(&port->ip6_rlist); + else + return hlist_unhashed(&port->ip4_rlist); +#else + return true; +#endif +} + /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU @@ -2721,8 +2776,14 @@ static void br_multicast_add_router(struct net_bridge *br, else hlist_add_head_rcu(rlist, mc_router_list); - br_rtr_notify(br->dev, port, RTM_NEWMDB); - br_port_mc_router_state_change(port, true); + /* For backwards compatibility for now, only notify if we + * switched from no IPv4/IPv6 multicast router to a new + * IPv4 or IPv6 multicast router. + */ + if (br_multicast_no_router_otherpf(port, rlist)) { + br_rtr_notify(br->dev, port, RTM_NEWMDB); + br_port_mc_router_state_change(port, true); + } } /* Add port to router_list @@ -2736,6 +2797,19 @@ static void br_ip4_multicast_add_router(struct net_bridge *br, &br->ip4_mc_router_list); } +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip6_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port) +{ +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_add_router(br, port, &port->ip6_rlist, + &br->ip6_mc_router_list); +#endif +} + static void br_multicast_mark_router(struct net_bridge *br, struct net_bridge_port *port, struct timer_list *timer, @@ -2777,6 +2851,23 @@ static void br_ip4_multicast_mark_router(struct net_bridge *br, &br->ip4_mc_router_list); } +static void br_ip6_multicast_mark_router(struct net_bridge *br, + struct net_bridge_port *port) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct timer_list *timer = &br->ip6_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (port) { + timer = &port->ip6_mc_router_timer; + rlist = &port->ip6_rlist; + } + + br_multicast_mark_router(br, port, timer, rlist, + &br->ip6_mc_router_list); +#endif +} + static void br_ip4_multicast_query_received(struct net_bridge *br, struct net_bridge_port *port, @@ -2803,7 +2894,7 @@ br_ip6_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_ip4_multicast_mark_router(br, port); + br_ip6_multicast_mark_router(br, port); } #endif @@ -3252,7 +3343,7 @@ static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - br_ip4_multicast_mark_router(br, port); + br_ip6_multicast_mark_router(br, port); } static int br_multicast_ipv6_rcv(struct net_bridge *br, @@ -3409,6 +3500,8 @@ void br_multicast_init(struct net_bridge *br) timer_setup(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) + timer_setup(&br->ip6_mc_router_timer, + br_ip6_multicast_local_router_expired, 0); timer_setup(&br->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); timer_setup(&br->ip6_own_query.timer, @@ -3506,6 +3599,7 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&br->ip6_mc_router_timer); del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif @@ -3540,6 +3634,9 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) case MDB_RTR_TYPE_PERM: br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); del_timer(&br->ip4_mc_router_timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&br->ip6_mc_router_timer); +#endif br->multicast_router = val; err = 0; break; @@ -3562,6 +3659,16 @@ br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) if (!deleted) return; + /* For backwards compatibility for now, only notify if there is + * no multicast router anymore for both IPv4 and IPv6. + */ + if (!hlist_unhashed(&p->ip4_rlist)) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (!hlist_unhashed(&p->ip6_rlist)) + return; +#endif + br_rtr_notify(p->br->dev, p, RTM_DELMDB); br_port_mc_router_state_change(p, false); @@ -3580,9 +3687,14 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) spin_lock(&br->multicast_lock); if (p->multicast_router == val) { /* Refresh the temp router port timer */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) + if (p->multicast_router == MDB_RTR_TYPE_TEMP) { mod_timer(&p->ip4_mc_router_timer, now + br->multicast_querier_interval); +#if IS_ENABLED(CONFIG_IPV6) + mod_timer(&p->ip6_mc_router_timer, + now + br->multicast_querier_interval); +#endif + } err = 0; goto unlock; } @@ -3591,21 +3703,31 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) p->multicast_router = MDB_RTR_TYPE_DISABLED; del |= br_ip4_multicast_rport_del(p); del_timer(&p->ip4_mc_router_timer); + del |= br_ip6_multicast_rport_del(p); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&p->ip6_mc_router_timer); +#endif br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_TEMP_QUERY: p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; del |= br_ip4_multicast_rport_del(p); + del |= br_ip6_multicast_rport_del(p); br_multicast_rport_del_notify(p, del); break; case MDB_RTR_TYPE_PERM: p->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&p->ip4_mc_router_timer); br_ip4_multicast_add_router(br, p); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&p->ip6_mc_router_timer); +#endif + br_ip6_multicast_add_router(br, p); break; case MDB_RTR_TYPE_TEMP: p->multicast_router = MDB_RTR_TYPE_TEMP; br_ip4_multicast_mark_router(br, p); + br_ip6_multicast_mark_router(br, p); break; default: goto unlock; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f9a381fcff09..03197ab4af76 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -311,6 +311,8 @@ struct net_bridge_port { struct hlist_node ip4_rlist; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; + struct timer_list ip6_mc_router_timer; + struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; @@ -457,6 +459,8 @@ struct net_bridge { struct bridge_mcast_querier ip4_querier; struct bridge_mcast_stats __percpu *mcast_stats; #if IS_ENABLED(CONFIG_IPV6) + struct hlist_head ip6_mc_router_list; + struct timer_list ip6_mc_router_timer; struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; @@ -866,11 +870,19 @@ static inline bool br_group_is_l2(const struct br_ip *group) static inline struct hlist_node * br_multicast_get_first_rport_node(struct net_bridge *b, struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return rcu_dereference(hlist_first_rcu(&b->ip6_mc_router_list)); +#endif return rcu_dereference(hlist_first_rcu(&b->ip4_mc_router_list)); } static inline struct net_bridge_port * br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return hlist_entry_safe(rp, struct net_bridge_port, ip6_rlist); +#endif return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); } @@ -882,7 +894,7 @@ static inline bool br_ip4_multicast_is_router(struct net_bridge *br) static inline bool br_ip6_multicast_is_router(struct net_bridge *br) { #if IS_ENABLED(CONFIG_IPV6) - return timer_pending(&br->ip4_mc_router_timer); + return timer_pending(&br->ip6_mc_router_timer); #else return false; #endif -- cgit v1.2.3-58-ga151 From b7fb0916544de44ce099d9f3b6129c86b484de25 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:52 +0200 Subject: net: bridge: mcast: add ip4+ip6 mcast router timers to mdb netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have split the multicast router state into two, one for IPv4 and one for IPv6, also add individual timers to the mdb netlink router port dump. Leaving the old timer attribute for backwards compatibility. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_mdb.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 13d59c51ef5b..6b56a7549531 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -627,6 +627,8 @@ enum { MDBA_ROUTER_PATTR_UNSPEC, MDBA_ROUTER_PATTR_TIMER, MDBA_ROUTER_PATTR_TYPE, + MDBA_ROUTER_PATTR_INET_TIMER, + MDBA_ROUTER_PATTR_INET6_TIMER, __MDBA_ROUTER_PATTR_MAX }; #define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 10c416c7bf47..3f839a8cc9fb 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -79,7 +79,13 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, - p->multicast_router)) { + p->multicast_router) || + (have_ip4_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, + ip4_timer)) || + (have_ip6_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, + ip6_timer))) { nla_nest_cancel(skb, port_nest); goto fail; } -- cgit v1.2.3-58-ga151 From 3b85f9ba3480c1bcbebb2bb490822bec0e7a1201 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 13 May 2021 15:20:53 +0200 Subject: net: bridge: mcast: export multicast router presence adjacent to a port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To properly support routable multicast addresses in batman-adv in a group-aware way, a batman-adv node needs to know if it serves multicast routers. This adds a function to the bridge to export this so that batman-adv can then make full use of the Multicast Router Discovery capability of the bridge. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 8 +++++++ net/bridge/br_multicast.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 2cc35038a8ca..12e9a32dbca0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -67,6 +67,7 @@ int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); +bool br_multicast_has_router_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); bool br_multicast_router(const struct net_device *dev); int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, @@ -87,6 +88,13 @@ static inline bool br_multicast_has_querier_adjacent(struct net_device *dev, { return false; } + +static inline bool br_multicast_has_router_adjacent(struct net_device *dev, + int proto) +{ + return true; +} + static inline bool br_multicast_enabled(const struct net_device *dev) { return false; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f234c48036c8..0703725527b3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4054,6 +4054,61 @@ unlock: } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); +/** + * br_multicast_has_router_adjacent - Checks for a router behind a bridge port + * @dev: The bridge port adjacent to which to check for a multicast router + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a multicast router is behind one of the other ports of this + * bridge. Otherwise returns false. + */ +bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) +{ + struct net_bridge_port *port, *p; + bool ret = false; + + rcu_read_lock(); + port = br_port_get_check_rcu(dev); + if (!port) + goto unlock; + + switch (proto) { + case ETH_P_IP: + hlist_for_each_entry_rcu(p, &port->br->ip4_mc_router_list, + ip4_rlist) { + if (p == port) + continue; + + ret = true; + goto unlock; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case ETH_P_IPV6: + hlist_for_each_entry_rcu(p, &port->br->ip6_mc_router_list, + ip6_rlist) { + if (p == port) + continue; + + ret = true; + goto unlock; + } + break; +#endif + default: + /* when compiled without IPv6 support, be conservative and + * always assume presence of an IPv6 multicast router + */ + ret = true; + } + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); + static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { -- cgit v1.2.3-58-ga151 From 0f3ee280331e28b81560715356d47351a6016bce Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 13 May 2021 09:58:40 -0700 Subject: net: caif: Drop unnecessary NULL check after container_of The first parameter passed to chnl_recv_cb() can never be NULL since all callers dereferenced it. Consequently, container_of() on it is also never NULL, even though the reference into the structure points to the first element of the structure. The NULL check is therefore unnecessary. On top of that, it is misleading to perform a NULL check on the result of container_of() because the position of the contained element could change, which would make the test invalid. Remove the unnecessary NULL check. This change was made automatically with the following Coccinelle script. @@ type t; identifier v; statement s; @@ <+... ( t v = container_of(...); | v = container_of(...); ) ... when != v - if (\( !v \| v == NULL \) ) s ...+> Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index fadc7c8a3107..37b67194c0df 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -76,8 +76,6 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) u8 buf; priv = container_of(layr, struct chnl_net, chnl); - if (!priv) - return -EINVAL; skb = (struct sk_buff *) cfpkt_tonative(pkt); -- cgit v1.2.3-58-ga151 From fe0bdaec8dea9912b95296d758422d95aa57fac0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 May 2021 16:50:49 -0500 Subject: bpf: Use struct_size() in kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/bpf_sk_storage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cc3712ad8716..f564f82e91d9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -524,8 +524,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) nr_maps++; } - diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, - GFP_KERNEL); + diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-58-ga151 From 335a2a1fcefc948927e8c15636d9dc5d983b8f50 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sun, 25 Apr 2021 18:14:32 +0800 Subject: esp: drop unneeded assignment in esp4_gro_receive() Making '!=' operation with 0 directly after calling the function xfrm_parse_spi() is more efficient, assignment to err is redundant. Eliminate the following clang_analyzer warning: net/ipv4/esp4_offload.c:41:7: warning: Although the value stored to 'err' is used in the enclosing expression, the value is never actually read from 'err' No functional change, only more efficient. Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 33687cf58286..be019a1fe3af 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -33,12 +33,11 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, struct xfrm_state *x; __be32 seq; __be32 spi; - int err; if (!pskb_pull(skb, offset)) return NULL; - if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); -- cgit v1.2.3-58-ga151 From fe9f1d8779cb47046e76ea209b6eece7ec56d1b4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sun, 25 Apr 2021 21:47:12 +0200 Subject: xfrm: add state hashtable keyed by seq When creating new states with seq set in xfrm_usersa_info, we walk through all the states already installed in that netns to find a matching ACQUIRE state (__xfrm_find_acq_byseq, called from xfrm_state_add). This causes severe slowdowns on systems with a large number of states. This patch introduces a hashtable using x->km.seq as key, so that the corresponding state can be found in a reasonable time. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 1 + include/net/xfrm.h | 1 + net/xfrm/xfrm_hash.h | 7 ++++++ net/xfrm/xfrm_state.c | 65 ++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 61 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index e816b6a3ef2b..e946366e8ba5 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -42,6 +42,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bydst; struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; + struct hlist_head __rcu *state_byseq; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c58a6d4eb610..6e11db6fa0ab 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -154,6 +154,7 @@ struct xfrm_state { }; struct hlist_node bysrc; struct hlist_node byspi; + struct hlist_node byseq; refcount_t refcnt; spinlock_t lock; diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index ce66323102f9..d12bb906c9c9 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -131,6 +131,13 @@ __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } +static inline unsigned int +__xfrm_seq_hash(u32 seq, unsigned int hmask) +{ + unsigned int h = seq; + return (h ^ (h >> 10) ^ (h >> 20)) & hmask; +} + static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 4496f7efa220..8f6058e56f7f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -78,10 +78,16 @@ xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } +static unsigned int xfrm_seq_hash(struct net *net, u32 seq) +{ + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); +} + static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, struct hlist_head *nspitable, + struct hlist_head *nseqtable, unsigned int nhashmask) { struct hlist_node *tmp; @@ -106,6 +112,11 @@ static void xfrm_hash_transfer(struct hlist_head *list, nhashmask); hlist_add_head_rcu(&x->byspi, nspitable + h); } + + if (x->km.seq) { + h = __xfrm_seq_hash(x->km.seq, nhashmask); + hlist_add_head_rcu(&x->byseq, nseqtable + h); + } } } @@ -117,7 +128,7 @@ static unsigned long xfrm_hash_new_size(unsigned int state_hmask) static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); - struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi; + struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq; unsigned long nsize, osize; unsigned int nhashmask, ohashmask; int i; @@ -137,6 +148,13 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(nsrc, nsize); return; } + nseq = xfrm_hash_alloc(nsize); + if (!nseq) { + xfrm_hash_free(ndst, nsize); + xfrm_hash_free(nsrc, nsize); + xfrm_hash_free(nspi, nsize); + return; + } spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); @@ -144,15 +162,17 @@ static void xfrm_hash_resize(struct work_struct *work) nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) - xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask); + xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask); osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); + oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net); ohashmask = net->xfrm.state_hmask; rcu_assign_pointer(net->xfrm.state_bydst, ndst); rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); rcu_assign_pointer(net->xfrm.state_byspi, nspi); + rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_state_hash_generation); @@ -165,6 +185,7 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); + xfrm_hash_free(oseq, osize); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); @@ -621,6 +642,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); + INIT_HLIST_NODE(&x->byseq); hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); x->mtimer.function = xfrm_timer_handler; timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); @@ -664,6 +686,8 @@ int __xfrm_state_delete(struct xfrm_state *x) list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); + if (x->km.seq) + hlist_del_rcu(&x->byseq); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1148,6 +1172,10 @@ found: h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } + if (x->km.seq) { + h = xfrm_seq_hash(net, x->km.seq); + hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), @@ -1263,6 +1291,12 @@ static void __xfrm_state_insert(struct xfrm_state *x) hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } + if (x->km.seq) { + h = xfrm_seq_hash(net, x->km.seq); + + hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + } + hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); @@ -1932,20 +1966,18 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) { - int i; - - for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct xfrm_state *x; + unsigned int h = xfrm_seq_hash(net, seq); + struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { - if (x->km.seq == seq && - (mark & x->mark.m) == x->mark.v && - x->km.state == XFRM_STATE_ACQ) { - xfrm_state_hold(x); - return x; - } + hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { + if (x->km.seq == seq && + (mark & x->mark.m) == x->mark.v && + x->km.state == XFRM_STATE_ACQ) { + xfrm_state_hold(x); + return x; } } + return NULL; } @@ -2660,6 +2692,9 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_byspi = xfrm_hash_alloc(sz); if (!net->xfrm.state_byspi) goto out_byspi; + net->xfrm.state_byseq = xfrm_hash_alloc(sz); + if (!net->xfrm.state_byseq) + goto out_byseq; net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; @@ -2669,6 +2704,8 @@ int __net_init xfrm_state_init(struct net *net) &net->xfrm.xfrm_state_lock); return 0; +out_byseq: + xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: xfrm_hash_free(net->xfrm.state_bysrc, sz); out_bysrc: @@ -2688,6 +2725,8 @@ void xfrm_state_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.state_all)); sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); + WARN_ON(!hlist_empty(net->xfrm.state_byseq)); + xfrm_hash_free(net->xfrm.state_byseq, sz); WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); -- cgit v1.2.3-58-ga151 From bbc6f2cca74e548914a7705f5c39549c28ab8815 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 14 May 2021 10:32:33 +0300 Subject: net: bridge: fix br_multicast_is_router stub when igmp is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit br_multicast_is_router takes two arguments when bridge IGMP is enabled and just one when it's disabled, fix the stub to take two as well. Fixes: 1a3065a26807 ("net: bridge: mcast: prepare is-router function for mcast router split") Signed-off-by: Nikolay Aleksandrov Acked-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_private.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 03197ab4af76..ec661130c2d0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1068,7 +1068,8 @@ static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, { } -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline bool br_multicast_is_router(struct net_bridge *br, + struct sk_buff *skb) { return false; } -- cgit v1.2.3-58-ga151 From 30515832e987597eae354f6ffcdb3374bdfde16d Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 14 May 2021 03:53:48 +0200 Subject: net: bridge: fix build when IPv6 is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The br_ip6_multicast_add_router() prototype is defined only when CONFIG_IPV6 is enabled, but the function is always referenced, so there is this build error with CONFIG_IPV6 not defined: net/bridge/br_multicast.c: In function ‘__br_multicast_enable_port’: net/bridge/br_multicast.c:1743:3: error: implicit declaration of function ‘br_ip6_multicast_add_router’; did you mean ‘br_ip4_multicast_add_router’? [-Werror=implicit-function-declaration] 1743 | br_ip6_multicast_add_router(br, port); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ | br_ip4_multicast_add_router net/bridge/br_multicast.c: At top level: net/bridge/br_multicast.c:2804:13: warning: conflicting types for ‘br_ip6_multicast_add_router’ 2804 | static void br_ip6_multicast_add_router(struct net_bridge *br, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ net/bridge/br_multicast.c:2804:13: error: static declaration of ‘br_ip6_multicast_add_router’ follows non-static declaration net/bridge/br_multicast.c:1743:3: note: previous implicit declaration of ‘br_ip6_multicast_add_router’ was here 1743 | br_ip6_multicast_add_router(br, port); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this build error by moving the definition out of the #ifdef. Fixes: a3c02e769efe ("net: bridge: mcast: split multicast router state for IPv4 and IPv6") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 0703725527b3..53c3a9d80d9c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -62,9 +62,9 @@ static void br_multicast_port_group_rexmit(struct timer_list *t); static void br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); -#if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_add_router(struct net_bridge *br, struct net_bridge_port *port); +#if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, -- cgit v1.2.3-58-ga151 From 709c0314239992162cba26a860f04319a15860c4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 14 May 2021 13:04:25 -0700 Subject: tcp: add tracepoint for checksum errors Add a tracepoint for capturing TCP segments with a bad checksum. This makes it easy to identify sources of bad frames in the fleet (e.g. machines with faulty NICs). It should also help tools like IOvisor's tcpdrop.py which are used today to get detailed information about such packets. We don't have a socket in many cases so we must open code the address extraction based just on the skb. v2: add missing export for ipv6=m Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/net-traces.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 3 ++ net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 83 insertions(+) (limited to 'net') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index ba94857eea11..521059d8dc0a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -295,6 +295,82 @@ TRACE_EVENT(tcp_probe, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); +#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ + do { \ + const struct tcphdr *th = (const struct tcphdr *)skb->data; \ + struct sockaddr_in *v4 = (void *)__entry->saddr; \ + \ + v4->sin_family = AF_INET; \ + v4->sin_port = th->source; \ + v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ + v4 = (void *)__entry->daddr; \ + v4->sin_family = AF_INET; \ + v4->sin_port = th->dest; \ + v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) + +#define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ + do { \ + const struct iphdr *iph = ip_hdr(skb); \ + \ + if (iph->version == 6) { \ + const struct tcphdr *th = (const struct tcphdr *)skb->data; \ + struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ + \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = th->source; \ + v6->sin6_addr = ipv6_hdr(skb)->saddr; \ + v6 = (void *)__entry->daddr; \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = th->dest; \ + v6->sin6_addr = ipv6_hdr(skb)->daddr; \ + } else \ + TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ + } while (0) + +#else + +#define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ + TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) + +#endif + +/* + * tcp event with only skb + */ +DECLARE_EVENT_CLASS(tcp_event_skb, + + TP_PROTO(const struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field(const void *, skbaddr) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS_SKB(__entry, skb); + ), + + TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr) +); + +DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, + + TP_PROTO(const struct sk_buff *skb), + + TP_ARGS(skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 283ddb2dbc7d..c40cd8dd75c7 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf4dd532d1c..cd52ce0a2a85 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,6 +5885,7 @@ step5: return; csum_error: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 312184cead57..4f5b68a90be9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1731,6 +1731,7 @@ discard: return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1801,6 +1802,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); + trace_tcp_bad_csum(skb); __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -2098,6 +2100,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f47c0b6e3de..4435fa342e7a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1538,6 +1538,7 @@ discard: kfree_skb(skb); return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1754,6 +1755,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); -- cgit v1.2.3-58-ga151 From 9a959cab22194d633b3a1d9d1943b0df3475122c Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 17 May 2021 09:11:48 +0200 Subject: batman-adv: Start new development cycle This version will contain all the (major or even only minor) changes for Linux 5.14. The version number isn't a semantic version number with major and minor information. It is just encoding the year of the expected publishing as Linux -rc1 and the number of published versions this year (starting at 0). Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8f0102b71656..b8f819a53a86 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2021.1" +#define BATADV_SOURCE_VERSION "2021.2" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3-58-ga151 From d295345abb3e91e5a16f3293eb12b111e352bd2b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 10 May 2021 15:05:42 +0200 Subject: batman-adv: Always send iface index+name in genlmsg The batman-adv netlink messages often contain the interface index and interface name in the same message. This makes it easy for the receiver to operate on the incoming data when it either needs to print something or needs to operate on the interface index. But one of the attributes was missing for: * neighbor table dumps * originator table dumps * gateway list dumps * query of hardif information * query of vid information The userspace therefore had to implement special workarounds using SIOCGIFNAME or SIOCGIFINDEX depending on what was actually provided. Providing both information simplifies the userspace code massively without adding a lot of extra overhead in the kernel portion. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 6 ++++++ net/batman-adv/bat_v.c | 10 ++++++++++ net/batman-adv/netlink.c | 8 ++++++++ 3 files changed, 24 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 789f257be24f..680def809838 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1849,6 +1849,8 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || @@ -2078,6 +2080,8 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -2459,6 +2463,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index e1ca2b8c3152..b98aea958e3d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -146,6 +146,8 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -298,6 +300,8 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || @@ -739,6 +743,12 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, goto out; } + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex)) { + genlmsg_cancel(msg, hdr); + goto out; + } + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index f317d206b411..b6cc746e01a6 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -814,6 +814,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -1045,6 +1049,10 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; -- cgit v1.2.3-58-ga151 From 3f69339068f93e206e581e6ab9927502f8722ac7 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 17 May 2021 00:33:07 +0200 Subject: batman-adv: bcast: queue per interface, if needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we schedule a broadcast packet like: 3x: [ [(re-)queue] --> for(hard-if): maybe-transmit ] The intention of queueing a broadcast packet multiple times is to increase robustness for wireless interfaces. However on interfaces which we only broadcast on once the queueing induces an unnecessary penalty. This patch restructures the queueing to be performed on a per interface basis: for(hard-if): - transmit - if wireless: [queue] --> transmit --> [requeue] --> transmit Next to the performance benefits on non-wireless interfaces this should also make it easier to apply alternative strategies for transmissions on wireless interfaces in the future (for instance sending via unicast transmissions on wireless interfaces, without queueing in batman-adv, if appropriate). Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 1 - net/batman-adv/routing.c | 9 +- net/batman-adv/send.c | 374 +++++++++++++++++++++++++++------------- net/batman-adv/send.h | 12 +- net/batman-adv/soft-interface.c | 12 +- 5 files changed, 270 insertions(+), 138 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index b8f819a53a86..014235fd4681 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -88,7 +88,6 @@ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 -#define BATADV_NUM_BCASTS_MAX 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 40f5cffde6a3..bb9e93e3d98c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1182,9 +1182,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_bcast_packet *bcast_packet; struct ethhdr *ethhdr; int hdr_size = sizeof(*bcast_packet); - int ret = NET_RX_DROP; s32 seq_diff; u32 seqno; + int ret; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -1210,7 +1210,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) goto free_skb; - if (bcast_packet->ttl < 2) + if (bcast_packet->ttl-- < 2) goto free_skb; orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig); @@ -1249,7 +1249,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet)); /* rebroadcast packet */ - batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false); + ret = batadv_forw_bcast_packet(bat_priv, skb, 0, false); + if (ret == NETDEV_TX_BUSY) + goto free_skb; /* don't hand the broadcast up if it is from an originator * from the same backbone. @@ -1275,6 +1277,7 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); free_skb: kfree_skb(skb); + ret = NET_RX_DROP; out: if (orig_node) batadv_orig_node_put(orig_node); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 157abe92d827..07b0ba265472 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -737,57 +737,48 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, } /** - * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends + * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to queue on * - * add a broadcast packet to the queue and setup timers. broadcast packets + * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * - * The skb is not consumed, so the caller should make sure that the - * skb is freed. - * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet) +static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) { - struct batadv_hard_iface *primary_if; struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; + unsigned long send_time = jiffies; struct sk_buff *newskb; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto err; - newskb = skb_copy(skb, GFP_ATOMIC); - if (!newskb) { - batadv_hardif_put(primary_if); + if (!newskb) goto err; - } - forw_packet = batadv_forw_packet_alloc(primary_if, NULL, + forw_packet = batadv_forw_packet_alloc(if_in, if_out, &bat_priv->bcast_queue_left, bat_priv, newskb); - batadv_hardif_put(primary_if); if (!forw_packet) goto err_packet_free; - /* as we have a copy now, it is safe to decrease the TTL */ - bcast_packet = (struct batadv_bcast_packet *)newskb->data; - bcast_packet->ttl--; - forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); - batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay); + send_time += delay ? delay : msecs_to_jiffies(5); + + batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return NETDEV_TX_OK; err_packet_free: @@ -796,10 +787,220 @@ err: return NETDEV_TX_BUSY; } +/** + * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to forward to + * + * Transmits a broadcast packet on the specified interface either immediately + * or if a delay is given after that. Furthermore, queues additional + * retransmissions if this interface is a wireless one. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) +{ + unsigned int num_bcasts = if_out->num_bcasts; + struct sk_buff *newskb; + int ret = NETDEV_TX_OK; + + if (!delay) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) + return NETDEV_TX_BUSY; + + batadv_send_broadcast_skb(newskb, if_out); + num_bcasts--; + } + + /* delayed broadcast or rebroadcasts? */ + if (num_bcasts >= 1) { + BATADV_SKB_CB(skb)->num_bcasts = num_bcasts; + + ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay, + own_packet, if_in, + if_out); + } + + return ret; +} + +/** + * batadv_send_no_broadcast() - check whether (re)broadcast is necessary + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to check + * @own_packet: true if it is a self-generated broadcast packet + * @if_out: the outgoing interface checked and considered for (re)broadcast + * + * Return: False if a packet needs to be (re)broadcasted on the given interface, + * true otherwise. + */ +static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, + struct sk_buff *skb, bool own_packet, + struct batadv_hard_iface *if_out) +{ + struct batadv_hardif_neigh_node *neigh_node = NULL; + struct batadv_bcast_packet *bcast_packet; + u8 *orig_neigh; + u8 *neigh_addr; + char *type; + int ret; + + if (!own_packet) { + neigh_addr = eth_hdr(skb)->h_source; + neigh_node = batadv_hardif_neigh_get(if_out, + neigh_addr); + } + + bcast_packet = (struct batadv_bcast_packet *)skb->data; + orig_neigh = neigh_node ? neigh_node->orig : NULL; + + ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, + orig_neigh); + + if (neigh_node) + batadv_hardif_neigh_put(neigh_node); + + /* ok, may broadcast */ + if (!ret) + return false; + + /* no broadcast */ + switch (ret) { + case BATADV_HARDIF_BCAST_NORECIPIENT: + type = "no neighbor"; + break; + case BATADV_HARDIF_BCAST_DUPFWD: + type = "single neighbor is source"; + break; + case BATADV_HARDIF_BCAST_DUPORIG: + type = "single neighbor is originator"; + break; + default: + type = "unknown"; + } + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "BCAST packet from orig %pM on %s suppressed: %s\n", + bcast_packet->orig, + if_out->net_dev->name, type); + + return true; +} + +/** + * __batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the given skb might not be + * modifiable anymore. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface *primary_if; + int ret = NETDEV_TX_OK; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return NETDEV_TX_BUSY; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + + if (batadv_send_no_broadcast(bat_priv, skb, own_packet, + hard_iface)) { + batadv_hardif_put(hard_iface); + continue; + } + + ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay, + own_packet, primary_if, + hard_iface); + batadv_hardif_put(hard_iface); + + if (ret == NETDEV_TX_BUSY) + break; + } + rcu_read_unlock(); + + batadv_hardif_put(primary_if); + return ret; +} + +/** + * batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); +} + +/** + * batadv_send_bcast_packet() - send and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Consumes the provided skb. + */ +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); + consume_skb(skb); +} + /** * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check - * @hard_iface: the interface to check on * * Checks whether a given packet has any (re)transmissions left on the provided * interface. @@ -811,28 +1012,20 @@ err: * Return: True if (re)transmissions are left, false otherwise. */ static bool -batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet, - struct batadv_hard_iface *hard_iface) +batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet) { - unsigned int max; - - if (hard_iface) - max = hard_iface->num_bcasts; - else - max = BATADV_NUM_BCASTS_MAX; - - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts < max; + return BATADV_SKB_CB(forw_packet->skb)->num_bcasts; } /** - * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a + * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a * packet - * @forw_packet: the packet to increase the counter for + * @forw_packet: the packet to decrease the counter for */ static void -batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) +batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet) { - BATADV_SKB_CB(forw_packet->skb)->num_bcasts++; + BATADV_SKB_CB(forw_packet->skb)->num_bcasts--; } /** @@ -843,30 +1036,30 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) */ bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) { - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts > 0; + unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts; + + return num_bcasts != forw_packet->if_outgoing->num_bcasts; } +/** + * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet + * @work: work queue item + * + * Transmits a queued broadcast packet and if necessary reschedules it. + */ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { - struct batadv_hard_iface *hard_iface; - struct batadv_hardif_neigh_node *neigh_node; - struct delayed_work *delayed_work; + unsigned long send_time = jiffies + msecs_to_jiffies(5); struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; - struct sk_buff *skb1; - struct net_device *soft_iface; + struct delayed_work *delayed_work; struct batadv_priv *bat_priv; - unsigned long send_time = jiffies + msecs_to_jiffies(5); + struct sk_buff *skb1; bool dropped = false; - u8 *neigh_addr; - u8 *orig_neigh; - int ret = 0; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); - soft_iface = forw_packet->if_incoming->soft_iface; - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; @@ -878,76 +1071,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) goto out; } - bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data; - - /* rebroadcast packet */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) - continue; - - if (!batadv_forw_packet_bcasts_left(forw_packet, hard_iface)) - continue; - - if (forw_packet->own) { - neigh_node = NULL; - } else { - neigh_addr = eth_hdr(forw_packet->skb)->h_source; - neigh_node = batadv_hardif_neigh_get(hard_iface, - neigh_addr); - } - - orig_neigh = neigh_node ? neigh_node->orig : NULL; - - ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig, - orig_neigh); - - if (ret) { - char *type; - - switch (ret) { - case BATADV_HARDIF_BCAST_NORECIPIENT: - type = "no neighbor"; - break; - case BATADV_HARDIF_BCAST_DUPFWD: - type = "single neighbor is source"; - break; - case BATADV_HARDIF_BCAST_DUPORIG: - type = "single neighbor is originator"; - break; - default: - type = "unknown"; - } - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n", - bcast_packet->orig, - hard_iface->net_dev->name, type); - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - continue; - } - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - if (!kref_get_unless_zero(&hard_iface->refcount)) - continue; - - /* send a copy of the saved skb */ - skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); - if (skb1) - batadv_send_broadcast_skb(skb1, hard_iface); - - batadv_hardif_put(hard_iface); - } - rcu_read_unlock(); + /* send a copy of the saved skb */ + skb1 = skb_copy(forw_packet->skb, GFP_ATOMIC); + if (!skb1) + goto out; - batadv_forw_packet_bcasts_inc(forw_packet); + batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing); + batadv_forw_packet_bcasts_dec(forw_packet); - /* if we still have some more bcasts to send */ - if (batadv_forw_packet_bcasts_left(forw_packet, NULL)) { + if (batadv_forw_packet_bcasts_left(forw_packet)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 2b0daf8b2bc4..08af251b765c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -39,10 +39,14 @@ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface); int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet); +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6b8181bc3122..a21884c0d47f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -191,7 +191,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; - unsigned long brd_delay = 1; + unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; @@ -330,7 +330,7 @@ send: bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; - bcast_packet->ttl = BATADV_TTL; + bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; @@ -346,13 +346,7 @@ send: seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); - batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true); - - /* a copy is stored in the bcast list, therefore removing - * the original skb. - */ - consume_skb(skb); - + batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ -- cgit v1.2.3-58-ga151 From 4cbf055002c53c364d1b3275792e4487af76dd2d Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 17 May 2021 00:33:08 +0200 Subject: batman-adv: bcast: avoid skb-copy for (re)queued broadcasts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Broadcast packets send via batadv_send_outstanding_bcast_packet() were originally copied in batadv_forw_bcast_packet_to_list() before being queued. And after that only the ethernet header will be pushed through batadv_send_broadcast_skb()->batadv_send_skb_packet() which works safely on skb clones as it uses batadv_skb_head_push()->skb_cow_head(). Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 07b0ba265472..0b9dd29d3b6a 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1072,7 +1072,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) } /* send a copy of the saved skb */ - skb1 = skb_copy(forw_packet->skb, GFP_ATOMIC); + skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (!skb1) goto out; -- cgit v1.2.3-58-ga151 From 94c821c74bf5fe0c25e09df5334a16f98608db90 Mon Sep 17 00:00:00 2001 From: Seth David Schoen Date: Wed, 12 May 2021 21:37:49 -0700 Subject: ip: Treat IPv4 segment's lowest address as unicast Treat only the highest, not the lowest, IPv4 address within a local subnet as a broadcast address. Signed-off-by: Seth David Schoen Suggested-by: John Gilmore Acked-by: Dave Taht Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..bfb345c88271 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1122,10 +1122,8 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); - /* Add network specific broadcasts, when it takes a sense */ + /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, - prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); } -- cgit v1.2.3-58-ga151 From 5796254e467bf1cff002df65fbb53ecef6a0e060 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 17 May 2021 20:22:05 +0800 Subject: net: Remove the member netns_ok Every protocol has the 'netns_ok' member and it is euqal to 1. The 'if (!prot->netns_ok)' always false in inet_add_protocol(). Signed-off-by: Yejune Deng Signed-off-by: David S. Miller --- include/net/protocol.h | 1 - net/dccp/ipv4.c | 1 - net/ipv4/af_inet.c | 4 ---- net/ipv4/gre_demux.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv4/protocol.c | 6 ------ net/ipv4/tunnel4.c | 3 --- net/ipv4/udplite.c | 1 - net/ipv4/xfrm4_protocol.c | 3 --- net/l2tp/l2tp_ip.c | 1 - net/sctp/protocol.c | 1 - 11 files changed, 23 deletions(-) (limited to 'net') diff --git a/include/net/protocol.h b/include/net/protocol.h index 2b778e1d2d8f..f51c06ae365f 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -43,7 +43,6 @@ struct net_protocol { int (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, - netns_ok:1, /* does the protocol do more stringent * icmp tag validation than simple * socket lookup? diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ffc601a3b329..f81c1df761d3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -977,7 +977,6 @@ static const struct net_protocol dccp_v4_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f17870ee558b..d9bccad65e2b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1720,7 +1720,6 @@ EXPORT_SYMBOL_GPL(snmp_fold_field64); #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, - .netns_ok = 1, }; #endif @@ -1733,7 +1732,6 @@ static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; @@ -1746,14 +1744,12 @@ static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, - .netns_ok = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5d1e6fe9d838..cbb2b4bb0dfa 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -195,7 +195,6 @@ static int gre_err(struct sk_buff *skb, u32 info) static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, - .netns_ok = 1, }; static int __init gre_init(void) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 939792a38814..12b564b1ecb4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3007,7 +3007,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = { #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9a8c0892622b..6913979948d7 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,12 +31,6 @@ EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - if (!prot->netns_ok) { - pr_err("Protocol %u is not namespace aware, cannot register.\n", - protocol); - return -EINVAL; - } - return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index e44aaf41a138..5048c47c79b2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -218,7 +218,6 @@ static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, - .netns_ok = 1, }; #if IS_ENABLED(CONFIG_IPV6) @@ -226,7 +225,6 @@ static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, - .netns_ok = 1, }; #endif @@ -235,7 +233,6 @@ static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index bd8773b49e72..cd1cd68adeec 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,6 @@ static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, - .netns_ok = 1, }; struct proto udplite_prot = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index ea595c8549c7..2fe5860c21d6 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -181,21 +181,18 @@ static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 97ae1255fcb6..536c30d4dd7d 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -635,7 +635,6 @@ static struct inet_protosw l2tp_ip_protosw = { static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, - .netns_ok = 1, }; static int __init l2tp_ip_init(void) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6f2bbfeec3a4..baa4e770e4ba 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1171,7 +1171,6 @@ static const struct net_protocol sctp_protocol = { .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; -- cgit v1.2.3-58-ga151 From 25c55b38d85b54e49f2c9a3c7d483d1a24173b94 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 17 May 2021 18:15:25 +0800 Subject: net/packet: Remove redundant assignment to ret Variable ret is set to '0' or '-EBUSY', but this value is never read as it is not used later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: net/packet/af_packet.c:3936:4: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores]. net/packet/af_packet.c:3933:4: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores]. No functional change. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S. Miller --- net/packet/af_packet.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ba96db1880ea..597d798ac0a5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3923,12 +3923,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, return -EFAULT; lock_sock(sk); - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { - ret = -EBUSY; - } else { + if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) po->tp_tx_has_off = !!val; - ret = 0; - } + release_sock(sk); return 0; } -- cgit v1.2.3-58-ga151 From c49661aa6f7097047b7e86ad37b1cf308a7a8d4f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 16 May 2021 19:23:48 -0700 Subject: skmsg: Remove unused parameters of sk_msg_wait_data() 'err' and 'flags' are not used, we can just get rid of them. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210517022348.50555-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 3 +-- net/core/skmsg.c | 3 +-- net/ipv4/tcp_bpf.c | 9 ++------- net/ipv4/udp_bpf.c | 8 ++------ 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index aba0f0f429be..fcaa9a7996c8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -126,8 +126,7 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..f0b9decdf279 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,8 +399,7 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..a80de92ea3b6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -184,11 +184,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -196,14 +196,9 @@ msg_bytes_ready: sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; -out: release_sock(sk); sk_psock_put(sk, psock); return ret; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..b07e4b6dda25 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -43,21 +43,17 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; -- cgit v1.2.3-58-ga151 From fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e Mon Sep 17 00:00:00 2001 From: Dongseok Yi Date: Wed, 12 May 2021 16:27:33 +0900 Subject: bpf: Check for BPF_F_ADJ_ROOM_FIXED_GSO when bpf_skb_change_proto In the forwarding path GRO -> BPF 6 to 4 -> GSO for TCP traffic, the coalesced packet payload can be > MSS, but < MSS + 20. bpf_skb_proto_6_to_4() will upgrade the MSS and it can be > the payload length. After then tcp_gso_segment checks for the payload length if it is <= MSS. The condition is causing the packet to be dropped. tcp_gso_segment(): [...] mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; [...] Allow to upgrade/downgrade MSS only when BPF_F_ADJ_ROOM_FIXED_GSO is not set. Signed-off-by: Dongseok Yi Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/1620804453-57566-1-git-send-email-dseok.yi@samsung.com --- net/core/filter.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index cae56d08a670..582ac196fd94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3235,7 +3235,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) return ret; } -static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3264,7 +3264,9 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) } /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3276,7 +3278,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return 0; } -static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3305,7 +3307,9 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) } /* Due to IPv4 header, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3317,17 +3321,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return 0; } -static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) - return bpf_skb_proto_4_to_6(skb); + return bpf_skb_proto_4_to_6(skb, flags); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) - return bpf_skb_proto_6_to_4(skb); + return bpf_skb_proto_6_to_4(skb, flags); return -ENOTSUPP; } @@ -3337,7 +3341,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, { int ret; - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO))) return -EINVAL; /* General idea is that this helper does the basic groundwork @@ -3357,7 +3361,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ - ret = bpf_skb_proto_xlat(skb, proto); + ret = bpf_skb_proto_xlat(skb, proto, flags); bpf_compute_data_pointers(skb); return ret; } -- cgit v1.2.3-58-ga151 From 2e68ea92684181412b73979baf1af7d04619c52c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:17 +0300 Subject: ipv4: Calculate multipath hash inside switch statement A subsequent patch will add another multipath hash policy where the multipath hash is calculated directly by the policy specific code and not outside of the switch statement. Prepare for this change by moving the multipath hash calculation inside the switch statement. No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6787c55f6ab..9d61e969446e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1912,7 +1912,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (net->ipv4.sysctl_fib_multipath_hash_policy) { case 0: @@ -1924,6 +1924,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -1957,6 +1958,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -1987,9 +1989,9 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); break; } - mhash = flow_hash_from_keys(&hash_keys); if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); -- cgit v1.2.3-58-ga151 From ce5c9c20d364f156c885efed8c71fca2945db00f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:18 +0300 Subject: ipv4: Add a sysctl to control multipath hash fields A subsequent patch will add a new multipath hash policy where the packet fields used for multipath hash calculation are determined by user space. This patch adds a sysctl that allows user space to set these fields. The packet fields are represented using a bitmask and are common between IPv4 and IPv6 to allow user space to use the same numbering across both protocols. For example, to hash based on standard 5-tuple: # sysctl -w net.ipv4.fib_multipath_hash_fields=0x0037 net.ipv4.fib_multipath_hash_fields = 0x0037 The kernel rejects unknown fields, for example: # sysctl -w net.ipv4.fib_multipath_hash_fields=0x1000 sysctl: setting key "net.ipv4.fib_multipath_hash_fields": Invalid argument More fields can be added in the future, if needed. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++++++++++++++++++++ include/net/ip_fib.h | 43 ++++++++++++++++++++++++++++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/fib_frontend.c | 6 +++++ net/ipv4/sysctl_net_ipv4.c | 12 ++++++++++ 5 files changed, 89 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index c2ecc9894fd0..47494798d03b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -100,6 +100,33 @@ fib_multipath_hash_policy - INTEGER - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present +fib_multipath_hash_fields - UNSIGNED INTEGER + When fib_multipath_hash_policy is set to 3 (custom multipath hash), the + fields used for multipath hash calculation are determined by this + sysctl. + + This value is a bitmask which enables various fields for multipath hash + calculation. + + Possible fields are: + + ====== ============================ + 0x0001 Source IP address + 0x0002 Destination IP address + 0x0004 IP protocol + 0x0008 Unused (Flow Label) + 0x0010 Source port + 0x0020 Destination port + 0x0040 Inner source IP address + 0x0080 Inner destination IP address + 0x0100 Inner IP protocol + 0x0200 Inner Flow Label + 0x0400 Inner source port + 0x0800 Inner destination port + ====== ============================ + + Default: 0x0007 (source IP, destination IP and IP protocol) + fib_sync_mem - UNSIGNED INTEGER Amount of dirty memory from fib entries that can be backlogged before synchronize_rcu is forced. diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a914f33f3ed5..3ab2563b1a23 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -466,6 +466,49 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); +/* Fields used for sysctl_fib_multipath_hash_fields. + * Common to IPv4 and IPv6. + * + * Add new fields at the end. This is user API. + */ +#define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) +#define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) +#define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) +#define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) +#define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) +#define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) +#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) +#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) +#define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) +#define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) +#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) +#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) + +#define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ + (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ + FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ + FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ + FIB_MULTIPATH_HASH_FIELD_DST_PORT) + +#define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ + (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ + FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ + FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ + FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + +#define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ + (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ + FIB_MULTIPATH_HASH_FIELD_INNER_MASK) + +#define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ + (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ + FIB_MULTIPATH_HASH_FIELD_DST_IP | \ + FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f6af8d96d3c6..746c80cd4257 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -210,6 +210,7 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH + u32 sysctl_fib_multipath_hash_fields; u8 sysctl_fib_multipath_use_neigh; u8 sysctl_fib_multipath_hash_policy; #endif diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index bfb345c88271..af8814a11378 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1514,6 +1514,12 @@ static int __net_init ip_fib_net_init(struct net *net) if (err) return err; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* Default to 3-tuple */ + net->ipv4.sysctl_fib_multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; +#endif + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a62934b9f15a..45bab3733621 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1052,6 +1055,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &fib_multipath_hash_fields_all_mask, + }, #endif { .procname = "ip_unprivileged_port_start", -- cgit v1.2.3-58-ga151 From 4253b4986f98da4bfcb6a24d3fc6ff19f28e8420 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:19 +0300 Subject: ipv4: Add custom multipath hash policy Add a new multipath hash policy where the packet fields used for hash calculation are determined by user space via the fib_multipath_hash_fields sysctl that was introduced in the previous patch. The current set of available packet fields includes both outer and inner fields, which requires two invocations of the flow dissector. Avoid unnecessary dissection of the outer or inner flows by skipping dissection if none of the outer or inner fields are required. In accordance with the existing policies, when an skb is not available, packet fields are extracted from the provided flow key. In which case, only outer fields are considered. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 + net/ipv4/route.c | 121 +++++++++++++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 3 +- 3 files changed, 125 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 47494798d03b..afdcdc0691d6 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -99,6 +99,8 @@ fib_multipath_hash_policy - INTEGER - 0 - Layer 3 - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present + - 3 - Custom multipath hash. Fields used for multipath hash calculation + are determined by fib_multipath_hash_fields sysctl fib_multipath_hash_fields - UNSIGNED INTEGER When fib_multipath_hash_policy is set to 3 (custom multipath hash), the diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9d61e969446e..a4c477475f4c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1906,6 +1906,121 @@ out: hash_keys->addrs.v4addrs.dst = key_iph->daddr; } +static u32 fib_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 fib_multipath_custom_hash_fl4(const struct net *net, + const struct flowi4 *fl4) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = fl4->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = fl4->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl4->flowi4_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl4->fl4_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) @@ -1991,6 +2106,12 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } mhash = flow_hash_from_keys(&hash_keys); break; + case 3: + if (skb) + mhash = fib_multipath_custom_hash_skb(net, skb); + else + mhash = fib_multipath_custom_hash_fl4(net, fl4); + break; } if (multipath_hash) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 45bab3733621..ffb38ea06841 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -30,6 +30,7 @@ #include static int two = 2; +static int three __maybe_unused = 3; static int four = 4; static int thousand = 1000; static int tcp_retr1_max = 255; @@ -1053,7 +1054,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "fib_multipath_hash_fields", -- cgit v1.2.3-58-ga151 From 67db5ca73b1f98584ae9b6ed35c1c670677c9001 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:20 +0300 Subject: ipv6: Use a more suitable label name The 'out_timer' label was added in commit 63152fc0de4d ("[NETNS][IPV6] ip6_fib - gc timer per namespace") when the timer was allocated on the heap. Commit 417f28bb3407 ("netns: dont alloc ipv6 fib timer list") removed the allocation, but kept the label name. Rename it to a more suitable name. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 679699e953f1..33d2d6a4e28c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2362,7 +2362,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) - goto out_timer; + goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); @@ -2407,7 +2407,7 @@ out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); -out_timer: +out_notifier: fib6_notifier_exit(net); return -ENOMEM; } -- cgit v1.2.3-58-ga151 From b95b6e072a92042320fad99de658008cc0beb3b0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:21 +0300 Subject: ipv6: Calculate multipath hash inside switch statement A subsequent patch will add another multipath hash policy where the multipath hash is calculated directly by the policy specific code and not outside of the switch statement. Prepare for this change by moving the multipath hash calculation inside the switch statement. No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a22822bdbf39..9935e18146e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2331,7 +2331,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: @@ -2345,6 +2345,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: if (skb) { @@ -2376,6 +2377,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2412,9 +2414,9 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; } - mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } -- cgit v1.2.3-58-ga151 From ed13923f980ef84dde0b9010b9e09052dc31a909 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:22 +0300 Subject: ipv6: Add a sysctl to control multipath hash fields A subsequent patch will add a new multipath hash policy where the packet fields used for multipath hash calculation are determined by user space. This patch adds a sysctl that allows user space to set these fields. The packet fields are represented using a bitmask and are common between IPv4 and IPv6 to allow user space to use the same numbering across both protocols. For example, to hash based on standard 5-tuple: # sysctl -w net.ipv6.fib_multipath_hash_fields=0x0037 net.ipv6.fib_multipath_hash_fields = 0x0037 To avoid introducing holes in 'struct netns_sysctl_ipv6', move the 'bindv6only' field after the multipath hash fields. The kernel rejects unknown fields, for example: # sysctl -w net.ipv6.fib_multipath_hash_fields=0x1000 sysctl: setting key "net.ipv6.fib_multipath_hash_fields": Invalid argument Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++++++++++++++++++++++++++ include/net/ipv6.h | 8 ++++++++ include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_fib.c | 5 +++++ net/ipv6/sysctl_net_ipv6.c | 12 ++++++++++++ 5 files changed, 54 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index afdcdc0691d6..4246cc4ae35b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1773,6 +1773,33 @@ fib_multipath_hash_policy - INTEGER - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present +fib_multipath_hash_fields - UNSIGNED INTEGER + When fib_multipath_hash_policy is set to 3 (custom multipath hash), the + fields used for multipath hash calculation are determined by this + sysctl. + + This value is a bitmask which enables various fields for multipath hash + calculation. + + Possible fields are: + + ====== ============================ + 0x0001 Source IP address + 0x0002 Destination IP address + 0x0004 IP protocol + 0x0008 Flow Label + 0x0010 Source port + 0x0020 Destination port + 0x0040 Inner source IP address + 0x0080 Inner destination IP address + 0x0100 Inner IP protocol + 0x0200 Inner Flow Label + 0x0400 Inner source port + 0x0800 Inner destination port + ====== ============================ + + Default: 0x0007 (source IP, destination IP and IP protocol) + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 448bf2b34759..f2d0ecc257bb 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -926,11 +926,19 @@ static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } +static inline u32 ip6_multipath_hash_fields(const struct net *net) +{ + return net->ipv6.sysctl.multipath_hash_fields; +} #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } +static inline u32 ip6_multipath_hash_fields(const struct net *net) +{ + return 0; +} #endif /* diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 6153c8067009..bde0b7adb4a3 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,8 +28,9 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; - u8 bindv6only; + u32 multipath_hash_fields; u8 multipath_hash_policy; + u8 bindv6only; u8 flowlabel_consistency; u8 auto_flowlabels; int icmpv6_time; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 33d2d6a4e28c..2d650dc24349 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -2355,6 +2356,10 @@ static int __net_init fib6_net_init(struct net *net) if (err) return err; + /* Default to 3-tuple */ + net->ipv6.sysctl.multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; + spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 27102c3d6e1d..ce23c8f7ceb3 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_NETLABEL #include #endif @@ -24,6 +25,8 @@ static int two = 2; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static u32 rt6_multipath_hash_fields_all_mask = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -151,6 +154,15 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv6.sysctl.multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &rt6_multipath_hash_fields_all_mask, + }, { .procname = "seg6_flowlabel", .data = &init_net.ipv6.sysctl.seg6_flowlabel, -- cgit v1.2.3-58-ga151 From 73c2c5cbb15a8a82d5bea52594b0beb038963bcc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 May 2021 21:15:23 +0300 Subject: ipv6: Add custom multipath hash policy Add a new multipath hash policy where the packet fields used for hash calculation are determined by user space via the fib_multipath_hash_fields sysctl that was introduced in the previous patch. The current set of available packet fields includes both outer and inner fields, which requires two invocations of the flow dissector. Avoid unnecessary dissection of the outer or inner flows by skipping dissection if none of the outer or inner fields are required. In accordance with the existing policies, when an skb is not available, packet fields are extracted from the provided flow key. In which case, only outer fields are considered. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 + net/ipv6/route.c | 125 +++++++++++++++++++++++++++++++++ net/ipv6/sysctl_net_ipv6.c | 3 +- 3 files changed, 129 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 4246cc4ae35b..a5c250044500 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1772,6 +1772,8 @@ fib_multipath_hash_policy - INTEGER - 0 - Layer 3 (source and destination addresses plus flow label) - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present + - 3 - Custom multipath hash. Fields used for multipath hash calculation + are determined by fib_multipath_hash_fields sysctl fib_multipath_hash_fields - UNSIGNED INTEGER When fib_multipath_hash_policy is set to 3 (custom multipath hash), the diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9935e18146e5..c46889381ae4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2326,6 +2326,125 @@ out: } } +static u32 rt6_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 rt6_multipath_custom_hash_fl6(const struct net *net, + const struct flowi6 *fl6) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = fl6->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = fl6->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl6->flowi6_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl6->fl6_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) @@ -2416,6 +2535,12 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, } mhash = flow_hash_from_keys(&hash_keys); break; + case 3: + if (skb) + mhash = rt6_multipath_custom_hash_skb(net, skb); + else + mhash = rt6_multipath_custom_hash_fl6(net, fl6); + break; } return mhash >> 1; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index ce23c8f7ceb3..160bea5db973 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ #endif static int two = 2; +static int three = 3; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = @@ -152,7 +153,7 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "fib_multipath_hash_fields", -- cgit v1.2.3-58-ga151 From 4ac9e23cf2ccdad99dbc57d7d1bf264d53d23057 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 18 May 2021 17:11:41 +0800 Subject: cipso: correct comments of cipso_v4_cache_invalidate() Since cipso_v4_cache_invalidate() has no return value, so drop related descriptions in its comments. Fixes: 446fda4f2682 ("[NetLabel]: CIPSOv4 engine") Signed-off-by: Zheng Yejian Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index bfaf327e9d12..d6e3a92841e3 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -187,8 +187,7 @@ static int __init cipso_v4_cache_init(void) * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: - * Invalidates and frees any entries in the CIPSO cache. Returns zero on - * success and negative values on failure. + * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) -- cgit v1.2.3-58-ga151 From e2bd6bad9c1e976674de7d714a5c1567281a0843 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 18 May 2021 21:03:58 +0800 Subject: net: dcb: Remove unnecessary INIT_LIST_HEAD() The list_head dcb_app_list is initialized statically. It is unnecessary to initialize by INIT_LIST_HEAD(). Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 653e3bc9c87b..51f80a2f8194 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2075,8 +2075,6 @@ EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static int __init dcbnl_init(void) { - INIT_LIST_HEAD(&dcb_app_list); - rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); -- cgit v1.2.3-58-ga151 From 79a7f8bdb159d9914b58740f3d31d602a6e4aca8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 May 2021 17:36:03 -0700 Subject: bpf: Introduce bpf_sys_bpf() helper and program type. Add placeholders for bpf_sys_bpf() helper and new program type. Make sure to check that expected_attach_type is zero for future extensibility. Allow tracing helper functions to be used in this program type, since they will only execute from user context via bpf_prog_test_run. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210514003623.28033-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 10 ++++++++ include/linux/bpf_types.h | 2 ++ include/uapi/linux/bpf.h | 8 +++++++ kernel/bpf/syscall.c | 53 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 8 +++++++ net/bpf/test_run.c | 43 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 8 +++++++ 7 files changed, 132 insertions(+) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 02b02cb29ce2..04a2bf41ae72 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1826,6 +1826,9 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +int bpf_prog_test_run_syscall(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1851,6 +1854,13 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } + +static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index f883f01a5061..a9db1eae6796 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -77,6 +77,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ #endif +BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, + void *, void *) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ec6d85a81744..c92648f38144 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -937,6 +937,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ }; enum bpf_attach_type { @@ -4735,6 +4736,12 @@ union bpf_attr { * be zero-terminated except when **str_size** is 0. * * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4903,6 +4910,7 @@ union bpf_attr { FN(check_mtu), \ FN(for_each_map_elem), \ FN(snprintf), \ + FN(sys_bpf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 941ca06d9dfa..b1e7352919cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2014,6 +2014,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; + case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -4508,3 +4509,55 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz return err; } + +static bool syscall_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= U16_MAX) + return false; + if (off % size != 0) + return false; + return true; +} + +BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +{ + return -EINVAL; +} + +const struct bpf_func_proto bpf_sys_bpf_proto = { + .func = bpf_sys_bpf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +const struct bpf_func_proto * __weak +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +static const struct bpf_func_proto * +syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sys_bpf: + return &bpf_sys_bpf_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + +const struct bpf_verifier_ops bpf_syscall_verifier_ops = { + .get_func_proto = syscall_prog_func_proto, + .is_valid_access = syscall_prog_is_valid_access, +}; + +const struct bpf_prog_ops bpf_syscall_prog_ops = { + .test_run = bpf_prog_test_run_syscall, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdfdb54676ea..37407d8fbca4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13196,6 +13196,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) int ret; u64 key; + if (prog->type == BPF_PROG_TYPE_SYSCALL) { + if (prog->aux->sleepable) + /* attach_btf_id checked to be zero already */ + return 0; + verbose(env, "Syscall programs can only be sleepable\n"); + return -EINVAL; + } + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM) { verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a5d72c48fb66..a6972d7ddf80 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -918,3 +918,46 @@ out: kfree(user_ctx); return ret; } + +int bpf_prog_test_run_syscall(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + void *ctx = NULL; + u32 retval; + int err = 0; + + /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat || kattr->test.flags) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > U16_MAX) + return -EINVAL; + + if (ctx_size_in) { + ctx = kzalloc(ctx_size_in, GFP_USER); + if (!ctx) + return -ENOMEM; + if (copy_from_user(ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } + retval = bpf_prog_run_pin_on_cpu(prog, ctx); + + if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { + err = -EFAULT; + goto out; + } + if (ctx_size_in) + if (copy_to_user(ctx_in, ctx, ctx_size_in)) + err = -EFAULT; +out: + kfree(ctx); + return err; +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ec6d85a81744..c92648f38144 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -937,6 +937,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ }; enum bpf_attach_type { @@ -4735,6 +4736,12 @@ union bpf_attr { * be zero-terminated except when **str_size** is 0. * * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4903,6 +4910,7 @@ union bpf_attr { FN(check_mtu), \ FN(for_each_map_elem), \ FN(snprintf), \ + FN(sys_bpf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-58-ga151 From af2ac3e13e45752af03c8a933f9b6e18841b128b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 May 2021 17:36:05 -0700 Subject: bpf: Prepare bpf syscall to be used from kernel and user space. With the help from bpfptr_t prepare relevant bpf syscall commands to be used from kernel and user space. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210514003623.28033-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 ++-- kernel/bpf/bpf_iter.c | 13 +++--- kernel/bpf/syscall.c | 113 +++++++++++++++++++++++++++++++++----------------- kernel/bpf/verifier.c | 34 ++++++++------- net/bpf/test_run.c | 2 +- 5 files changed, 104 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 04a2bf41ae72..7fd53380c981 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1428,7 +1429,7 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); -int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); @@ -1459,7 +1460,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); -int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, +int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and @@ -1479,8 +1480,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, - union bpf_attr __user *uattr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 931870f9cf56..2d4fbdbb194e 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -473,15 +473,16 @@ bool bpf_link_is_iter(struct bpf_link *link) return link->ops == &bpf_iter_link_lops; } -int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_prog *prog) { - union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; bool existed = false; + bpfptr_t ulinfo; int err; if (attr->link_create.target_fd || attr->link_create.flags) @@ -489,18 +490,18 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) memset(&linfo, 0, sizeof(union bpf_iter_link_info)); - ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel); linfo_len = attr->link_create.iter_info_len; - if (!ulinfo ^ !linfo_len) + if (bpfptr_is_null(ulinfo) ^ !linfo_len) return -EINVAL; - if (ulinfo) { + if (!bpfptr_is_null(ulinfo)) { err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), linfo_len); if (err) return err; linfo_len = min_t(u32, linfo_len, sizeof(linfo)); - if (copy_from_user(&linfo, ulinfo, linfo_len)) + if (copy_from_bpfptr(&linfo, ulinfo, linfo_len)) return -EFAULT; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b1e7352919cb..28387fe149ba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -72,11 +72,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -int bpf_check_uarg_tail_zero(void __user *uaddr, +int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size) { - unsigned char __user *addr = uaddr + expected_size; int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ @@ -85,7 +84,12 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, if (actual_size <= expected_size) return 0; - res = check_zeroed_user(addr, actual_size - expected_size); + if (uaddr.is_kernel) + res = memchr_inv(uaddr.kernel + expected_size, 0, + actual_size - expected_size) == NULL; + else + res = check_zeroed_user(uaddr.user + expected_size, + actual_size - expected_size); if (res < 0) return res; return res ? 0 : -E2BIG; @@ -1004,6 +1008,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) return NULL; } +static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) +{ + if (key_size) + return memdup_bpfptr(ukey, key_size); + + if (!bpfptr_is_null(ukey)) + return ERR_PTR(-EINVAL); + + return NULL; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags @@ -1074,10 +1089,10 @@ err_put: #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags -static int map_update_elem(union bpf_attr *attr) +static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); - void __user *uvalue = u64_to_user_ptr(attr->value); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); + bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -1103,7 +1118,7 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1123,7 +1138,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, value_size) != 0) + if (copy_from_bpfptr(value, uvalue, value_size) != 0) goto free_value; err = bpf_map_update_value(map, f, key, value, attr->flags); @@ -2076,7 +2091,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd -static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2101,8 +2116,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -EPERM; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_user_ptr(attr->license), - sizeof(license) - 1) < 0) + if (strncpy_from_bpfptr(license, + make_bpfptr(attr->license, uattr.is_kernel), + sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -2186,8 +2202,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->len = attr->insn_cnt; err = -EFAULT; - if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - bpf_prog_insn_size(prog)) != 0) + if (copy_from_bpfptr(prog->insns, + make_bpfptr(attr->insns, uattr.is_kernel), + bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; prog->orig_prog = NULL; @@ -3423,7 +3440,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 ulen; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -3702,7 +3719,7 @@ static int bpf_map_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -3745,7 +3762,7 @@ static int bpf_btf_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; @@ -3762,7 +3779,7 @@ static int bpf_link_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -4023,13 +4040,14 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_prog *prog) { if (attr->link_create.attach_type != prog->expected_attach_type) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + return bpf_iter_link_attach(attr, uattr, prog); else if (prog->type == BPF_PROG_TYPE_EXT) return bpf_tracing_prog_attach(prog, attr->link_create.target_fd, @@ -4038,7 +4056,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * } #define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len -static int link_create(union bpf_attr *attr) +static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; struct bpf_prog *prog; @@ -4057,7 +4075,7 @@ static int link_create(union bpf_attr *attr) goto out; if (prog->type == BPF_PROG_TYPE_EXT) { - ret = tracing_bpf_link_attach(attr, prog); + ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; } @@ -4078,7 +4096,7 @@ static int link_create(union bpf_attr *attr) ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, prog); + ret = tracing_bpf_link_attach(attr, uattr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -4366,7 +4384,7 @@ out_prog_put: return ret; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; @@ -4381,7 +4399,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz /* copy attributes from user space, may be less than sizeof(bpf_attr) */ memset(&attr, 0, sizeof(attr)); - if (copy_from_user(&attr, uattr, size) != 0) + if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; err = security_bpf(cmd, &attr, size); @@ -4396,7 +4414,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_lookup_elem(&attr); break; case BPF_MAP_UPDATE_ELEM: - err = map_update_elem(&attr); + err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: err = map_delete_elem(&attr); @@ -4423,21 +4441,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr); + err = bpf_prog_query(&attr, uattr.user); break; case BPF_PROG_TEST_RUN: - err = bpf_prog_test_run(&attr, uattr); + err = bpf_prog_test_run(&attr, uattr.user); break; case BPF_PROG_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &prog_idr, &prog_idr_lock); break; case BPF_MAP_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &map_idr, &map_idr_lock); break; case BPF_BTF_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &btf_idr, &btf_idr_lock); break; case BPF_PROG_GET_FD_BY_ID: @@ -4447,7 +4465,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_map_get_fd_by_id(&attr); break; case BPF_OBJ_GET_INFO_BY_FD: - err = bpf_obj_get_info_by_fd(&attr, uattr); + err = bpf_obj_get_info_by_fd(&attr, uattr.user); break; case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); @@ -4459,26 +4477,26 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_btf_get_fd_by_id(&attr); break; case BPF_TASK_FD_QUERY: - err = bpf_task_fd_query(&attr, uattr); + err = bpf_task_fd_query(&attr, uattr.user); break; case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; case BPF_MAP_LOOKUP_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); break; case BPF_MAP_LOOKUP_AND_DELETE_BATCH: - err = bpf_map_do_batch(&attr, uattr, + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_AND_DELETE_BATCH); break; case BPF_MAP_UPDATE_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); break; case BPF_MAP_DELETE_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); break; case BPF_LINK_CREATE: - err = link_create(&attr); + err = link_create(&attr, uattr); break; case BPF_LINK_UPDATE: err = link_update(&attr); @@ -4487,7 +4505,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_link_get_fd_by_id(&attr); break; case BPF_LINK_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &link_idr, &link_idr_lock); break; case BPF_ENABLE_STATS: @@ -4510,6 +4528,11 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz return err; } +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + return __sys_bpf(cmd, USER_BPFPTR(uattr), size); +} + static bool syscall_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -4524,7 +4547,19 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) { - return -EINVAL; + switch (cmd) { + case BPF_MAP_CREATE: + case BPF_MAP_UPDATE_ELEM: + case BPF_MAP_FREEZE: + case BPF_PROG_LOAD: + break; + /* case BPF_PROG_TEST_RUN: + * is not part of this list to prevent recursive test_run + */ + default: + return -EINVAL; + } + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } const struct bpf_func_proto bpf_sys_bpf_proto = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 37407d8fbca4..e63c7d60e00d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9436,7 +9436,7 @@ static int check_abnormal_return(struct bpf_verifier_env *env) static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; @@ -9445,7 +9445,7 @@ static int check_btf_func(struct bpf_verifier_env *env, struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; - void __user *urecord; + bpfptr_t urecord; u32 prev_offset = 0; bool scalar_return; int ret = -ENOMEM; @@ -9473,7 +9473,7 @@ static int check_btf_func(struct bpf_verifier_env *env, prog = env->prog; btf = prog->aux->btf; - urecord = u64_to_user_ptr(attr->func_info); + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); @@ -9491,13 +9491,15 @@ static int check_btf_func(struct bpf_verifier_env *env, /* set the size kernel expects so loader can zero * out the rest of the record. */ - if (put_user(min_size, &uattr->func_info_rec_size)) + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, func_info_rec_size), + &min_size, sizeof(min_size))) ret = -EFAULT; } goto err_free; } - if (copy_from_user(&krecord[i], urecord, min_size)) { + if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { ret = -EFAULT; goto err_free; } @@ -9549,7 +9551,7 @@ static int check_btf_func(struct bpf_verifier_env *env, } prev_offset = krecord[i].insn_off; - urecord += urec_size; + bpfptr_add(&urecord, urec_size); } prog->aux->func_info = krecord; @@ -9581,14 +9583,14 @@ static void adjust_btf_func(struct bpf_verifier_env *env) static int check_btf_line(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; struct bpf_subprog_info *sub; struct bpf_line_info *linfo; struct bpf_prog *prog; const struct btf *btf; - void __user *ulinfo; + bpfptr_t ulinfo; int err; nr_linfo = attr->line_info_cnt; @@ -9614,7 +9616,7 @@ static int check_btf_line(struct bpf_verifier_env *env, s = 0; sub = env->subprog_info; - ulinfo = u64_to_user_ptr(attr->line_info); + ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); expected_size = sizeof(struct bpf_line_info); ncopy = min_t(u32, expected_size, rec_size); for (i = 0; i < nr_linfo; i++) { @@ -9622,14 +9624,15 @@ static int check_btf_line(struct bpf_verifier_env *env, if (err) { if (err == -E2BIG) { verbose(env, "nonzero tailing record in line_info"); - if (put_user(expected_size, - &uattr->line_info_rec_size)) + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, line_info_rec_size), + &expected_size, sizeof(expected_size))) err = -EFAULT; } goto err_free; } - if (copy_from_user(&linfo[i], ulinfo, ncopy)) { + if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { err = -EFAULT; goto err_free; } @@ -9681,7 +9684,7 @@ static int check_btf_line(struct bpf_verifier_env *env, } prev_offset = linfo[i].insn_off; - ulinfo += rec_size; + bpfptr_add(&ulinfo, rec_size); } if (s != env->subprog_cnt) { @@ -9703,7 +9706,7 @@ err_free: static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { struct btf *btf; int err; @@ -13275,8 +13278,7 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, - union bpf_attr __user *uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a6972d7ddf80..aa47af349ba8 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -409,7 +409,7 @@ static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) return ERR_PTR(-ENOMEM); if (data_in) { - err = bpf_check_uarg_tail_zero(data_in, max_size, size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); -- cgit v1.2.3-58-ga151 From f7e0318a314f9271b0f0cdd4bfdc691976976d8c Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 19 May 2021 15:34:38 +0800 Subject: netlabel: remove unused parameter in netlbl_netlink_auditinfo() loginuid/sessionid/secid have been read from 'current' instead of struct netlink_skb_parms, the parameter 'skb' seems no longer needed. Fixes: c53fa1ed92cd ("netlink: kill loginuid/sessionid/sid members from struct netlink_skb_parms") Signed-off-by: Zheng Yejian Signed-off-by: David S. Miller --- net/netlabel/netlabel_calipso.c | 4 ++-- net/netlabel/netlabel_cipso_v4.c | 4 ++-- net/netlabel/netlabel_mgmt.c | 8 ++++---- net/netlabel/netlabel_unlabeled.c | 10 +++++----- net/netlabel/netlabel_user.h | 4 +--- 5 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index f28c8947c730..91a19c3ea1a3 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -105,7 +105,7 @@ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); @@ -287,7 +287,7 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 4f50a64315cf..baf235721c43 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -410,7 +410,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); @@ -709,7 +709,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index ca52f5085989..e664ab990941 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -434,7 +434,7 @@ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -457,7 +457,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_MGMT_A_DOMAIN]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); @@ -557,7 +557,7 @@ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -576,7 +576,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 3e6ac9b790b1..2483df0bbd7c 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -814,7 +814,7 @@ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } @@ -897,7 +897,7 @@ static int netlbl_unlabel_staticadd(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -947,7 +947,7 @@ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -994,7 +994,7 @@ static int netlbl_unlabel_staticremove(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -1034,7 +1034,7 @@ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index b9ba8112b3c5..6190cbf94bf0 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -28,11 +28,9 @@ /** * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg - * @skb: the packet * @audit_info: NetLabel audit information */ -static inline void netlbl_netlink_auditinfo(struct sk_buff *skb, - struct netlbl_audit *audit_info) +static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { security_task_getsecid_subj(current, &audit_info->secid); audit_info->loginuid = audit_get_loginuid(current); -- cgit v1.2.3-58-ga151 From eb0e4d59b6edbe678ecfc5d5b77608b634057f08 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 May 2021 15:08:18 +0300 Subject: net: Add notifications when multipath hash field change In-kernel notifications are already sent when the multipath hash policy itself changes, but not when the multipath hash fields change. Add these notifications, so that interested listeners (e.g., switch ASIC drivers) could perform the necessary configuration. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++++++- net/ipv6/sysctl_net_ipv6.c | 18 +++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ffb38ea06841..4fa77f182dcb 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -465,6 +465,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, return ret; } + +static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + + return ret; +} #endif static struct ctl_table ipv4_table[] = { @@ -1061,7 +1077,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_fib_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 160bea5db973..d7cf26f730d7 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -44,6 +44,22 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, return ret; } +static int +proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -160,7 +176,7 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_douintvec_minmax, + .proc_handler = proc_rt6_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &rt6_multipath_hash_fields_all_mask, }, -- cgit v1.2.3-58-ga151 From 05ff8435e50569a0a6b95e5ceaea43696e8827ab Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 19 May 2021 15:17:21 +0200 Subject: net/sched: cls_api: increase max_reclassify_loop modern userspace applications, like OVN, can configure the TC datapath to "recirculate" packets several times. If more than 4 "recirculation" rules are configured, packets can be dropped by __tcf_classify(). Changing the maximum number of reclassifications (from 4 to 16) should be sufficient to prevent drops in most use cases, and guard against loops at the same time. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 40fbea626dfd..75e3a288a7c8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1531,7 +1531,7 @@ static inline int __tcf_classify(struct sk_buff *skb, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT - const int max_reclassify_loop = 4; + const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; int limit = 0; -- cgit v1.2.3-58-ga151 From a49e72b3bda73d36664a084e47da9727a31b8095 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 19 May 2021 15:58:52 +0000 Subject: net: qrtr: ns: Fix error return code in qrtr_ns_init() Fix to return a negative error code -ENOMEM from the error handling case instead of 0, as done elsewhere in this function. Fixes: c6e08d6251f3 ("net: qrtr: Allocate workqueue before kernel_bind") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Reviewed-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/ns.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 8d00dfe8139e..1990d496fcfc 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -775,8 +775,10 @@ int qrtr_ns_init(void) } qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); - if (!qrtr_ns.workqueue) + if (!qrtr_ns.workqueue) { + ret = -ENOMEM; goto err_sock; + } qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; -- cgit v1.2.3-58-ga151 From 48afdaea04eb691df3244b6a361f1a0c4540ff45 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 20 May 2021 21:36:45 +0800 Subject: net: atm: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO helper instead of plain DEVICE_ATTR, which makes the code a bit shorter and easier to read. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/atm/atm_sysfs.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index aa1b57161f3b..0fdbdfd19474 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -11,7 +11,7 @@ #define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev) -static ssize_t show_type(struct device *cdev, +static ssize_t type_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -19,7 +19,7 @@ static ssize_t show_type(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type); } -static ssize_t show_address(struct device *cdev, +static ssize_t address_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -27,7 +27,7 @@ static ssize_t show_address(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi); } -static ssize_t show_atmaddress(struct device *cdev, +static ssize_t atmaddress_show(struct device *cdev, struct device_attribute *attr, char *buf) { unsigned long flags; @@ -50,7 +50,7 @@ static ssize_t show_atmaddress(struct device *cdev, return count; } -static ssize_t show_atmindex(struct device *cdev, +static ssize_t atmindex_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -58,7 +58,7 @@ static ssize_t show_atmindex(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number); } -static ssize_t show_carrier(struct device *cdev, +static ssize_t carrier_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -67,7 +67,7 @@ static ssize_t show_carrier(struct device *cdev, adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); } -static ssize_t show_link_rate(struct device *cdev, +static ssize_t link_rate_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -90,12 +90,12 @@ static ssize_t show_link_rate(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } -static DEVICE_ATTR(address, 0444, show_address, NULL); -static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL); -static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL); -static DEVICE_ATTR(carrier, 0444, show_carrier, NULL); -static DEVICE_ATTR(type, 0444, show_type, NULL); -static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL); +static DEVICE_ATTR_RO(address); +static DEVICE_ATTR_RO(atmaddress); +static DEVICE_ATTR_RO(atmindex); +static DEVICE_ATTR_RO(carrier); +static DEVICE_ATTR_RO(type); +static DEVICE_ATTR_RO(link_rate); static struct device_attribute *atm_attrs[] = { &dev_attr_atmaddress, -- cgit v1.2.3-58-ga151 From a720a2a0ad6cb6f769b6c7cbc3c54287a7d54ff8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 21 May 2021 10:33:01 +0200 Subject: xsk: Use kvcalloc to support large umems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use kvcalloc() instead of kcalloc() to support large umems with, on my server, one million pages or more in the umem. Reported-by: Dan Siemon Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210521083301.26921-1-magnus.karlsson@gmail.com --- net/xdp/xdp_umem.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 56a28a686988..f01ef6bda390 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -27,7 +27,7 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; } @@ -99,8 +99,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), - GFP_KERNEL | __GFP_NOWARN); + umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; @@ -123,7 +122,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) out_pin: xdp_umem_unpin_pages(umem); out_pgs: - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; return err; } -- cgit v1.2.3-58-ga151 From ccc882f0d838cb45a1a78ea4e48c219887f920dc Mon Sep 17 00:00:00 2001 From: Nigel Christian Date: Mon, 24 May 2021 23:05:58 -0500 Subject: net: bridge: remove redundant assignment The variable br is assigned a value that is not being read after exiting case IFLA_STATS_LINK_XSTATS_SLAVE. The assignment is redundant and can be removed. Addresses-Coverity ("Unused value") Signed-off-by: Nigel Christian Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e4e6e991313e..8642e56059fb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1644,7 +1644,6 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) p = br_port_get_rtnl(dev); if (!p) return 0; - br = p->br; vg = nbp_vlan_group(p); break; default: -- cgit v1.2.3-58-ga151 From a925316af80ae93186f00d80163f5a3c7f5b4782 Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Fri, 14 May 2021 15:55:13 +0800 Subject: net: Remove unnecessary variables It is not necessary to define variables to receive -ENOMEM, directly return -ENOMEM. Signed-off-by: zuoqilin Signed-off-by: Steffen Klassert --- net/key/af_key.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index ef9b4ac03e7b..de24a7d474df 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -141,7 +141,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; - int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -150,10 +149,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; - err = -ENOMEM; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) - goto out; + return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); @@ -169,8 +167,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, pfkey_insert(sk); return 0; -out: - return err; } static int pfkey_release(struct socket *sock) -- cgit v1.2.3-58-ga151 From e624d4ed4aa8cc3c69d1359b0aaea539203ed266 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 19 May 2021 17:07:45 +0800 Subject: xdp: Extend xdp_redirect_map with broadcast support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to extend xdp_redirect_map for broadcast support. With BPF_F_BROADCAST the packet will be broadcasted to all the interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress interface will be excluded when do broadcasting. When getting the devices in dev hash map via dev_map_hash_get_next_key(), there is a possibility that we fall back to the first key when a device was removed. This will duplicate packets on some interfaces. So just walk the whole buckets to avoid this issue. For dev array map, we also walk the whole map to find valid interfaces. Function bpf_clear_redirect_map() was removed in commit ee75aef23afe ("bpf, xdp: Restructure redirect actions"). Add it back as we need to use ri->map again. With test topology: +-------------------+ +-------------------+ | Host A (i40e 10G) | ---------- | eno1(i40e 10G) | +-------------------+ | | | Host B | +-------------------+ | | | Host C (i40e 10G) | ---------- | eno2(i40e 10G) | +-------------------+ | | | +------+ | | veth0 -- | Peer | | | veth1 -- | | | | veth2 -- | NS | | | +------+ | +-------------------+ On Host A: # pktgen/pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -s 64 On Host B(Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz, 128G Memory): Use xdp_redirect_map and xdp_redirect_map_multi in samples/bpf for testing. All the veth peers in the NS have a XDP_DROP program loaded. The forward_map max_entries in xdp_redirect_map_multi is modify to 4. Testing the performance impact on the regular xdp_redirect path with and without patch (to check impact of additional check for broadcast mode): 5.12 rc4 | redirect_map i40e->i40e | 2.0M | 9.7M 5.12 rc4 | redirect_map i40e->veth | 1.7M | 11.8M 5.12 rc4 + patch | redirect_map i40e->i40e | 2.0M | 9.6M 5.12 rc4 + patch | redirect_map i40e->veth | 1.7M | 11.7M Testing the performance when cloning packets with the redirect_map_multi test, using a redirect map size of 4, filled with 1-3 devices: 5.12 rc4 + patch | redirect_map multi i40e->veth (x1) | 1.7M | 11.4M 5.12 rc4 + patch | redirect_map multi i40e->veth (x2) | 1.1M | 4.3M 5.12 rc4 + patch | redirect_map multi i40e->veth (x3) | 0.8M | 2.6M Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Acked-by: Martin KaFai Lau Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210519090747.1655268-3-liuhangbin@gmail.com --- include/linux/bpf.h | 20 +++++ include/linux/filter.h | 19 ++++- include/net/xdp.h | 1 + include/trace/events/xdp.h | 6 +- include/uapi/linux/bpf.h | 14 +++- kernel/bpf/cpumap.c | 3 +- kernel/bpf/devmap.c | 183 ++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 37 ++++++++- net/core/xdp.c | 28 +++++++ net/xdp/xskmap.c | 3 +- tools/include/uapi/linux/bpf.h | 14 +++- 11 files changed, 313 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e9a0ff3217b..86dec5001ae2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress); bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); @@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return 0; } +static inline +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) +{ + return 0; +} + struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, @@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, return 0; } +static inline +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress) +{ + return 0; +} + static inline void __cpu_map_flush(void) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index 9a09547bc7ba..c5ad7df029ed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,6 +646,7 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; + struct bpf_map *map; u32 map_id; enum bpf_map_type map_type; u32 kern_flags; @@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, + u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) + if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { + if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes @@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; - return flags; + return flags & action_mask; } ri->tgt_index = ifindex; ri->map_id = map->id; ri->map_type = map->map_type; + if (flags & BPF_F_BROADCAST) { + WRITE_ONCE(ri->map, map); + ri->flags = flags; + } else { + WRITE_ONCE(ri->map, NULL); + ri->flags = 0; + } + return XDP_REDIRECT; } diff --git a/include/net/xdp.h b/include/net/xdp.h index a5bc214a49d9..5533f0ab2afc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index fcad3645a70b..c40fc97f9417 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; + /* Just leave to_ifindex to 0 if do broadcast redirect, + * as tgt will be NULL. + */ + if (tgt) + ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 562adeac1d67..2c1ba70abbf1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2555,8 +2555,12 @@ union bpf_attr { * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be * one of the XDP program return codes up to **XDP_TX**, as chosen - * by the caller. Any higher bits in the *flags* argument must be - * unset. + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. * * See also **bpf_redirect**\ (), which only supports redirecting * to an ifindex, but doesn't require a map to do so. @@ -5122,6 +5126,12 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5dd3e866599a..a1a0c4e791c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + __cpu_map_lookup_elem); } static int cpu_map_btf_id; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642264e32abd..f9148daab0e3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -515,6 +516,99 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp, + int exclude_ifindex) +{ + if (!obj || obj->dev->ifindex == exclude_ifindex || + !obj->dev->netdev_ops->ndo_xdp_xmit) + return false; + + if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + return false; + + return true; +} + +static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, + struct net_device *dev_rx, + struct xdp_frame *xdpf) +{ + struct xdp_frame *nxdpf; + + nxdpf = xdpf_clone(xdpf); + if (!nxdpf) + return -ENOMEM; + + bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); + + return 0; +} + +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0; + struct bpf_dtab_netdev *dst, *last_dst = NULL; + struct hlist_head *head; + struct xdp_frame *xdpf; + unsigned int i; + int err; + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = READ_ONCE(dtab->netdev_map[i]); + if (!is_valid_dst(dst, xdp, exclude_ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_rcu(dst, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) { + if (!is_valid_dst(dst, xdp, exclude_ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } + } + + /* consume the last copy of the frame */ + if (last_dst) + bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); + else + xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ + + return 0; +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -529,6 +623,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, return 0; } +static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct sk_buff *nskb; + int err; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + err = dev_map_generic_redirect(dst, nskb, xdp_prog); + if (unlikely(err)) { + consume_skb(nskb); + return err; + } + + return 0; +} + +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int exclude_ifindex = exclude_ingress ? dev->ifindex : 0; + struct bpf_dtab_netdev *dst, *last_dst = NULL; + struct hlist_head *head; + struct hlist_node *next; + unsigned int i; + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = READ_ONCE(dtab->netdev_map[i]); + if (!dst || dst->dev->ifindex == exclude_ifindex) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_safe(dst, next, head, index_hlist) { + if (!dst || dst->dev->ifindex == exclude_ifindex) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + } + } + } + + /* consume the first skb and return */ + if (last_dst) + return dev_map_generic_redirect(last_dst, skb, xdp_prog); + + /* dtab is empty */ + consume_skb(skb); + return 0; +} + static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); @@ -755,12 +930,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_lookup_elem); } static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_hash_lookup_elem); } static int dev_map_btf_id; diff --git a/net/core/filter.c b/net/core/filter.c index 582ac196fd94..caa88955562e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3930,6 +3930,23 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); +void bpf_clear_redirect_map(struct bpf_map *map) +{ + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -3937,6 +3954,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ @@ -3946,7 +3964,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_enqueue(fwd, xdp, dev); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_enqueue_multi(xdp, dev, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_enqueue(fwd, xdp, dev); + } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdp, dev); @@ -3988,13 +4013,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_generic_redirect(fwd, skb, xdp_prog); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_generic_redirect(fwd, skb, xdp_prog); + } if (unlikely(err)) goto err; break; diff --git a/net/core/xdp.c b/net/core/xdp.c index 858276e72c68..725d20f1b100 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + nxdpf->mem.id = 0; + + return nxdpf; +} diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 67b4ce504852..9df75ea4a567 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 562adeac1d67..2c1ba70abbf1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2555,8 +2555,12 @@ union bpf_attr { * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be * one of the XDP program return codes up to **XDP_TX**, as chosen - * by the caller. Any higher bits in the *flags* argument must be - * unset. + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. * * See also **bpf_redirect**\ (), which only supports redirecting * to an ifindex, but doesn't require a map to do so. @@ -5122,6 +5126,12 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ -- cgit v1.2.3-58-ga151 From 24a774a4f9750ecd37d7aaeacfc04a844b9cf20b Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Fri, 14 May 2021 18:08:06 +0800 Subject: can: proc: remove unnecessary variables There is no need to define the variable "rate" to receive, just return directly. Link: https://lore.kernel.org/r/20210514100806.792-1-zuoqilin1@163.com Signed-off-by: zuoqilin Signed-off-by: Marc Kleine-Budde --- net/can/proc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/can/proc.c b/net/can/proc.c index d1fe49e6f16d..b3099f0a3cb8 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -99,8 +99,6 @@ static void can_init_stats(struct net *net) static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, unsigned long count) { - unsigned long rate; - if (oldjif == newjif) return 0; @@ -111,9 +109,7 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, return 99999999; } - rate = (count * HZ) / (newjif - oldjif); - - return rate; + return (count * HZ) / (newjif - oldjif); } void can_stat_update(struct timer_list *t) -- cgit v1.2.3-58-ga151 From 46d8657a6b284e32b6b3bf1a6c93ee507fdd3cdb Mon Sep 17 00:00:00 2001 From: Patrick Menschel Date: Tue, 27 Apr 2021 05:21:47 +0000 Subject: can: isotp: change error format from decimal to symbolic error names This patch changes the format string for errors from decimal %d to symbolic error names %pe to achieve more comprehensive log messages. Link: https://lore.kernel.org/r/20210427052150.2308-2-menschel.p@posteo.de Signed-off-by: Patrick Menschel Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 9f94ad3caee9..2c4f84fac70a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -221,8 +221,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) can_send_ret = can_send(nskb, 1); if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); dev_put(dev); @@ -798,8 +798,8 @@ isotp_tx_burst: can_send_ret = can_send(skb, 1); if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); if (so->tx.idx >= so->tx.len) { /* we are done */ @@ -946,8 +946,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = can_send(skb, 1); dev_put(dev); if (err) { - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, err); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(err)); return err; } -- cgit v1.2.3-58-ga151 From 6a5ddae578842652719fb926b22f1d510fe50bee Mon Sep 17 00:00:00 2001 From: Patrick Menschel Date: Tue, 27 Apr 2021 05:21:48 +0000 Subject: can: isotp: add symbolic error message to isotp_module_init() This patch adds the value of err with format %pe to the already existing error message. Link: https://lore.kernel.org/r/20210427052150.2308-3-menschel.p@posteo.de Signed-off-by: Patrick Menschel Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 2c4f84fac70a..2075d8d9e6b6 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1433,7 +1433,7 @@ static __init int isotp_module_init(void) err = can_proto_register(&isotp_can_proto); if (err < 0) - pr_err("can: registration of isotp protocol failed\n"); + pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); return err; } -- cgit v1.2.3-58-ga151 From c69d190f7bb9a03cf5237d45a457993730d01605 Mon Sep 17 00:00:00 2001 From: Patrick Menschel Date: Tue, 27 Apr 2021 05:21:49 +0000 Subject: can: isotp: Add error message if txqueuelen is too small This patch adds an additional error message in case that txqueuelen is set too small and advices the user to increase txqueuelen. This is likely to happen even with small transfers if txqueuelen is at default value 10 frames. Link: https://lore.kernel.org/r/20210427052150.2308-4-menschel.p@posteo.de Signed-off-by: Patrick Menschel Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 2075d8d9e6b6..5ff11aaf0a79 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -797,10 +797,12 @@ isotp_tx_burst: can_skb_set_owner(skb, sk); can_send_ret = can_send(skb, 1); - if (can_send_ret) + if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); - + if (can_send_ret == -ENOBUFS) + pr_notice_once("can-isotp: tx queue is full, increasing txqueuelen may prevent this error\n"); + } if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; -- cgit v1.2.3-58-ga151 From f285f37cb1e6b29e7dc732c81510aa115463730f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 May 2021 12:48:19 +0200 Subject: devlink: append split port number to the port name Instead of doing sprintf twice in case the port is split or not, append the split port suffix in case the port is split. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20210527104819.789840-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 4eb969518ee0..69681f19388e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8632,12 +8632,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_VIRTUAL: - if (!attrs->split) - n = snprintf(name, len, "p%u", attrs->phys.port_number); - else - n = snprintf(name, len, "p%us%u", - attrs->phys.port_number, - attrs->phys.split_subport_number); + n = snprintf(name, len, "p%u", attrs->phys.port_number); + if (n < len && attrs->split) + n += snprintf(name + n, len - n, "s%u", + attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: -- cgit v1.2.3-58-ga151 From 133dc203d77dff617d9c4673973ef3859be2c476 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 4 May 2021 17:54:06 +0200 Subject: netfilter: nft_exthdr: Support SCTP chunks Chunks are SCTP header extensions similar in implementation to IPv6 extension headers or TCP options. Reusing exthdr expression to find and extract field values from them is therefore pretty straightforward. For now, this supports extracting data from chunks at a fixed offset (and length) only - chunks themselves are an extensible data structure; in order to make all fields available, a nested extension search is needed. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_exthdr.c | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 1fb4ca18ffbb..19715e2679d1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -813,11 +813,13 @@ enum nft_exthdr_flags { * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers * @NFT_EXTHDR_OP_TCP: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options + * @NFT_EXTHDR_OP_SCTP: match against sctp chunks */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, NFT_EXTHDR_OP_IPV4, + NFT_EXTHDR_OP_SCTP, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index f64f0017e9a5..4d0b8e1c40c0 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #include struct nft_exthdr { @@ -300,6 +302,43 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, } } +static void nft_exthdr_sctp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + unsigned int offset = pkt->xt.thoff + sizeof(struct sctphdr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + + do { + sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); + if (!sch || !sch->length) + break; + + if (sch->type == priv->type) { + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + nft_reg_store8(dest, true); + return; + } + if (priv->offset + priv->len > ntohs(sch->length) || + offset + ntohs(sch->length) > pkt->skb->len) + break; + + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, (char *)sch + priv->offset, priv->len); + return; + } + offset += SCTP_PAD4(ntohs(sch->length)); + } while (offset < pkt->skb->len); + + if (priv->flags & NFT_EXTHDR_F_PRESENT) + nft_reg_store8(dest, false); + else + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -499,6 +538,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .dump = nft_exthdr_dump_set, }; +static const struct nft_expr_ops nft_exthdr_sctp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_sctp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -529,6 +576,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_ipv4_ops; } break; + case NFT_EXTHDR_OP_SCTP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_sctp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3-58-ga151 From a58db7ad80e89dab014bd2d769c233eeca1bf519 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 10 May 2021 07:58:52 +0200 Subject: netfilter: nft_set_pipapo_avx2: Skip LDMXCSR, we don't need a valid MXCSR state We don't need a valid MXCSR state for the lookup routines, none of the instructions we use rely on or affect any bit in the MXCSR register. Instead of calling kernel_fpu_begin(), we can pass 0 as mask to kernel_fpu_begin_mask() and spare one LDMXCSR instruction. Commit 49200d17d27d ("x86/fpu/64: Don't FNINIT in kernel_fpu_begin()") already speeds up lookups considerably, and by dropping the MCXSR initialisation we can now get a much smaller, but measurable, increase in matching rates. The table below reports matching rates and a wild approximation of clock cycles needed for a match in a "port,net" test with 10 entries from selftests/netfilter/nft_concat_range.sh, limited to the first field, i.e. the port (with nft_set_rbtree initialisation skipped), run on a single AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$). The (very rough) estimation of clock cycles is obtained by simply dividing frequency by matching rate. The "cycles spared" column refers to the difference in cycles compared to the previous row, and the rate increase also refers to the previous row. Results are averages of six runs. Merely for context, I'm also reporting packet rates obtained by skipping kernel_fpu_begin() and kernel_fpu_end() altogether (which shows a very limited impact now), as well as skipping the whole lookup function, compared to simply counting and dropping all packets using the netdev hook drop (see nft_concat_range.sh for details). This workload also includes packet generation with pktgen and the receive path of veth. |matching| est. | cycles | rate | | rate | cycles | spared |increase| | (Mpps) | | | | --------------------------------------|--------|--------|--------|--------| FNINIT, LDMXCSR (before 49200d17d27d) | 5.245 | 553 | - | - | LDMXCSR only (with 49200d17d27d) | 6.347 | 457 | 96 | 21.0% | Without LDMXCSR (this patch) | 6.461 | 449 | 8 | 1.8% | -------- for reference only: ---------|--------|--------|--------|--------| Without kernel_fpu_begin() | 6.513 | 445 | 4 | 0.8% | Without actual matching (return true) | 7.649 | 379 | 66 | 17.4% | Without lookup operation (netdev drop)| 10.320 | 281 | 98 | 34.9% | The clock cycles spared by avoiding LDMXCSR appear to be in line with CPI and latency indicated in the manuals of comparable architectures: Intel Skylake (CPI: 1, latency: 7) and AMD 12h (latency: 12) -- I couldn't find this information for AMD 17h. Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index eabdb8d552ee..1c2620923a61 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1136,8 +1136,13 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - /* This also protects access to all data related to scratch maps */ - kernel_fpu_begin(); + /* This also protects access to all data related to scratch maps. + * + * Note that we don't need a valid MXCSR state for any of the + * operations we use here, so pass 0 as mask and spare a LDMXCSR + * instruction. + */ + kernel_fpu_begin_mask(0); scratch = *raw_cpu_ptr(m->scratch_aligned); if (unlikely(!scratch)) { -- cgit v1.2.3-58-ga151 From 0974cff3eb66764cc86b60f1071958acc432a4e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 May 2021 22:29:55 +0200 Subject: netfilter: add and use nft_set_do_lookup helper Followup patch will add a CONFIG_RETPOLINE wrapper to avoid the ops->lookup() indirection cost for retpoline builds. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 7 +++++++ net/netfilter/nft_lookup.c | 4 ++-- net/netfilter/nft_objref.c | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index fd10a7862fdc..5eb699454490 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -88,6 +88,13 @@ extern const struct nft_set_type nft_set_bitmap_type; extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; +static inline bool +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + return set->ops->lookup(net, set, key, ext); +} + struct nft_expr; struct nft_regs; struct nft_pktinfo; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a479f8a1270c..1a8581879af5 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -33,8 +33,8 @@ void nft_lookup_eval(const struct nft_expr *expr, const struct net *net = nft_net(pkt); bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext) ^ - priv->invert; + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext) ^ + priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7e47edee88ee..94b2327e71dc 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) @@ -110,7 +110,7 @@ static void nft_objref_map_eval(const struct nft_expr *expr, struct nft_object *obj; bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext); + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { -- cgit v1.2.3-58-ga151 From 2f1af441fd5dd5caf0807bb19ce9bbf9325ce534 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 27 May 2021 16:54:24 -0700 Subject: mptcp: fix pr_debug in mptcp_token_new_connect After commit 2c5ebd001d4f ("mptcp: refactor token container"), pr_debug() is called before mptcp_crypto_key_gen_sha() in mptcp_token_new_connect(), so the output local_key, token and idsn are 0, like: MPTCP: ssk=00000000f6b3c4a2, local_key=0, token=0, idsn=0 Move pr_debug() after mptcp_crypto_key_gen_sha(). Fixes: 2c5ebd001d4f ("mptcp: refactor token container") Acked-by: Paolo Abeni Signed-off-by: Jianguo Wu Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/token.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 8f0270a780ce..72a24e63b131 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -156,9 +156,6 @@ int mptcp_token_new_connect(struct sock *sk) int retries = TOKEN_MAX_RETRIES; struct token_bucket *bucket; - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); - again: mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, &subflow->idsn); @@ -172,6 +169,9 @@ again: goto again; } + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; -- cgit v1.2.3-58-ga151 From c68a0cd1735fe09fa7c1a7de1f11a5b674f1c549 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 27 May 2021 16:54:25 -0700 Subject: mptcp: using TOKEN_MAX_RETRIES instead of magic number We have macro TOKEN_MAX_RETRIES for the number of token generate retries, so using TOKEN_MAX_RETRIES in subflow_check_req(). And rename TOKEN_MAX_RETRIES to MPTCP_TOKEN_MAX_RETRIES as it is now exposed. Fixes: 535fb8152f31 ("mptcp: token: move retry to caller") Reviewed-by: Matthieu Baerts Signed-off-by: Jianguo Wu Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 2 +- net/mptcp/token.c | 3 +-- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0c6f99c67345..89f6b73783d5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -627,6 +627,8 @@ static inline void mptcp_write_space(struct sock *sk) void mptcp_destroy_common(struct mptcp_sock *msk); +#define MPTCP_TOKEN_MAX_RETRIES 4 + void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bde6be77ea73..a50a97908866 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -162,7 +162,7 @@ static int subflow_check_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err, retries = 4; + int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 72a24e63b131..a98e554b034f 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -33,7 +33,6 @@ #include #include "protocol.h" -#define TOKEN_MAX_RETRIES 4 #define TOKEN_MAX_CHAIN_LEN 4 struct token_bucket { @@ -153,7 +152,7 @@ int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - int retries = TOKEN_MAX_RETRIES; + int retries = MPTCP_TOKEN_MAX_RETRIES; struct token_bucket *bucket; again: -- cgit v1.2.3-58-ga151 From 0a4d8e96e4fd687af92b961d5cdcea0fdbde05fe Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 27 May 2021 16:54:26 -0700 Subject: mptcp: generate subflow hmac after mptcp_finish_join() For outgoing subflow join, when recv SYNACK, in subflow_finish_connect(), the mptcp_finish_join() may return false in some cases, and send a RESET to remote, and no local hmac is required. So generate subflow hmac after mptcp_finish_join(). Fixes: ec3edaa7ca6c ("mptcp: Add handling of outgoing MP_JOIN requests") Signed-off-by: Jianguo Wu Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a50a97908866..2a58503e55bd 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -430,15 +430,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto do_reset; } + if (!mptcp_finish_join(sk)) + goto do_reset; + subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (!mptcp_finish_join(sk)) - goto do_reset; - subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); -- cgit v1.2.3-58-ga151 From ae514983f2e416cb1476bdd8e23b6d9be4ce94cd Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 27 May 2021 16:54:27 -0700 Subject: mptcp: remove redundant initialization in pm_nl_init_net() Memory of struct pm_nl_pernet{} is allocated by kzalloc() in setup_net()->ops_init(), so it's no need to reset counters and zero bitmap in pm_nl_init_net(). Acked-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Jianguo Wu Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2469e06a3a9d..7dbc4f308dbe 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1913,10 +1913,13 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - __reset_counters(pernet); pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + return 0; } -- cgit v1.2.3-58-ga151 From eb5fb629f56da3f40f496c807da44a7ce7644779 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 27 May 2021 16:54:28 -0700 Subject: mptcp: make sure flag signal is set when add addr with port When add address with port, it is mean to create a listening socket, and send an ADD_ADDR to remote, so it must have flag signal set, add this check in mptcp_pm_parse_addr(). Fixes: a77e9179c7651 ("mptcp: deal with MPTCP_PM_ADDR_ATTR_PORT in PM netlink") Acked-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Jianguo Wu Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7dbc4f308dbe..09722598994d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -971,8 +971,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); - if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal when using port"); + return -EINVAL; + } entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + } return 0; } -- cgit v1.2.3-58-ga151 From 804c72eeecd2cd38567b64f868cc8c63202cf1a2 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 27 May 2021 16:54:29 -0700 Subject: mptcp: support SYSCTL only if enabled Since the introduction of the sysctl support in MPTCP with commit 784325e9f037 ("mptcp: new sysctl to control the activation per NS"), we don't check CONFIG_SYSCTL. Until now, that was not an issue: the register and unregister functions were replaced by NO-OP one if SYSCTL was not enabled in the config. The only thing we could have avoid is not to reserve memory for the table but that's for the moment only a small table per net-ns. But the following commit is going to use SYSCTL_ZERO and SYSCTL_ONE which are not be defined if SYSCTL is not enabled in the config. This causes 'undefined reference' errors from the linker. Reported-by: kernel test robot Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 96ba616f59bf..a3b15ed60b77 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -4,7 +4,9 @@ * Copyright (c) 2019, Tessares SA. */ +#ifdef CONFIG_SYSCTL #include +#endif #include #include @@ -15,7 +17,9 @@ static int mptcp_pernet_id; struct mptcp_pernet { +#ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; +#endif int mptcp_enabled; unsigned int add_addr_timeout; @@ -36,6 +40,13 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net) return mptcp_get_pernet(net)->add_addr_timeout; } +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; +} + +#ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -55,12 +66,6 @@ static struct ctl_table mptcp_sysctl_table[] = { {} }; -static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) -{ - pernet->mptcp_enabled = 1; - pernet->add_addr_timeout = TCP_RTO_MAX; -} - static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; @@ -100,6 +105,17 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) kfree(table); } +#else + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + return 0; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} + +#endif /* CONFIG_SYSCTL */ + static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); -- cgit v1.2.3-58-ga151 From 744ee14054c8ca5ad0fe3ab9172709c17d8a240a Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 27 May 2021 16:54:30 -0700 Subject: mptcp: restrict values of 'enabled' sysctl To avoid confusions, it seems better to parse this sysctl parameter as a boolean. We use it as a boolean, no need to parse an integer and bring confusions if we see a value different from 0 and 1, especially with this parameter name: enabled. It seems fine to do this modification because the default value is 1 (enabled). Then the only other interesting value to set is 0 (disabled). All other values would not have changed the default behaviour. Suggested-by: Florian Westphal Acked-by: Florian Westphal Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 8 ++++---- net/mptcp/ctrl.c | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 6af0196c4297..3b352e5f6300 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -7,13 +7,13 @@ MPTCP Sysfs variables /proc/sys/net/mptcp/* Variables =============================== -enabled - INTEGER +enabled - BOOLEAN Control whether MPTCP sockets can be created. - MPTCP sockets can be created if the value is nonzero. This is - a per-namespace sysctl. + MPTCP sockets can be created if the value is 1. This is a + per-namespace sysctl. - Default: 1 + Default: 1 (enabled) add_addr_timeout - INTEGER (seconds) Set the timeout after which an ADD_ADDR control message will be diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index a3b15ed60b77..1ec4d36a39f0 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -21,7 +21,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; #endif - int mptcp_enabled; + u8 mptcp_enabled; unsigned int add_addr_timeout; }; @@ -50,12 +50,14 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", -- cgit v1.2.3-58-ga151 From f227925e53c3ecc168027e0015ab0a953d1bf013 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 May 2021 22:29:56 +0200 Subject: netfilter: nf_tables: prefer direct calls for set lookups Extend nft_set_do_lookup() to use direct calls when retpoline feature is enabled. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 24 ++++++++++++++++++++++++ net/netfilter/nft_lookup.c | 31 +++++++++++++++++++++++++++++++ net/netfilter/nft_set_bitmap.c | 5 +++-- net/netfilter/nft_set_hash.c | 17 ++++++++++------- net/netfilter/nft_set_pipapo.h | 2 -- net/netfilter/nft_set_pipapo_avx2.h | 2 -- net/netfilter/nft_set_rbtree.c | 5 +++-- 7 files changed, 71 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 5eb699454490..46c8d5bb5d8d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -3,6 +3,7 @@ #define _NET_NF_TABLES_CORE_H #include +#include extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; @@ -88,12 +89,35 @@ extern const struct nft_set_type nft_set_bitmap_type; extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; +#ifdef CONFIG_RETPOLINE +bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +#else static inline bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { return set->ops->lookup(net, set, key, ext); } +#endif + +/* called from nft_pipapo_avx2.c */ +bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +/* called from nft_set_pipapo.c */ +bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); struct nft_expr; struct nft_regs; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 1a8581879af5..90becbf5bff3 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -23,6 +23,37 @@ struct nft_lookup { struct nft_set_binding binding; }; +#ifdef CONFIG_RETPOLINE +bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + if (set->ops == &nft_set_hash_fast_type.ops) + return nft_hash_lookup_fast(net, set, key, ext); + if (set->ops == &nft_set_hash_type.ops) + return nft_hash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_rhash_type.ops) + return nft_rhash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_bitmap_type.ops) + return nft_bitmap_lookup(net, set, key, ext); + + if (set->ops == &nft_set_pipapo_type.ops) + return nft_pipapo_lookup(net, set, key, ext); +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (set->ops == &nft_set_pipapo_avx2_type.ops) + return nft_pipapo_avx2_lookup(net, set, key, ext); +#endif + + if (set->ops == &nft_set_rbtree_type.ops) + return nft_rbtree_lookup(net, set, key, ext); + + WARN_ON_ONCE(1); + return set->ops->lookup(net, set, key, ext); +} +EXPORT_SYMBOL_GPL(nft_set_do_lookup); +#endif + void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 2a81ea421819..e7ae5914971e 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -73,8 +73,9 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) return (bitmap[idx] & (0x3 << off)) & (genmask << off); } -static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7b3d0a78c569..df40314de21f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -74,8 +74,9 @@ static const struct rhashtable_params nft_rhash_params = { .automatic_shrinking = true, }; -static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -446,8 +447,9 @@ struct nft_hash_elem { struct nft_set_ext ext; }; -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -484,9 +486,10 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, return ERR_PTR(-ENOENT); } -static bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index d84afb8fa79a..25a75591583e 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -178,8 +178,6 @@ struct nft_pipapo_elem { int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, union nft_pipapo_map_bucket *mt, bool match_only); -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index 394bcb704db7..dbb6aaca8a7a 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,6 @@ #include #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9e36eb4a7429..d600a566da32 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -107,8 +107,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set return false; } -static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); -- cgit v1.2.3-58-ga151 From 06f029930264ee8013fb76cfb591c6e1ad2f0dd0 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Mon, 17 May 2021 11:58:50 +0200 Subject: netfilter: Remove leading spaces in Kconfig Remove leading spaces before tabs in Kconfig file(s) by running the following command: $ find net/netfilter -name 'Kconfig*' | xargs sed -r -i 's/^[ ]+\t/\t/' Signed-off-by: Juerg Haefliger Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 +- net/netfilter/ipvs/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 56a2531a3402..172d74560632 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -816,7 +816,7 @@ config NETFILTER_XT_TARGET_CLASSIFY the priority of a packet. Some qdiscs can use this value for classification, among these are: - atm, cbq, dsmark, pfifo_fast, htb, prio + atm, cbq, dsmark, pfifo_fast, htb, prio To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index d61886874940..271da8447b29 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -318,7 +318,7 @@ config IP_VS_MH_TAB_INDEX comment 'IPVS application helper' config IP_VS_FTP - tristate "FTP protocol helper" + tristate "FTP protocol helper" depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ NF_CONNTRACK_FTP select IP_VS_NFCT -- cgit v1.2.3-58-ga151 From 07df3fc90a03919b5f1bc3b2fad0046a0aa0e2cb Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 29 Apr 2021 12:26:13 -0400 Subject: netfilter: x_tables: improve limit_mt scalability We've seen this spin_lock show up high in profiles. Let's introduce a lockless version. I've tested this using pktgen_sample01_simple.sh. Signed-off-by: Jason Baron Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_limit.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 24d4afb9988d..8b4fd27857f2 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -8,16 +8,14 @@ #include #include #include -#include #include #include #include struct xt_limit_priv { - spinlock_t lock; unsigned long prev; - uint32_t credit; + u32 credit; }; MODULE_LICENSE("GPL"); @@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv = r->master; - unsigned long now = jiffies; - - spin_lock_bh(&priv->lock); - priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; - if (priv->credit > r->credit_cap) - priv->credit = r->credit_cap; - - if (priv->credit >= r->cost) { - /* We're not limited. */ - priv->credit -= r->cost; - spin_unlock_bh(&priv->lock); - return true; - } - - spin_unlock_bh(&priv->lock); - return false; + unsigned long now; + u32 old_credit, new_credit, credit_increase = 0; + bool ret; + + /* fastpath if there is nothing to update */ + if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies)) + return false; + + do { + now = jiffies; + credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + old_credit = READ_ONCE(priv->credit); + new_credit = old_credit; + new_credit += credit_increase; + if (new_credit > r->credit_cap) + new_credit = r->credit_cap; + if (new_credit >= r->cost) { + ret = true; + new_credit -= r->cost; + } else { + ret = false; + } + } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit); + + return ret; } /* Precision saver. */ @@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par) r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } - spin_lock_init(&priv->lock); return 0; } -- cgit v1.2.3-58-ga151 From 02d85142670b6676abcfd95023c8d28288dc5ad9 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 30 Apr 2021 17:25:10 +0800 Subject: netfilter: xt_CT: Remove redundant assignment to ret Variable 'ret' is set to zero but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed Clean up the following clang-analyzer warning: net/netfilter/xt_CT.c:175:2: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index d4deee39158b..12404d221026 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -172,7 +172,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err2; } - ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, GFP_KERNEL)) { -- cgit v1.2.3-58-ga151 From e0241ae6ac59ffa318255640c047f7c90457fbe5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 May 2021 14:43:08 +0200 Subject: netfilter: use nfnetlink_unicast() Replace netlink_unicast() calls by nfnetlink_unicast() which already deals with translating EAGAIN to ENOBUFS as the nfnetlink core expects. nfnetlink_unicast() calls nlmsg_unicast() which returns zero in case of success, otherwise the netlink core function netlink_rcv_skb() turns err > 0 into an acknowlegment. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 50 ++++++--------------------- net/netfilter/nf_conntrack_netlink.c | 65 ++++++++++-------------------------- net/netfilter/nfnetlink_acct.c | 9 ++--- net/netfilter/nfnetlink_cthelper.c | 10 ++---- net/netfilter/nfnetlink_cttimeout.c | 34 ++++++------------- 5 files changed, 44 insertions(+), 124 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index de2d20c37cda..16ae92054baa 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1685,8 +1685,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1738,8 +1738,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1783,7 +1782,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1794,7 +1793,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1859,7 +1858,6 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1885,12 +1883,7 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1945,12 +1938,7 @@ static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1971,7 +1959,6 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -1990,12 +1977,7 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2014,7 +1996,6 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2038,12 +2019,7 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2065,7 +2041,6 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2091,12 +2066,7 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8690fc07030f..220f51f055ab 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1628,9 +1628,8 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, ct = nf_ct_tuplehash_to_ctrack(h); - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } @@ -1640,21 +1639,12 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static int ctnetlink_done_list(struct netlink_callback *cb) @@ -2590,21 +2580,12 @@ static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -3329,11 +3310,10 @@ static int ctnetlink_get_expect(struct sk_buff *skb, } } - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; } rcu_read_lock(); @@ -3342,21 +3322,12 @@ static int ctnetlink_get_expect(struct sk_buff *skb, exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3c8cf8748cfb..505f46a32173 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -314,14 +314,11 @@ static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 322ac5dd5402..df58cd534ff5 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -663,14 +663,10 @@ static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 38848ad68899..c57673d499be 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -287,14 +287,11 @@ static int cttimeout_get_timeout(struct sk_buff *skb, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } @@ -427,9 +424,9 @@ static int cttimeout_default_get(struct sk_buff *skb, const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -438,9 +435,8 @@ static int cttimeout_default_get(struct sk_buff *skb, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); - err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: @@ -480,13 +476,11 @@ static int cttimeout_default_get(struct sk_buff *skb, } if (!timeouts) - goto err; + return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, @@ -496,18 +490,10 @@ static int cttimeout_default_get(struct sk_buff *skb, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, -- cgit v1.2.3-58-ga151 From 586d5a8bcede47fda7bebf4b36be917c5010db16 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:03 +0200 Subject: netfilter: x_tables: reduce xt_action_param by 8 byte The fragment offset in ipv4/ipv6 is a 16bit field, so use u16 instead of unsigned int. On 64bit: 40 bytes to 32 bytes. By extension this also reduces nft_pktinfo (56 to 48 byte). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 07c6ad8f2a02..28d7027cd460 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -36,8 +36,8 @@ struct xt_action_param { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; - int fragoff; unsigned int thoff; + u16 fragoff; bool hotdrop; }; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e810a23baf99..de2cf3943b91 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -51,7 +51,7 @@ ip6_packet_match(const struct sk_buff *skb, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, - int *fragoff, bool *hotdrop) + u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); -- cgit v1.2.3-58-ga151 From 85554eb981e5a8b0b8947611193aef1737081ef2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:05 +0200 Subject: netfilter: nf_tables: add and use nft_sk helper This allows to change storage placement later on without changing readers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/ipv4/netfilter/nft_reject_ipv4.c | 2 +- net/ipv6/netfilter/nft_reject_ipv6.c | 2 +- net/netfilter/nft_reject_inet.c | 4 ++-- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 27eeb613bb4e..af1228f58e5a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -29,6 +29,11 @@ struct nft_pktinfo { struct xt_action_param xt; }; +static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->sk; +} + static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->xt.state->net; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ff437e4ed6db..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 7969d1f3018d..ed69c768797e 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 95090186ee90..554caf967baa 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: @@ -45,7 +45,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: -- cgit v1.2.3-58-ga151 From 2d7b4ace0754ebaaf71c6824880178d46aa0ab33 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:06 +0200 Subject: netfilter: nf_tables: add and use nft_thoff helper This allows to change storage placement later on without changing readers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nf_tables_trace.c | 6 +++--- net/netfilter/nft_exthdr.c | 8 ++++---- net/netfilter/nft_flow_offload.c | 2 +- net/netfilter/nft_payload.c | 10 +++++----- net/netfilter/nft_synproxy.c | 4 ++-- net/netfilter/nft_tproxy.c | 4 ++-- 8 files changed, 23 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index af1228f58e5a..10c1b8759990 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -34,6 +34,11 @@ static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) return pkt->xt.state->sk; } +static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) +{ + return pkt->xt.thoff; +} + static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->xt.state->net; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index dbc2e945c98e..7780342e2f2d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -81,7 +81,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!pkt->tprot_set) return false; - ptr = skb_network_header(skb) + pkt->xt.thoff; + ptr = skb_network_header(skb) + nft_thoff(pkt); } ptr += priv->offset; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0cf3278007ba..e4fe2f0780eb 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,17 +113,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + nh_end = pkt->tprot_set ? nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; if (pkt->tprot_set) { - len = min_t(unsigned int, skb->len - pkt->xt.thoff, + len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) + nft_thoff(pkt), len)) return -1; } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 4d0b8e1c40c0..1b0579cb62d0 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -167,7 +167,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) return NULL; - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; @@ -175,7 +175,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; - return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); + return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, @@ -251,7 +251,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; if (skb_ensure_writable(pkt->skb, - pkt->xt.thoff + i + priv->len)) + nft_thoff(pkt) + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -306,7 +306,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - unsigned int offset = pkt->xt.thoff + sizeof(struct sctphdr); + unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; const struct sctp_chunkhdr *sch; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 4843dd2b410c..0af34ad41479 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -291,7 +291,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst)) goto out; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 501c5b24cc39..a44b14f6c0dc 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -110,7 +110,7 @@ void nft_payload_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -507,7 +507,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: - if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; fallthrough; case IPPROTO_UDPLITE: @@ -520,7 +520,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return -1; } - *l4csum_offset += pkt->xt.thoff; + *l4csum_offset += nft_thoff(pkt); return 0; } @@ -612,7 +612,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -643,7 +643,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { - if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + if (nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 4fda8b3f1762..a0109fa1e92d 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; @@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, return; } - tcp = skb_header_pointer(skb, pkt->xt.thoff, + tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index accef672088c..18e79c0fd3cf 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -82,9 +82,9 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; + struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; -- cgit v1.2.3-58-ga151 From f06ad944b6a92dd9ce95f2e5f4164a8e70d32af5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:07 +0200 Subject: netfilter: nf_tables: remove unused arg in nft_set_pktinfo_unspec() The functions pass extra skb arg, but either its not used or the helpers can already access it via pkt->skb. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +-- include/net/netfilter/nf_tables_ipv4.h | 28 ++++++++++++---------------- include/net/netfilter/nf_tables_ipv6.h | 30 +++++++++++++----------------- net/netfilter/nft_chain_filter.c | 26 +++++++++++++------------- net/netfilter/nft_chain_nat.c | 4 ++-- net/netfilter/nft_chain_route.c | 4 ++-- 6 files changed, 43 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 10c1b8759990..958b8e68bb1a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -72,8 +72,7 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, pkt->xt.state = state; } -static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->tprot_set = false; pkt->tprot = 0; diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 1f7bea39ad1b..b185a9216bf1 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -5,8 +5,7 @@ #include #include -static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; @@ -17,14 +16,13 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; } -static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; u32 len, thoff; - iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), - &_iph); + iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + sizeof(*iph), &_iph); if (!iph) return -1; @@ -33,7 +31,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, len = ntohs(iph->tot_len); thoff = iph->ihl * 4; - if (skb->len < len) + if (pkt->skb->len < len) return -1; else if (len < thoff) return -1; @@ -46,29 +44,27 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, return 0; } -static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv4_validate(pkt, skb) < 0) - nft_set_pktinfo_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) + nft_set_pktinfo_unspec(pkt); } -static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; - iph = ip_hdr(skb); + iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; len = ntohs(iph->tot_len); thoff = iph->ihl * 4; - if (skb->len < len) { + if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 867de29f3f7a..bf132d488b17 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -6,8 +6,7 @@ #include #include -static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; @@ -15,7 +14,7 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { - nft_set_pktinfo_unspec(pkt, skb); + nft_set_pktinfo_unspec(pkt); return; } @@ -25,8 +24,7 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, pkt->xt.fragoff = frag_off; } -static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; @@ -36,8 +34,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, int protohdr; u32 pkt_len; - ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), - &_ip6h); + ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; @@ -45,7 +43,7 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, return -1; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) + if (pkt_len + sizeof(*ip6h) > pkt->skb->len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); @@ -63,15 +61,13 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, #endif } -static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv6_validate(pkt, skb) < 0) - nft_set_pktinfo_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) + nft_set_pktinfo_unspec(pkt); } -static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; @@ -82,15 +78,15 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt, int protohdr; u32 pkt_len; - if (!pskb_may_pull(skb, sizeof(*ip6h))) + if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; - ip6h = ipv6_hdr(skb); + ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) { + if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 363bdd7044ec..5b02408a920b 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } @@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } @@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } @@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, switch (state->pf) { case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; default: break; @@ -174,7 +174,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv4_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) return NF_DROP; break; case htons(ETH_P_IPV6): @@ -182,7 +182,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv6_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) return NF_DROP; break; default: @@ -238,13 +238,13 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } @@ -293,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index eac4a901233f..98e4946100c5 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, switch (state->pf) { #ifdef CONFIG_NF_TABLES_IPV4 case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; #endif #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; #endif default: diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index edd02cda57fc..925db0dce48d 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv, u8 tos; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); mark = skb->mark; iph = ip_hdr(skb); @@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv, int err; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); -- cgit v1.2.3-58-ga151 From 897389de48283d413728dae7520973872cef8eb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:08 +0200 Subject: netfilter: nf_tables: remove xt_action_param from nft_pktinfo Init it on demand in the nft_compat expression. This reduces size of nft_pktinfo from 48 to 24 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 25 +++++++++++++------------ include/net/netfilter/nf_tables_ipv4.h | 12 ++++++------ include/net/netfilter/nf_tables_ipv6.h | 12 ++++++------ net/netfilter/nft_compat.c | 28 ++++++++++++++++++---------- 4 files changed, 43 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 958b8e68bb1a..6783164428f1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -23,45 +23,46 @@ struct module; struct nft_pktinfo { struct sk_buff *skb; + const struct nf_hook_state *state; bool tprot_set; u8 tprot; - /* for x_tables compatibility */ - struct xt_action_param xt; + u16 fragoff; + unsigned int thoff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { - return pkt->xt.state->sk; + return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { - return pkt->xt.thoff; + return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { - return pkt->xt.state->net; + return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { - return pkt->xt.state->hook; + return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { - return pkt->xt.state->pf; + return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { - return pkt->xt.state->in; + return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { - return pkt->xt.state->out; + return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, @@ -69,15 +70,15 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { pkt->skb = skb; - pkt->xt.state = state; + pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->tprot_set = false; pkt->tprot = 0; - pkt->xt.thoff = 0; - pkt->xt.fragoff = 0; + pkt->thoff = 0; + pkt->fragoff = 0; } /** diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index b185a9216bf1..eb4c094cd54d 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -12,8 +12,8 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) ip = ip_hdr(pkt->skb); pkt->tprot_set = true; pkt->tprot = ip->protocol; - pkt->xt.thoff = ip_hdrlen(pkt->skb); - pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + pkt->thoff = ip_hdrlen(pkt->skb); + pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) @@ -38,8 +38,8 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) pkt->tprot_set = true; pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + pkt->thoff = thoff; + pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } @@ -73,8 +73,8 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) pkt->tprot_set = true; pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + pkt->thoff = thoff; + pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index bf132d488b17..7595e02b00ba 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -20,8 +20,8 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) pkt->tprot_set = true; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; } static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) @@ -52,8 +52,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) pkt->tprot_set = true; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; return 0; #else @@ -98,8 +98,8 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) pkt->tprot_set = true; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; return 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5415ab14400d..3144a9ad2f6a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -57,8 +57,13 @@ union nft_entry { }; static inline void -nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +nft_compat_set_par(struct xt_action_param *par, + const struct nft_pktinfo *pkt, + const void *xt, const void *xt_info) { + par->state = pkt->state; + par->thoff = nft_thoff(pkt); + par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; @@ -71,13 +76,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -97,13 +103,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -350,13 +357,14 @@ static void __nft_match_eval(const struct nft_expr *expr, { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; bool ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + nft_compat_set_par(&xt, pkt, match, info); - ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + ret = match->match(skb, &xt); - if (pkt->xt.hotdrop) { + if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } -- cgit v1.2.3-58-ga151 From 7a68cc16b82c963c096387917ced11a27c352261 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 15 May 2021 17:22:08 +0200 Subject: batman-adv: mcast: add MRD + routable IPv4 multicast with bridges support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for routable IPv4 multicast addresses (224.0.0.0/4, excluding 224.0.0.0/24) in bridged setups. This utilizes the Multicast Router Discovery (MRD, RFC4286) support in the Linux bridge. batman-adv will now query the Linux bridge for IPv4 multicast routers, which the bridge has previously learned about via MRD. This allows us to then safely send routable IPv4 multicast packets in bridged setups to multicast listeners and multicast routers only. Before we had to flood such packets to avoid potential multicast packet loss to IPv4 multicast routers, which we were not able to detect before. With the bridge MRD integration, we are now also able to perform more fine-grained detection of IPv6 multicast routers in bridged setups: Before we were "guessing" IPv6 multicast routers by looking up multicast listeners for the link-local All Routers multicast address (ff02::2), which every IPv6 multicast router is listening to. However this would also include more nodes than necessary: For instance nodes which are just a router for unicast, but not multicast would be included, too. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/multicast.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d63c8cbbfe7..923e2197c2db 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -193,53 +193,22 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ -#if IS_ENABLED(CONFIG_IPV6) static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { - struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct net_device *dev = bat_priv->soft_iface; - struct br_ip_list *br_ip_entry, *tmp; - u8 flags = BATADV_MCAST_WANT_NO_RTR6; - int ret; + u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; - /* TODO: ask the bridge if a multicast router is present (the bridge - * is capable of performing proper RFC4286 multicast router - * discovery) instead of searching for a ff02::2 listener here - */ - ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); - if (ret < 0) - return BATADV_NO_FLAGS; - - list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { - /* the bridge snooping does not maintain IPv4 link-local - * addresses - therefore we won't find any IPv4 multicast router - * address here, only IPv6 ones - */ - if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && - ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.dst.ip6)) - flags &= ~BATADV_MCAST_WANT_NO_RTR6; - - list_del(&br_ip_entry->list); - kfree(br_ip_entry); - } + if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) + flags |= BATADV_MCAST_WANT_NO_RTR4; + if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) + flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } -#else -static inline u8 -batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, - struct net_device *bridge) -{ - if (bridge) - return BATADV_NO_FLAGS; - else - return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; -} -#endif /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags -- cgit v1.2.3-58-ga151 From 1cf1ef60a1a64cb5e00148b35bb35abad0984234 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 28 May 2021 17:56:25 +0800 Subject: batman-adv: Remove the repeated declaration Function 'batadv_bla_claim_dump' is declared twice, so remove the repeated declaration. Signed-off-by: Shaokun Zhang Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 5c22955bb9d5..8673a265995f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -52,7 +52,6 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); -int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #ifdef CONFIG_BATMAN_ADV_DAT bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); -- cgit v1.2.3-58-ga151 From 62f20e068ccc50d6ab66fdb72ba90da2b9418c99 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 29 May 2021 13:07:46 +0200 Subject: ipv6: use prandom_u32() for ID generation This is a complement to commit aa6dd211e4b1 ("inet: use bigger hash table for IP ID generation"), but focusing on some specific aspects of IPv6. Contary to IPv4, IPv6 only uses packet IDs with fragments, and with a minimum MTU of 1280, it's much less easy to force a remote peer to produce many fragments to explore its ID sequence. In addition packet IDs are 32-bit in IPv6, which further complicates their analysis. On the other hand, it is often easier to choose among plenty of possible source addresses and partially work around the bigger hash table the commit above permits, which leaves IPv6 partially exposed to some possibilities of remote analysis at the risk of weakening some protocols like DNS if some IDs can be predicted with a good enough probability. Given the wide range of permitted IDs, the risk of collision is extremely low so there's no need to rely on the positive increment algorithm that is shared with the IPv4 code via ip_idents_reserve(). We have a fast PRNG, so let's simply call prandom_u32() and be done with it. Performance measurements at 10 Gbps couldn't show any difference with the previous code, even when using a single core, because due to the large fragments, we're limited to only ~930 kpps at 10 Gbps and the cost of the random generation is completely offset by other operations and by the network transfer time. In addition, this change removes the need to update a shared entry in the idents table so it may even end up being slightly faster on large scale systems where this matters. The risk of at least one collision here is about 1/80 million among 10 IDs, 1/850k among 100 IDs, and still only 1/8.5k among 1000 IDs, which remains very low compared to IPv4 where all IDs are reused every 4 to 80ms on a 10 Gbps flow depending on packet sizes. Reported-by: Amit Klein Signed-off-by: Willy Tarreau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210529110746.6796-1-w@1wt.eu Signed-off-by: Jakub Kicinski --- net/ipv6/output_core.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index af36acc1a644..2880dc7d9a49 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -15,29 +15,11 @@ static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { - const struct { - struct in6_addr dst; - struct in6_addr src; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .dst = *dst, - .src = *src, - }; - u32 hash, id; - - /* Note the following code is not safe, but this is okay. */ - if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) - get_random_bytes(&net->ipv4.ip_id_key, - sizeof(net->ipv4.ip_id_key)); - - hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); - - /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, - * set the hight order instead thus minimizing possible future - * collisions. - */ - id = ip_idents_reserve(hash, 1); - if (unlikely(!id)) - id = 1 << 31; + u32 id; + + do { + id = prandom_u32(); + } while (!id); return id; } -- cgit v1.2.3-58-ga151 From 12e64b3bb9a8cd376a1a92434302e5aa7f1ab1e5 Mon Sep 17 00:00:00 2001 From: Rocco Yue Date: Sun, 30 May 2021 19:38:11 +0800 Subject: ipv6: align code with context The Tab key is used three times, causing the code block to be out of alignment with the context. Signed-off-by: Rocco Yue Link: https://lore.kernel.org/r/20210530113811.8817-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b0ef65eb9bd2..048570900fdf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6903,10 +6903,10 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { - .procname = "addr_gen_mode", - .data = &ipv6_devconf.addr_gen_mode, - .maxlen = sizeof(int), - .mode = 0644, + .procname = "addr_gen_mode", + .data = &ipv6_devconf.addr_gen_mode, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, { -- cgit v1.2.3-58-ga151 From 01709d0977d464b862968ef063899e576615e5e5 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 31 May 2021 10:00:19 +0800 Subject: nfc: hci: Fix spelling mistakes Fix some spelling mistakes in comments: occured ==> occurred negociate ==> negotiate Signed-off-by: Zheng Yongjun Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20210531020019.2919799-1-zhengyongjun3@huawei.com Signed-off-by: Jakub Kicinski --- net/nfc/hci/command.c | 2 +- net/nfc/hci/core.c | 2 +- net/nfc/hci/llc_shdlc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index e02b9befce0b..3a89bd9b89fc 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -34,7 +34,7 @@ static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, * HCI command execution completion callback. * err will be a standard linux error (may be converted from HCI response) * skb contains the response data and must be disposed, or may be NULL if - * an error occured + * an error occurred */ static void nfc_hci_execute_cb(void *context, struct sk_buff *skb, int err) { diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 43811b5219b5..3481941be70b 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -705,7 +705,7 @@ static void hci_transceive_cb(void *context, struct sk_buff *skb, int err) /* * TODO: Check RF Error indicator to make sure data is valid. * It seems that HCI cmd can complete without error, but data - * can be invalid if an RF error occured? Ignore for now. + * can be invalid if an RF error occurred? Ignore for now. */ if (err == 0) skb_trim(skb, skb->len - 1); /* RF Err ind */ diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index c0c8fea3a186..1e3a90049da9 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -406,7 +406,7 @@ static void llc_shdlc_rcv_u_frame(struct llc_shdlc *shdlc, case SHDLC_NEGOTIATING: case SHDLC_CONNECTING: /* - * We sent RSET, but chip wants to negociate or we + * We sent RSET, but chip wants to negotiate or we * got RSET before we managed to send out our. */ if (skb->len > 0) -- cgit v1.2.3-58-ga151 From 37f2ad2b9018c5e23455536e5c240cac1334f20a Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 31 May 2021 10:00:48 +0800 Subject: net: sched: Fix spelling mistakes Fix some spelling mistakes in comments: sevaral ==> several sugestion ==> suggestion unregster ==> unregister suplied ==> supplied cirsumstances ==> circumstances Signed-off-by: Zheng Yongjun Link: https://lore.kernel.org/r/20210531020048.2920054-1-zhengyongjun3@huawei.com Signed-off-by: Jakub Kicinski --- net/sched/cls_rsvp.h | 2 +- net/sched/ematch.c | 2 +- net/sched/sch_gred.c | 2 +- net/sched/sch_htb.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2e288f88ff02..27a4b6dbcf57 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -7,7 +7,7 @@ /* Comparing to general packet classification problem, - RSVP needs only sevaral relatively simple rules: + RSVP needs only several relatively simple rules: * (dst, protocol) are always specified, so that we are able to hash them. diff --git a/net/sched/ematch.c b/net/sched/ematch.c index f885bea5b452..4ce681361851 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -141,7 +141,7 @@ errout: EXPORT_SYMBOL(tcf_em_register); /** - * tcf_em_unregister - unregster and extended match + * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index f4132dc25ac0..621dc6afde8f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -6,7 +6,7 @@ * * 991129: - Bug fix with grio mode * - a better sing. AvgQ mode with Grio(WRED) - * - A finer grained VQ dequeue based on sugestion + * - A finer grained VQ dequeue based on suggestion * from Ren Liu * - More error checks * diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 081c11d5717c..282614614905 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -52,7 +52,7 @@ */ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */ -#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ +#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" @@ -523,7 +523,7 @@ htb_class_mode(struct htb_class *cl, s64 *diff) * htb_change_class_mode - changes classe's mode * * This should be the only way how to change classe's mode under normal - * cirsumstances. Routine will update feed lists linkage, change mode + * circumstances. Routine will update feed lists linkage, change mode * and add class to the wait event queue if appropriate. New mode should * be different from old one and cl->pq_key has to be valid if changing * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). -- cgit v1.2.3-58-ga151 From 379aecbce08f7187feab9aa12609d049ce2675b4 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 31 May 2021 14:36:17 +0800 Subject: rds: Fix spelling mistakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix some spelling mistakes in comments: alloced ==> allocated Signed-off-by: Zheng Yongjun Reviewed-by: Håkon Bugge Link: https://lore.kernel.org/r/20210531063617.3018637-1-zhengyongjun3@huawei.com Signed-off-by: Jakub Kicinski --- net/rds/ib_ring.c | 2 +- net/rds/tcp_recv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c index ff97e8eda858..006b2e441418 100644 --- a/net/rds/ib_ring.c +++ b/net/rds/ib_ring.c @@ -141,7 +141,7 @@ int rds_ib_ring_low(struct rds_ib_work_ring *ring) } /* - * returns the oldest alloced ring entry. This will be the next one + * returns the oldest allocated ring entry. This will be the next one * freed. This can't be called if there are none allocated. */ u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 42c5ff1eda95..f4ee13da90c7 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -177,7 +177,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } tc->t_tinc = tinc; - rdsdebug("alloced tinc %p\n", tinc); + rdsdebug("allocated tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = -- cgit v1.2.3-58-ga151 From 0c2c366e0ec55533decb00d0f1ea1cbc42247e7b Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 1 Jun 2021 10:08:01 +0800 Subject: sctp: sm_statefuns: Fix spelling mistakes Fix some spelling mistakes in comments: genereate ==> generate correclty ==> correctly boundries ==> boundaries failes ==> fails isses ==> issues assocition ==> association signe ==> sign assocaition ==> association managemement ==> management restransmissions ==> retransmission sideffect ==> sideeffect bomming ==> booming chukns ==> chunks SHUDOWN ==> SHUTDOWN violationg ==> violating explcitly ==> explicitly CHunk ==> Chunk Signed-off-by: Zheng Yongjun Link: https://lore.kernel.org/r/20210601020801.3625358-1-zhengyongjun3@huawei.com Signed-off-by: Jakub Kicinski --- net/sctp/sm_statefuns.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fd1e319eda00..4f30388a0dd0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -361,7 +361,7 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, /* If the INIT is coming toward a closing socket, we'll send back * and ABORT. Essentially, this catches the race of INIT being - * backloged to the socket at the same time as the user isses close(). + * backloged to the socket at the same time as the user issues close(). * Since the socket and all its associations are going away, we * can treat this OOTB */ @@ -608,8 +608,8 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); - /* SCTP-AUTH: genereate the assocition shared keys so that - * we can potentially signe the COOKIE-ECHO. + /* SCTP-AUTH: generate the association shared keys so that + * we can potentially sign the COOKIE-ECHO. */ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); @@ -787,7 +787,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_init; /* SCTP-AUTH: Now that we've populate required fields in - * sctp_process_init, set up the assocaition shared keys as + * sctp_process_init, set up the association shared keys as * necessary so that we can potentially authenticate the ACK */ error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC); @@ -838,7 +838,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, /* Add all the state machine commands now since we've created * everything. This way we don't introduce memory corruptions - * during side-effect processing and correclty count established + * during side-effect processing and correctly count established * associations. */ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); @@ -923,7 +923,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, commands); /* Reset init error count upon receipt of COOKIE-ACK, - * to avoid problems with the managemement of this + * to avoid problems with the management of this * counter in stale cookie situations when a transition back * from the COOKIE-ECHOED state to the COOKIE-WAIT * state is performed. @@ -2950,7 +2950,7 @@ enum sctp_disposition sctp_sf_do_9_2_reshutack( commands); /* Since we are not going to really process this INIT, there - * is no point in verifying chunk boundries. Just generate + * is no point in verifying chunk boundaries. Just generate * the SHUTDOWN ACK. */ reply = sctp_make_shutdown_ack(asoc, chunk); @@ -3560,7 +3560,7 @@ enum sctp_disposition sctp_sf_do_9_2_final(struct net *net, goto nomem_chunk; /* Do all the commands now (after allocation), so that we - * have consistent state if memory allocation failes + * have consistent state if memory allocation fails */ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); @@ -3747,7 +3747,7 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* We need to discard the rest of the packet to prevent - * potential bomming attacks from additional bundled chunks. + * potential boomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. */ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -4257,7 +4257,7 @@ gen_shutdown: } /* - * SCTP-AUTH Section 6.3 Receiving authenticated chukns + * SCTP-AUTH Section 6.3 Receiving authenticated chunks * * The receiver MUST use the HMAC algorithm indicated in the HMAC * Identifier field. If this algorithm was not specified by the @@ -4812,7 +4812,7 @@ static enum sctp_disposition sctp_sf_violation_ctsn( /* Handle protocol violation of an invalid chunk bundling. For example, * when we have an association and we receive bundled INIT-ACK, or - * SHUDOWN-COMPLETE, our peer is clearly violationg the "MUST NOT bundle" + * SHUTDOWN-COMPLETE, our peer is clearly violating the "MUST NOT bundle" * statement from the specs. Additionally, there might be an attacker * on the path and we may not want to continue this communication. */ @@ -5208,7 +5208,7 @@ enum sctp_disposition sctp_sf_cookie_wait_prm_shutdown( * Inputs * (endpoint, asoc) * - * The RFC does not explcitly address this issue, but is the route through the + * The RFC does not explicitly address this issue, but is the route through the * state table when someone issues a shutdown while in COOKIE_ECHOED state. * * Outputs @@ -5932,7 +5932,7 @@ enum sctp_disposition sctp_sf_t1_cookie_timer_expire( /* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN * with the updated last sequential TSN received from its peer. * - * An endpoint should limit the number of retransmissions of the + * An endpoint should limit the number of retransmission of the * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'. * If this threshold is exceeded the endpoint should destroy the TCB and * MUST report the peer endpoint unreachable to the upper layer (and @@ -6010,7 +6010,7 @@ nomem: } /* - * ADDIP Section 4.1 ASCONF CHunk Procedures + * ADDIP Section 4.1 ASCONF Chunk Procedures * If the T4 RTO timer expires the endpoint should do B1 to B5 */ enum sctp_disposition sctp_sf_t4_timer_expire( @@ -6441,7 +6441,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, chunk->ecn_ce_done = 1; if (af->is_ce(sctp_gso_headskb(chunk->skb))) { - /* Do real work as sideffect. */ + /* Do real work as side effect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); } -- cgit v1.2.3-58-ga151 From d7b0408934c749f546b01f2b33d07421a49b6f3e Mon Sep 17 00:00:00 2001 From: Varad Gautam Date: Fri, 28 May 2021 18:04:06 +0200 Subject: xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype xfrm_policy_lookup_bytype loops on seqcount mutex xfrm_policy_hash_generation within an RCU read side critical section. Although ill advised, this is fine if the loop is bounded. xfrm_policy_hash_generation wraps mutex hash_resize_mutex, which is used to serialize writers (xfrm_hash_resize, xfrm_hash_rebuild). This is fine too. On PREEMPT_RT=y, the read_seqcount_begin call within xfrm_policy_lookup_bytype emits a mutex lock/unlock for hash_resize_mutex. Mutex locking is fine, since RCU read side critical sections are allowed to sleep with PREEMPT_RT. xfrm_hash_resize can, however, block on synchronize_rcu while holding hash_resize_mutex. This leads to the following situation on PREEMPT_RT, where the writer is blocked on RCU grace period expiry, while the reader is blocked on a lock held by the writer: Thead 1 (xfrm_hash_resize) Thread 2 (xfrm_policy_lookup_bytype) rcu_read_lock(); mutex_lock(&hash_resize_mutex); read_seqcount_begin(&xfrm_policy_hash_generation); mutex_lock(&hash_resize_mutex); // block xfrm_bydst_resize(); synchronize_rcu(); // block Move the read_seqcount_begin call outside of the RCU read side critical section, and do an rcu_read_unlock/retry if we got stale data within the critical section. On non-PREEMPT_RT, this shortens the time spent within RCU read side critical section in case the seqcount needs a retry, and avoids unbounded looping. Fixes: 77cc278f7b20 ("xfrm: policy: Use sequence counters with associated lock") Signed-off-by: Varad Gautam Cc: linux-rt-users Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org # v4.9 Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Florian Westphal Cc: "Ahmed S. Darwish" Signed-off-by: Steffen Klassert Acked-by: Ahmed S. Darwish --- net/xfrm/xfrm_policy.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b74f28cabe24..8c56e3e59c3c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2092,12 +2092,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - rcu_read_lock(); retry: - do { - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); - chain = policy_hash_direct(net, daddr, saddr, family, dir); - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + rcu_read_lock(); + + chain = policy_hash_direct(net, daddr, saddr, family, dir); + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) { + rcu_read_unlock(); + goto retry; + } ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { @@ -2128,11 +2131,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } skip_inexact: - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) { + rcu_read_unlock(); goto retry; + } - if (ret && !xfrm_pol_hold_rcu(ret)) + if (ret && !xfrm_pol_hold_rcu(ret)) { + rcu_read_unlock(); goto retry; + } fail: rcu_read_unlock(); -- cgit v1.2.3-58-ga151 From eebd49a4ffb420a991c606e54aa3c9f02857a334 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 29 May 2021 16:23:18 -0400 Subject: xfrm: remove the fragment check for ipv6 beet mode In commit 68dc022d04eb ("xfrm: BEET mode doesn't support fragments for inner packets"), it tried to fix the issue that in TX side the packet is fragmented before the ESP encapping while in the RX side the fragments always get reassembled before decapping with ESP. This is not true for IPv6. IPv6 is different, and it's using exthdr to save fragment info, as well as the ESP info. Exthdrs are added in TX and processed in RX both in order. So in the above case, the ESP decapping will be done earlier than the fragment reassembling in TX side. Here just remove the fragment check for the IPv6 inner packets to recover the fragments support for BEET mode. Fixes: 68dc022d04eb ("xfrm: BEET mode doesn't support fragments for inner packets") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4cb0ff4dcf4..ac907b9d32d1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -711,15 +711,8 @@ out: static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) - unsigned int ptr = 0; int err; - if (x->outer_mode.encap == XFRM_MODE_BEET && - ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL) >= 0) { - net_warn_ratelimited("BEET mode doesn't support inner IPv6 fragments\n"); - return -EAFNOSUPPORT; - } - err = xfrm6_tunnel_check_size(skb); if (err) return err; -- cgit v1.2.3-58-ga151 From 89258f8e4148630a7d327d23ce55b6f80b290ff4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 May 2021 18:50:45 +0200 Subject: netfilter: nft_set_pipapo_avx2: fix up description warnings W=1: net/netfilter/nft_set_pipapo_avx2.c:159: warning: Excess function parameter 'len' description in 'nft_pipapo_avx2_refill' net/netfilter/nft_set_pipapo_avx2.c:1124: warning: Function parameter or member 'key' not described in 'nft_pipapo_avx2_lookup' net/netfilter/nft_set_pipapo_avx2.c:1124: warning: Excess function parameter 'elem' description in 'nft_pipapo_avx2_lookup' Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 1c2620923a61..e517663e0cd1 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -142,7 +142,6 @@ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers - * @len: Length of bitmap in longs * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage @@ -1109,7 +1108,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. -- cgit v1.2.3-58-ga151 From 8a1c08ad19b6ecb7254eca5c7275cb5d6fa1b0cb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 May 2021 18:53:25 +0200 Subject: netfilter: fix clang-12 fmt string warnings nf_conntrack_h323_main.c:198:6: warning: format specifies type 'unsigned short' but xt_AUDIT.c:121:9: warning: format specifies type 'unsigned char' but the argument has type 'int' [-Wformat] Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/xt_AUDIT.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index aafaff00baf1..2eb31ffb3d14 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -194,7 +194,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ pr_debug("nf_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", + "there will be TPKT data of %d bytes\n", tpktlen - 4); info->tpkt_len[dir] = tpktlen - 4; return 0; diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 9cdc16b0d0d8..b6a015aee0ce 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par) const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { - pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n", + pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } -- cgit v1.2.3-58-ga151 From b0f6c9ac8088a01cd9b6bfba8ede22f1bd0ff72f Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Mon, 31 May 2021 12:23:23 -0300 Subject: netpoll: don't require irqs disabled in rt kernels write_msg(netconsole.c:836) calls netpoll_send_udp after a call to spin_lock_irqsave, which normally disables interrupts; but in PREEMPT_RT this call just locks an rt_mutex without disabling irqs. In this case, netpoll_send_udp is called with interrupts enabled. Signed-off-by: Wander Lairson Costa Signed-off-by: David S. Miller --- net/core/netpoll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c310c7c1cef7..0a6b04714558 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -389,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; - WARN_ON_ONCE(!irqs_disabled()); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) -- cgit v1.2.3-58-ga151 From 7cf85f8caa042db0e33d70dbd72d8b92b1051f93 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 1 Jun 2021 17:49:50 +0800 Subject: NFC: nci: Remove redundant assignment to len Variable 'len' is set to conn_info->max_pkt_payload_len but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: net/nfc/nci/hci.c:164:3: warning: Value stored to 'len' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- net/nfc/nci/hci.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 96865142104f..d6732e5e8958 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -161,8 +161,6 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, *(u8 *)skb_push(skb, 1) = data_type; do { - len = conn_info->max_pkt_payload_len; - /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { -- cgit v1.2.3-58-ga151 From 9c5eee0afca09cbde6bd00f77876754aaa552970 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Tue, 1 Jun 2021 15:30:50 +0300 Subject: net/sched: act_vlan: Fix modify to allow 0 Currently vlan modification action checks existence of vlan priority by comparing it to 0. Therefore it is impossible to modify existing vlan tag to have priority 0. For example, the following tc command will change the vlan id but will not affect vlan priority: tc filter add dev eth1 ingress matchall action vlan modify id 300 \ priority 0 pipe mirred egress redirect dev eth2 The incoming packet on eth1: ethertype 802.1Q (0x8100), vlan 200, p 4, ethertype IPv4 will be changed to: ethertype 802.1Q (0x8100), vlan 300, p 4, ethertype IPv4 although the user has intended to have p == 0. The fix is to add tcfv_push_prio_exists flag to struct tcf_vlan_params and rely on it when deciding to set the priority. Fixes: 45a497f2d149a4a8061c (net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action) Signed-off-by: Boris Sukholitko Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 1 + net/sched/act_vlan.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index f051046ba034..f94b8bc26f9e 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ struct tcf_vlan_params { u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; + bool tcfv_push_prio_exists; struct rcu_head rcu; }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1cac3c6fbb49..a108469c664f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -70,7 +70,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* replace the vid */ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (p->tcfv_push_prio) { + if (p->tcfv_push_prio_exists) { tci &= ~VLAN_PRIO_MASK; tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } @@ -121,6 +121,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; + bool push_prio_exists = false; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; @@ -189,7 +190,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, push_proto = htons(ETH_P_8021Q); } - if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY]; + if (push_prio_exists) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; case TCA_VLAN_ACT_POP_ETH: @@ -241,6 +243,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; + p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH; p->tcfv_push_proto = push_proto; if (action == TCA_VLAN_ACT_PUSH_ETH) { -- cgit v1.2.3-58-ga151 From 8323b20f1d76b10fb413daae6abf76b7b903c8de Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Tue, 1 Jun 2021 15:30:51 +0300 Subject: net/sched: act_vlan: No dump for unset priority Dump vlan priority only if it has been previously set. Fix the tests accordingly. Signed-off-by: Boris Sukholitko Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 4 ++-- tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a108469c664f..71f2015c70ca 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -307,8 +307,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, p->tcfv_push_proto) || - (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - p->tcfv_push_prio)))) + (p->tcfv_push_prio_exists && + nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio)))) goto nla_put_failure; if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json index 41d783254b08..eccbf0d7c7b9 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json @@ -463,7 +463,7 @@ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1Q id 5 index 100", "expExitCode": "0", "verifyCmd": "$TC actions get action vlan index 100", - "matchPattern": "action order [0-9]+: vlan.*modify id 100 protocol 802.1Q priority 0 pipe.*index 100 ref", + "matchPattern": "action order [0-9]+: vlan.*modify id 100 protocol 802.1Q pipe.*index 100 ref", "matchCount": "0", "teardown": [ "$TC actions flush action vlan" @@ -487,7 +487,7 @@ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1ad id 500 reclassify index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action vlan index 12", - "matchPattern": "action order [0-9]+: vlan.*modify id 500 protocol 802.1ad priority 0 reclassify.*index 12 ref", + "matchPattern": "action order [0-9]+: vlan.*modify id 500 protocol 802.1ad reclassify.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action vlan" -- cgit v1.2.3-58-ga151 From b923cda9638860d6fbb688cfc4c939ff13df31b5 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 1 Jun 2021 22:13:58 +0800 Subject: net: dcb: Return the correct errno code When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 51f80a2f8194..b441ab330fd3 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1381,7 +1381,7 @@ static int dcbnl_notify(struct net_device *dev, int event, int cmd, skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) - return -ENOBUFS; + return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); @@ -1781,7 +1781,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) - return -ENOBUFS; + return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { -- cgit v1.2.3-58-ga151 From ca746c55a7e6e597cc2d29a094082d345b2c33c9 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 1 Jun 2021 22:14:07 +0800 Subject: net: Return the correct errno code When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d9bccad65e2b..750f388a4a68 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -318,7 +318,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); - err = -ENOBUFS; + err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; -- cgit v1.2.3-58-ga151 From 791ad7f5c17ea3d0887b94eed1e7812777f8e496 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:56:03 +0800 Subject: batman-adv: Fix spelling mistakes Fix some spelling mistakes in comments: containg ==> containing dont ==> don't datas ==> data brodcast ==> broadcast Signed-off-by: Zheng Yongjun Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 4 ++-- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hash.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 7dc133cfc363..63d42dcc9324 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -395,7 +395,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame - * set HW SRC to the special mac containg the crc + * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -1040,7 +1040,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); - /* dont accept claims from gateways which are not in + /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 4a6a25d551a8..b99f64f483fc 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -403,7 +403,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, goto out; } - /* >1 neighbors -> (re)brodcast */ + /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 46696759f194..fb251c385a1b 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -18,7 +18,7 @@ #include #include -/* callback to a compare function. should compare 2 element datas for their +/* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same -- cgit v1.2.3-58-ga151 From bf6b260b8a9654db99761cde74c6b16356b9b441 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 1 Jun 2021 23:40:16 +0200 Subject: batman-adv: Drop implicit creation of batadv net_devices The sysfs code in batman-adv was could create a new batadv interfaces on demand when a string (interface name) was written to the batman-adv/mesh_iface file. But the code no longer exists in the current batman-adv codebase. The helper code to implement this behavior must be considered as unused and can be dropped. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 12 ++---------- net/batman-adv/soft-interface.c | 34 +--------------------------------- net/batman-adv/soft-interface.h | 2 -- 3 files changed, 3 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index b99f64f483fc..a638f35598f0 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -725,17 +725,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); soft_iface = dev_get_by_name(net, iface_name); - if (!soft_iface) { - soft_iface = batadv_softif_create(net, iface_name); - - if (!soft_iface) { - ret = -ENOMEM; - goto err; - } - - /* dev_get_by_name() increases the reference counter for us */ - dev_hold(soft_iface); + ret = -EINVAL; + goto err; } if (!batadv_softif_is_valid(soft_iface)) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a21884c0d47f..0c5b34251a6d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1086,38 +1086,6 @@ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, return register_netdevice(dev); } -/** - * batadv_softif_create() - Create and register soft interface - * @net: the applicable net namespace - * @name: name of the new soft interface - * - * Return: newly allocated soft_interface, NULL on errors - */ -struct net_device *batadv_softif_create(struct net *net, const char *name) -{ - struct net_device *soft_iface; - int ret; - - soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, - NET_NAME_UNKNOWN, batadv_softif_init_early); - if (!soft_iface) - return NULL; - - dev_net_set(soft_iface, net); - - soft_iface->rtnl_link_ops = &batadv_link_ops; - - ret = register_netdevice(soft_iface); - if (ret < 0) { - pr_err("Unable to register the batman interface '%s': %i\n", - name, ret); - free_netdev(soft_iface); - return NULL; - } - - return soft_iface; -} - /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 38b0ad182584..67a2ddd6832f 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -12,14 +12,12 @@ #include #include #include -#include #include int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); -struct net_device *batadv_softif_create(struct net *net, const char *name); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -- cgit v1.2.3-58-ga151 From fa205602d46e0b66c0c90672bce8b36e5de449df Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 1 Jun 2021 23:50:35 +0200 Subject: batman-adv: Avoid name based attaching of hard interfaces The sysfs code for the batman-adv/mesh_iface file was receiving a string of the batadv interface. This interface name was then provided to the code which shared sysfs+rtnetlink code for attaching an hard-interface to an batadv interface. The rtnetlink code was also using the (extracted) interface name from the ndo_add_slave callback to increase the shared code - even when it would have been more efficient to use the provided net_device object directly instead of searching it again (based on its name) in batadv_hardif_enable_interface. But this indirect handling is no longer necessary because the sysfs code was dropped. There is now only a single code path which is using batadv_hardif_enable_interface. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 14 ++++---------- net/batman-adv/hard-interface.h | 3 +-- net/batman-adv/soft-interface.c | 3 +-- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index a638f35598f0..81d201cc343d 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -705,16 +705,15 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave, /** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface - * @net: the applicable net namespace - * @iface_name: name of the soft interface + * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name) + struct net_device *soft_iface) { struct batadv_priv *bat_priv; - struct net_device *soft_iface, *master; + struct net_device *master; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); int ret; @@ -724,11 +723,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); - soft_iface = dev_get_by_name(net, iface_name); - if (!soft_iface) { - ret = -EINVAL; - goto err; - } + dev_hold(soft_iface); if (!batadv_softif_is_valid(soft_iface)) { pr_err("Can't create batman mesh interface %s: already exists as regular interface\n", @@ -802,7 +797,6 @@ err_upper: err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); -err: batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 83d11b46a9d8..8cb2a1f10080 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -16,7 +16,6 @@ #include #include #include -#include /** * enum batadv_hard_if_state - State of a hard interface @@ -75,7 +74,7 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name); + struct net_device *soft_iface); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0c5b34251a6d..ae368a42a4ad 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -842,14 +842,13 @@ static int batadv_softif_slave_add(struct net_device *dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; - struct net *net = dev_net(dev); int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; - ret = batadv_hardif_enable_interface(hard_iface, net, dev->name); + ret = batadv_hardif_enable_interface(hard_iface, dev); out: if (hard_iface) -- cgit v1.2.3-58-ga151 From 170258ce1c71dc4e03d5cb92b5f03cfb01941514 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 1 Jun 2021 23:52:48 +0200 Subject: batman-adv: Don't manually reattach hard-interface The batadv_hardif_enable_interface is now only called from the callback ndo_add_slave. This callback is only used by do_set_master in the rtnetlink code which only does two things: 1. remove the net_device from its old master 2. add the net_device to its new batadv master The code to replicate the first step in batman-adv is therefore unused since the sysfs code was dropped. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 81d201cc343d..44b0aa30c30a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -677,31 +677,6 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); } -/** - * batadv_master_del_slave() - remove hard_iface from the current master iface - * @slave: the interface enslaved in another master - * @master: the master from which slave has to be removed - * - * Invoke ndo_del_slave on master passing slave as argument. In this way the - * slave is free'd and the master can correctly change its internal state. - * - * Return: 0 on success, a negative value representing the error otherwise - */ -static int batadv_master_del_slave(struct batadv_hard_iface *slave, - struct net_device *master) -{ - int ret; - - if (!master) - return 0; - - ret = -EBUSY; - if (master->netdev_ops->ndo_del_slave) - ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev); - - return ret; -} - /** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface @@ -713,7 +688,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net_device *soft_iface) { struct batadv_priv *bat_priv; - struct net_device *master; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); int ret; @@ -732,14 +706,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, goto err_dev; } - /* check if the interface is enslaved in another virtual one and - * in that case unlink it first - */ - master = netdev_master_upper_dev_get(hard_iface->net_dev); - ret = batadv_master_del_slave(hard_iface, master); - if (ret) - goto err_dev; - hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); -- cgit v1.2.3-58-ga151 From 020577f879be736bc87e1f48dfad7220923401c0 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 2 Jun 2021 21:54:30 +0200 Subject: batman-adv: Drop reduntant batadv interface check If batadv_hardif_enable_interface is called then its called from its callback ndo_add_slave. It is therefore not necessary to check if it is a batadv interface. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 44b0aa30c30a..55d97e18aa4a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -698,14 +697,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); dev_hold(soft_iface); - - if (!batadv_softif_is_valid(soft_iface)) { - pr_err("Can't create batman mesh interface %s: already exists as regular interface\n", - soft_iface->name); - ret = -EINVAL; - goto err_dev; - } - hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); -- cgit v1.2.3-58-ga151 From 7f0e869c4e3902abc44ae6a9c9fc4fba6867408e Mon Sep 17 00:00:00 2001 From: zhang kai Date: Wed, 2 Jun 2021 09:50:39 +0800 Subject: sit: replace 68 with micro IPV4_MIN_MTU Use meaningfull micro IPV4_MIN_MTU Signed-off-by: zhang kai Signed-off-by: David S. Miller --- net/ipv6/sit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index aa98294a3ad3..71b57bdb1519 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -970,7 +970,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; - if (mtu < 68) { + if (mtu < IPV4_MIN_MTU) { dev->stats.collisions++; ip_rt_put(rt); goto tx_error; -- cgit v1.2.3-58-ga151 From b676c7f1c383390bfd940582e493b374541f2dc2 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:54:28 +0800 Subject: ethtool: Fix a typo atribute ==> attribute Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ethtool/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 8abcbc10796c..90b10966b16b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -138,7 +138,7 @@ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, } /** - * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null -- cgit v1.2.3-58-ga151 From 8ab1784df65176ad24b6d1e1376b8dd53ce2b695 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:54:42 +0800 Subject: 9p/trans_virtio: Fix spelling mistakes reseting ==> resetting alloced ==> allocated accomodate ==> accommodate Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/9p/trans_virtio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 93f2f8654882..2bbd7dce0f1d 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -99,7 +99,7 @@ static unsigned int rest_of_page(void *data) * @client: client instance * * This reclaims a channel by freeing its resources and - * reseting its inuse flag. + * resetting its inuse flag. * */ @@ -463,7 +463,7 @@ req_retry_pinned: * For example TREAD have 11. * 11 is the read/write header = PDU Header(7) + IO Size (4). * Arrange in such a way that server places header in the - * alloced memory and payload onto the user buffer. + * allocated memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); @@ -760,7 +760,7 @@ static struct p9_trans_module p9_virtio_trans = { .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response - * headers. We also skip one more entry to accomodate, address + * headers. We also skip one more entry to accommodate, address * that are not at page boundary, that can result in an extra * page in zero copy. */ -- cgit v1.2.3-58-ga151 From 91641b79e1e1538eed5dac1a00770c2b94e07a33 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:54:58 +0800 Subject: Bluetooth: Fix spelling mistakes Fix some spelling mistakes in comments: udpate ==> update retreive ==> retrieve accidentially ==> accidentally correspondig ==> corresponding adddress ==> address estabilish ==> establish commplete ==> complete Unkown ==> Unknown triggerd ==> triggered transtion ==> transition Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 8 ++++---- net/bluetooth/hci_event.c | 2 +- net/bluetooth/hci_sock.c | 6 +++--- net/bluetooth/mgmt.c | 2 +- net/bluetooth/smp.c | 6 +++--- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 88ec08978ff4..0ceb72d32208 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -758,7 +758,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) conn->state = BT_CLOSED; /* If the status indicates successful cancellation of - * the attempt (i.e. Unkown Connection Id) there's no point of + * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index fd12f1652bdf..aa214834c374 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -648,7 +648,7 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) */ /* If the controller supports Extended Scanner Filter - * Policies, enable the correspondig event. + * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising @@ -1454,7 +1454,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) } /* Check for valid public address or a configured static - * random adddress, but let the HCI setup proceed to + * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * @@ -3544,7 +3544,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; - /* If trying to estabilish one time connection to disabled + /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { @@ -4279,7 +4279,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } -/* Send HCI command and wait for command commplete event */ +/* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 016b2999f219..ea06b010ccad 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6032,7 +6032,7 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return true; } - /* Check if request ended in Command Status - no way to retreive + /* Check if request ended in Command Status - no way to retrieve * any extra parameters in this case. */ if (hdr->evt == HCI_EV_CMD_STATUS) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 251b9128f530..6ef98a887571 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1130,7 +1130,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against - * an unbound socket and with that triggerd an open + * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. */ @@ -1326,9 +1326,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been - * assigned, this socket will transtion from + * assigned, this socket will transition from * a raw socket into a control socket. To - * allow for a clean transtion, send the + * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9be7f9084d6..f290d0c54d32 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3341,7 +3341,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, } /* The name is stored in the scan response data and so - * no need to udpate the advertising data here. + * no need to update the advertising data here. */ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 372e3b25aaa4..93144e0c7efa 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -40,7 +40,7 @@ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) /* Low-level debug macros to be used for stuff that we don't want - * accidentially in dmesg, i.e. the values of the various crypto keys + * accidentally in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. */ #ifdef DEBUG @@ -560,7 +560,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) return err; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; @@ -1902,7 +1902,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; -- cgit v1.2.3-58-ga151 From fe6c0262bdf96623e4b088d4bd32a9454a37eb3a Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:55:08 +0800 Subject: rxrpc: Fix a typo targetted ==> targeted Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/rxrpc/local_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 3ce6d628cd75..19e929c7c38b 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -77,7 +77,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, } /* - * Process event packets targetted at a local endpoint. + * Process event packets targeted at a local endpoint. */ void rxrpc_process_local_events(struct rxrpc_local *local) { -- cgit v1.2.3-58-ga151 From 5debe0b30bac2b921722d7419f02acee6f02fa71 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:55:44 +0800 Subject: decnet: Fix spelling mistakes Fix some spelling mistakes in comments: thats ==> that's serivce ==> service varience ==> variance Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/decnet/dn_nsp_in.c | 2 +- net/decnet/dn_nsp_out.c | 2 +- net/decnet/dn_route.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1a12912b88d6..7ab788f41a3f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -870,7 +870,7 @@ int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) /* * Read out ack data here, this applies equally - * to data, other data, link serivce and both + * to data, other data, link service and both * ack data and ack otherdata. */ dn_process_ack(sk, skb, other); diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 00f2ed721ec1..eadc89583168 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -179,7 +179,7 @@ static void dn_nsp_rtt(struct sock *sk, long rtt) scp->nsp_srtt = 1; /* - * Add new rtt varience to smoothed varience + * Add new rtt variance to smoothed varience */ delta >>= 1; rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 32b1bed8ae51..729d3de6020d 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -604,7 +604,7 @@ drop_it: static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) { /* - * I know we drop the packet here, but thats considered success in + * I know we drop the packet here, but that's considered success in * this case */ kfree_skb(skb); -- cgit v1.2.3-58-ga151 From 4677efc486e1872f62d4632c50f7183f82296fa6 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 2 Jun 2021 15:17:19 +0300 Subject: devlink: Introduce rate object Allow registering rate object for devlink ports with dedicated devlink_rate_leaf_{create|destroy}() API. Implement new netlink DEVLINK_CMD_RATE_GET command that is used to retrieve rate object info. Add new DEVLINK_CMD_RATE_{NEW|DEL} commands that are used for notifications when creating/deleting leaf rate object. Rate API is intended to be used for rate limiting of individual devlink ports (leafs) and their aggregates (nodes). Example: $ devlink port show pci/0000:03:00.0/0 pci/0000:03:00.0/1 $ devlink port function rate show pci/0000:03:00.0/0: type leaf pci/0000:03:00.0/1: type leaf Co-developed-by: Vlad Buslov Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 14 +++ include/uapi/linux/devlink.h | 11 +++ net/core/devlink.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 253 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7c984cadfec4..2f5954d96c3e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -34,6 +34,7 @@ struct devlink_ops; struct devlink { struct list_head list; struct list_head port_list; + struct list_head rate_list; struct list_head sb_list; struct list_head dpipe_table_list; struct list_head resource_list; @@ -133,6 +134,15 @@ struct devlink_port_attrs { }; }; +struct devlink_rate { + struct list_head list; + enum devlink_rate_type type; + struct devlink *devlink; + void *priv; + + struct devlink_port *devlink_port; +}; + struct devlink_port { struct list_head list; struct list_head param_list; @@ -152,6 +162,8 @@ struct devlink_port { struct delayed_work type_warn_dw; struct list_head reporter_list; struct mutex reporters_lock; /* Protects reporter_list */ + + struct devlink_rate *devlink_rate; }; struct devlink_port_new_attrs { @@ -1512,6 +1524,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); +int devlink_rate_leaf_create(struct devlink_port *port, void *priv); +void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f6008b2fa60f..0c27b45c47db 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -126,6 +126,11 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_TEST, + DEVLINK_CMD_RATE_GET, /* can dump */ + DEVLINK_CMD_RATE_SET, + DEVLINK_CMD_RATE_NEW, + DEVLINK_CMD_RATE_DEL, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -206,6 +211,10 @@ enum devlink_port_flavour { */ }; +enum devlink_rate_type { + DEVLINK_RATE_TYPE_LEAF, +}; + enum devlink_param_cmode { DEVLINK_PARAM_CMODE_RUNTIME, DEVLINK_PARAM_CMODE_DRIVERINIT, @@ -534,6 +543,8 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION_STATS, /* nested */ DEVLINK_ATTR_PORT_PCI_SF_NUMBER, /* u32 */ + + DEVLINK_ATTR_RATE_TYPE, /* u16 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 69681f19388e..3b785f51156f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -190,6 +190,25 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, return devlink_port_get_from_attrs(devlink, info->attrs); } +static inline bool +devlink_rate_is_leaf(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; +} + +static struct devlink_rate * +devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct devlink_rate *devlink_rate; + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); + if (IS_ERR(devlink_port)) + return ERR_CAST(devlink_port); + devlink_rate = devlink_port->devlink_rate; + return devlink_rate ?: ERR_PTR(-ENODEV); +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -408,12 +427,13 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(2) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -442,6 +462,15 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_leaf_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = devlink_rate; } return 0; @@ -749,6 +778,39 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * return 0; } +static int devlink_nl_rate_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_rate *devlink_rate, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) + goto nla_put_failure; + + if (devlink_rate_is_leaf(devlink_rate)) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + devlink_rate->devlink_port->index)) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { @@ -920,6 +982,99 @@ static void devlink_port_notify(struct devlink_port *devlink_port, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_rate_notify(struct devlink_rate *devlink_rate, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_rate->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && + cmd != DEVLINK_CMD_RATE_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_rate_fill(msg, devlink, devlink_rate, + cmd, 0, 0, 0, NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_rate *devlink_rate; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err = 0; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; + + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_rate_fill(msg, devlink, + devlink_rate, + cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = devlink_rate->devlink; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_rate_fill(msg, devlink, devlink_rate, + DEVLINK_CMD_RATE_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; @@ -7802,6 +7957,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7827,6 +7983,13 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, + { + .cmd = DEVLINK_CMD_RATE_GET, + .doit = devlink_nl_cmd_rate_get_doit, + .dumpit = devlink_nl_cmd_rate_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + /* can be retrieved by unprivileged users */ + }, { .cmd = DEVLINK_CMD_PORT_SPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -8202,6 +8365,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); + INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -8304,6 +8468,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); @@ -8620,6 +8785,68 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +/** + * devlink_rate_leaf_create - create devlink rate leaf + * + * @devlink_port: devlink port object to create rate object on + * @priv: driver private data + * + * Create devlink rate object of type leaf on provided @devlink_port. + * Throws call trace if @devlink_port already has a devlink rate object. + * + * Context: Takes and release devlink->lock . + * + * Return: -ENOMEM if failed to allocate rate object, 0 otherwise. + */ +int +devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +{ + struct devlink *devlink = devlink_port->devlink; + struct devlink_rate *devlink_rate; + + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + if (!devlink_rate) + return -ENOMEM; + + mutex_lock(&devlink->lock); + WARN_ON(devlink_port->devlink_rate); + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; + devlink_rate->devlink = devlink; + devlink_rate->devlink_port = devlink_port; + devlink_rate->priv = priv; + list_add_tail(&devlink_rate->list, &devlink->rate_list); + devlink_port->devlink_rate = devlink_rate; + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + mutex_unlock(&devlink->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_create); + +/** + * devlink_rate_leaf_destroy - destroy devlink rate leaf + * + * @devlink_port: devlink port linked to the rate object + * + * Context: Takes and release devlink->lock . + */ +void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) +{ + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; + struct devlink *devlink = devlink_port->devlink; + + if (!devlink_rate) + return; + + mutex_lock(&devlink->lock); + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + mutex_unlock(&devlink->lock); + kfree(devlink_rate); +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { -- cgit v1.2.3-58-ga151 From 1897db2ec3109eb1dd07b357c95c5e03d54e41b9 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 2 Jun 2021 15:17:22 +0300 Subject: devlink: Allow setting tx rate for devlink rate leaf objects Implement support for DEVLINK_CMD_RATE_SET command with new attributes DEVLINK_ATTR_RATE_TX_{SHARE|MAX} that are used to set devlink rate shared/max tx rate values. Extend devlink ops with new callbacks rate_leaf_tx_{share|max}_set() to allow supporting drivers to implement rate control through devlink. New attributes are optional. Driver implementations are allowed to support either or both of them. Shared rate example: $ devlink port function rate set netdevsim/netdevsim10/0 tx_share 10mbit $ devlink port function rate show netdevsim/netdevsim10/0 netdevsim/netdevsim10/0: type leaf tx_share 10mbit Max rate example: $ devlink port function rate set netdevsim/netdevsim10/0 tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/0 netdevsim/netdevsim10/0: type leaf tx_max 100mbit Co-developed-by: Vlad Buslov Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 10 ++++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 2f5954d96c3e..46d553545f83 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -139,6 +139,8 @@ struct devlink_rate { enum devlink_rate_type type; struct devlink *devlink; void *priv; + u64 tx_share; + u64 tx_max; struct devlink_port *devlink_port; }; @@ -1465,6 +1467,14 @@ struct devlink_ops { struct devlink_port *port, enum devlink_port_fn_state state, struct netlink_ext_ack *extack); + + /** + * Rate control callbacks. + */ + int (*rate_leaf_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 0c27b45c47db..ae94cd2a1078 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -545,6 +545,8 @@ enum devlink_attr { DEVLINK_ATTR_PORT_PCI_SF_NUMBER, /* u32 */ DEVLINK_ATTR_RATE_TYPE, /* u16 */ + DEVLINK_ATTR_RATE_TX_SHARE, /* u64 */ + DEVLINK_ATTR_RATE_TX_MAX, /* u64 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 3b785f51156f..37839fd5ca73 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -803,6 +803,14 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, + devlink_rate->tx_share, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, + devlink_rate->tx_max, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -1495,6 +1503,76 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, return devlink->ops->port_del(devlink, port_index, extack); } +static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, + const struct devlink_ops *ops, + struct genl_info *info) +{ + struct nlattr **attrs = info->attrs; + u64 rate; + int err; + + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); + err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_share = rate; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); + err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_max = rate; + } + + return 0; +} + +static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, + struct genl_info *info, + enum devlink_rate_type type) +{ + struct nlattr **attrs = info->attrs; + + if (type == DEVLINK_RATE_TYPE_LEAF) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); + return false; + } + } else { + WARN_ON("Unknown type of rate object"); + return false; + } + + return true; +} + +static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) + return -EOPNOTSUPP; + + err = devlink_nl_rate_set(devlink_rate, ops, info); + + if (!err) + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + return err; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -7958,6 +8036,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7990,6 +8070,12 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_RATE_SET, + .doit = devlink_nl_cmd_rate_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + }, { .cmd = DEVLINK_CMD_PORT_SPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -- cgit v1.2.3-58-ga151 From a8ecb93ef03de4c59fb6289f99bc9616a852c917 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 2 Jun 2021 15:17:25 +0300 Subject: devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 14 ++- include/uapi/linux/devlink.h | 3 + net/core/devlink.c | 238 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 247 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 46d553545f83..13162b579124 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -142,7 +142,10 @@ struct devlink_rate { u64 tx_share; u64 tx_max; - struct devlink_port *devlink_port; + union { + struct devlink_port *devlink_port; + char *name; + }; }; struct devlink_port { @@ -1475,6 +1478,14 @@ struct devlink_ops { u64 tx_share, struct netlink_ext_ack *extack); int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); + int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, + struct netlink_ext_ack *extack); + int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) @@ -1536,6 +1547,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, bool external); int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); +void devlink_rate_nodes_destroy(struct devlink *devlink); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ae94cd2a1078..7e15853b77fe 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -213,6 +213,7 @@ enum devlink_port_flavour { enum devlink_rate_type { DEVLINK_RATE_TYPE_LEAF, + DEVLINK_RATE_TYPE_NODE, }; enum devlink_param_cmode { @@ -547,6 +548,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TYPE, /* u16 */ DEVLINK_ATTR_RATE_TX_SHARE, /* u64 */ DEVLINK_ATTR_RATE_TX_MAX, /* u64 */ + DEVLINK_ATTR_RATE_NODE_NAME, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 37839fd5ca73..589d750b70e4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -196,6 +196,12 @@ devlink_rate_is_leaf(struct devlink_rate *devlink_rate) return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } +static inline bool +devlink_rate_is_node(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; +} + static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { @@ -209,6 +215,55 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) return devlink_rate ?: ERR_PTR(-ENODEV); } +static struct devlink_rate * +devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) +{ + static struct devlink_rate *devlink_rate; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate) && + !strcmp(node_name, devlink_rate->name)) + return devlink_rate; + } + return ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + const char *rate_node_name; + size_t len; + + if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return ERR_PTR(-EINVAL); + rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); + len = strlen(rate_node_name); + /* Name cannot be empty or decimal number */ + if (!len || strspn(rate_node_name, "0123456789") == len) + return ERR_PTR(-EINVAL); + + return devlink_rate_node_get_by_name(devlink, rate_node_name); +} + +static struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_rate_node_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct nlattr **attrs = info->attrs; + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) + return devlink_rate_leaf_get_from_info(devlink, info); + else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return devlink_rate_node_get_from_info(devlink, info); + else + return ERR_PTR(-EINVAL); +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -428,12 +483,13 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -465,12 +521,21 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { struct devlink_rate *devlink_rate; - devlink_rate = devlink_rate_leaf_get_from_info(devlink, info); + devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) { err = PTR_ERR(devlink_rate); goto unlock; } info->user_ptr[1] = devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; } return 0; @@ -801,6 +866,10 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; + } else if (devlink_rate_is_node(devlink_rate)) { + if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, + devlink_rate->name)) + goto nla_put_failure; } if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, @@ -1508,13 +1577,17 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, struct genl_info *info) { struct nlattr **attrs = info->attrs; + int err = -EOPNOTSUPP; u64 rate; - int err; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); - err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, - rate, info->extack); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; @@ -1522,8 +1595,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); - err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, - rate, info->extack); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; @@ -1547,6 +1624,15 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); return false; } + } else if (type == DEVLINK_RATE_TYPE_NODE) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); + return false; + } } else { WARN_ON("Unknown type of rate object"); return false; @@ -1573,6 +1659,78 @@ static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, return err; } +static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; + const struct devlink_ops *ops; + int err; + + ops = devlink->ops; + if (!ops || !ops->rate_node_new || !ops->rate_node_del) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported"); + return -EOPNOTSUPP; + } + + if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) + return -EOPNOTSUPP; + + rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); + if (!IS_ERR(rate_node)) + return -EEXIST; + else if (rate_node == ERR_PTR(-EINVAL)) + return -EINVAL; + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return -ENOMEM; + + rate_node->devlink = devlink; + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); + if (!rate_node->name) { + err = -ENOMEM; + goto err_strdup; + } + + err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); + if (err) + goto err_node_new; + + err = devlink_nl_rate_set(rate_node, ops, info); + if (err) + goto err_rate_set; + + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return 0; + +err_rate_set: + ops->rate_node_del(rate_node, rate_node->priv, info->extack); +err_node_new: + kfree(rate_node->name); +err_strdup: + kfree(rate_node); + return err; +} + +static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *rate_node = info->user_ptr[1]; + struct devlink *devlink = rate_node->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + list_del(&rate_node->list); + kfree(rate_node->name); + kfree(rate_node); + return err; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -2441,6 +2599,30 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + struct devlink_rate *devlink_rate; + u16 old_mode; + int err; + + if (!devlink->ops->eswitch_mode_get) + return -EOPNOTSUPP; + err = devlink->ops->eswitch_mode_get(devlink, &old_mode); + if (err) + return err; + + if (old_mode == mode) + return 0; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) + if (devlink_rate_is_node(devlink_rate)) { + NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); + return -EBUSY; + } + return 0; +} + static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) { @@ -2455,6 +2637,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = devlink_rate_nodes_check(devlink, mode, info->extack); + if (err) + return err; err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; @@ -8038,6 +8223,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -8076,6 +8262,17 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, }, + { + .cmd = DEVLINK_CMD_RATE_NEW, + .doit = devlink_nl_cmd_rate_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RATE_DEL, + .doit = devlink_nl_cmd_rate_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, + }, { .cmd = DEVLINK_CMD_PORT_SPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -8933,6 +9130,33 @@ void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); +/** + * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device + * + * @devlink: devlink instance + * + * Destroy all rate nodes on specified device + * + * Context: Takes and release devlink->lock . + */ +void devlink_rate_nodes_destroy(struct devlink *devlink) +{ + static struct devlink_rate *devlink_rate, *tmp; + const struct devlink_ops *ops = devlink->ops; + + mutex_lock(&devlink->lock); + list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate)) { + ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); + list_del(&devlink_rate->list); + kfree(devlink_rate->name); + kfree(devlink_rate); + } + } + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { -- cgit v1.2.3-58-ga151 From d7555984507822458b32a6405881038241d140be Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 2 Jun 2021 15:17:28 +0300 Subject: devlink: Allow setting parent node of rate objects Refactor DEVLINK_CMD_RATE_{GET|SET} command handlers to support setting a node as a parent for another rate object (leaf or node) by means of new attribute DEVLINK_ATTR_RATE_PARENT_NODE_NAME. Extend devlink ops with new callbacks rate_{leaf|node}_parent_set() to set node as a parent for rate object to allow supporting drivers to implement rate grouping through devlink. Driver implementations are allowed to support leafs or node children only. Invoking callback with NULL as parent should be threated by the driver as unset parent action. Extend rate object struct with reference counter to disallow deleting a node with any child pointing to it. User should unset parent for the child explicitly. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate add netdevsim/netdevsim10/group2 $ devlink port function rate set netdevsim/netdevsim10/group1 parent group2 $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node parent group2 $ devlink port function rate set netdevsim/netdevsim10/group1 noparent Co-developed-by: Vlad Buslov Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 14 ++++- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 125 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 137 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 13162b579124..eb045f1b5d1d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -142,9 +142,13 @@ struct devlink_rate { u64 tx_share; u64 tx_max; + struct devlink_rate *parent; union { struct devlink_port *devlink_port; - char *name; + struct { + char *name; + refcount_t refcnt; + }; }; }; @@ -1486,6 +1490,14 @@ struct devlink_ops { struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, struct netlink_ext_ack *extack); + int (*rate_leaf_parent_set)(struct devlink_rate *child, + struct devlink_rate *parent, + void *priv_child, void *priv_parent, + struct netlink_ext_ack *extack); + int (*rate_node_parent_set)(struct devlink_rate *child, + struct devlink_rate *parent, + void *priv_child, void *priv_parent, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 7e15853b77fe..32f53a0069d6 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -549,6 +549,7 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TX_SHARE, /* u64 */ DEVLINK_ATTR_RATE_TX_MAX, /* u64 */ DEVLINK_ATTR_RATE_NODE_NAME, /* string */ + DEVLINK_ATTR_RATE_PARENT_NODE_NAME, /* string */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 589d750b70e4..464f56408247 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -880,6 +880,11 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->tx_max, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (devlink_rate->parent) + if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, + devlink_rate->parent->name)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -1152,6 +1157,18 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static bool +devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, + struct devlink_rate *parent) +{ + while (parent) { + if (parent == devlink_rate) + return true; + parent = parent->parent; + } + return false; +} + static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; @@ -1572,11 +1589,75 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, return devlink->ops->port_del(devlink, port_index, extack); } +static int +devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, + struct genl_info *info, + struct nlattr *nla_parent) +{ + struct devlink *devlink = devlink_rate->devlink; + const char *parent_name = nla_data(nla_parent); + const struct devlink_ops *ops = devlink->ops; + size_t len = strlen(parent_name); + struct devlink_rate *parent; + int err = -EOPNOTSUPP; + + parent = devlink_rate->parent; + if (parent && len) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent."); + return -EBUSY; + } else if (parent && !len) { + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + if (err) + return err; + + refcount_dec(&parent->refcnt); + devlink_rate->parent = NULL; + } else if (!parent && len) { + parent = devlink_rate_node_get_by_name(devlink, parent_name); + if (IS_ERR(parent)) + return -ENODEV; + + if (parent == devlink_rate) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed"); + return -EINVAL; + } + + if (devlink_rate_is_node(devlink_rate) && + devlink_rate_is_parent_node(devlink_rate, parent->parent)) { + NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node."); + return -EEXIST; + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + if (err) + return err; + + refcount_inc(&parent->refcnt); + devlink_rate->parent = parent; + } + + return 0; +} + static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { - struct nlattr **attrs = info->attrs; + struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u64 rate; @@ -1606,6 +1687,14 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_max = rate; } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; + if (nla_parent) { + err = devlink_nl_rate_parent_node_set(devlink_rate, info, + nla_parent); + if (err) + return err; + } + return 0; } @@ -1624,6 +1713,11 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_leaf_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -1633,6 +1727,11 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_node_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); + return false; + } } else { WARN_ON("Unknown type of rate object"); return false; @@ -1702,6 +1801,7 @@ static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, if (err) goto err_rate_set; + refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; @@ -1723,8 +1823,15 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, const struct devlink_ops *ops = devlink->ops; int err; + if (refcount_read(&rate_node->refcnt) > 1) { + NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node."); + return -EBUSY; + } + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + if (rate_node->parent) + refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); @@ -8224,6 +8331,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -9135,7 +9243,8 @@ EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); * * @devlink: devlink instance * - * Destroy all rate nodes on specified device + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. * * Context: Takes and release devlink->lock . */ @@ -9145,6 +9254,18 @@ void devlink_rate_nodes_destroy(struct devlink *devlink) const struct devlink_ops *ops = devlink->ops; mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (!devlink_rate->parent) + continue; + + refcount_dec(&devlink_rate->parent->refcnt); + if (devlink_rate_is_leaf(devlink_rate)) + ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + else if (devlink_rate_is_node(devlink_rate)) + ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); -- cgit v1.2.3-58-ga151 From 490dcecabbf93e705006af498fa6815251404a54 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 May 2021 10:18:25 -0700 Subject: mlx5: count all link events mlx5 devices were observed generating MLX5_PORT_CHANGE_SUBTYPE_ACTIVE events without an intervening MLX5_PORT_CHANGE_SUBTYPE_DOWN. This breaks link flap detection based on Linux carrier state transition count as netif_carrier_on() does nothing if carrier is already on. Make sure we count such events. netif_carrier_event() increments the counters and fires the linkwatch events. The latter is not necessary for the use case but seems like the right thing to do. Signed-off-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +++++- include/linux/netdevice.h | 2 +- net/sched/sch_generic.c | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ad0f69480b9c..e36d0c6a08db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -91,12 +91,16 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u8 port_state; + bool up; port_state = mlx5_query_vport_state(mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0); - if (port_state == VPORT_STATE_UP) { + up = port_state == VPORT_STATE_UP; + if (up == netif_carrier_ok(priv->netdev)) + netif_carrier_event(priv->netdev); + if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); } else { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5cbc950b34df..be1dcceda5e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4187,8 +4187,8 @@ unsigned long dev_trans_start(struct net_device *dev); void __netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); - void netif_carrier_off(struct net_device *dev); +void netif_carrier_event(struct net_device *dev); /** * netif_dormant_on - mark device as dormant. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc8b56bcabf3..e9c0afc8becc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -540,6 +540,24 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); +/** + * netif_carrier_event - report carrier state event + * @dev: network device + * + * Device has detected a carrier event but the carrier state wasn't changed. + * Use in drivers when querying carrier state asynchronously, to avoid missing + * events (link flaps) if link recovers before it's queried. + */ +void netif_carrier_event(struct net_device *dev) +{ + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_up_count); + atomic_inc(&dev->carrier_down_count); + linkwatch_fire_event(dev); +} +EXPORT_SYMBOL_GPL(netif_carrier_event); + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. -- cgit v1.2.3-58-ga151 From d467d0bc7ab8062197158658c456e1f2f6c3fcf1 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:56:23 +0800 Subject: rtnetlink: Fix spelling mistakes Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04b4f0f2a3d2..dbf59b2d5700 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -9,7 +9,7 @@ * Authors: Alexey Kuznetsov, * * Fixes: - * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include @@ -234,7 +234,7 @@ unlock: * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. */ @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module); * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the @@ -2567,7 +2567,7 @@ static int do_set_proto_down(struct net_device *dev, if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); - /* Dont turn off protodown if there are active reasons */ + /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; -- cgit v1.2.3-58-ga151 From dd0d91b9139899ba2546290ab282767600e0f358 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 14:56:35 +0800 Subject: libceph: Fix spelling mistakes Fix some spelling mistakes in comments: enconding ==> encoding ambigous ==> ambiguous orignal ==> original encyption ==> encryption Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ceph/auth_x_protocol.h | 2 +- net/ceph/mon_client.c | 2 +- net/ceph/osdmap.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 792fcb974dc3..9c60feeb1bcb 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -87,7 +87,7 @@ struct ceph_x_authorize_reply { /* - * encyption bundle + * encryption bundle */ #define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 195ceb8afb06..013cbdb6cfe2 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1508,7 +1508,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, return get_generic_reply(con, hdr, skip); /* - * Older OSDs don't set reply tid even if the orignal + * Older OSDs don't set reply tid even if the original * request had a non-zero tid. Work around this weirdness * by allocating a new message. */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c959320c4775..75b738083523 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1309,7 +1309,7 @@ static int get_osdmap_client_data_v(void **p, void *end, return -EINVAL; } - /* old osdmap enconding */ + /* old osdmap encoding */ struct_v = 0; } @@ -3010,7 +3010,7 @@ static bool is_valid_crush_name(const char *name) * parent, returns 0. * * Does a linear search, as there are no parent pointers of any - * kind. Note that the result is ambigous for items that occur + * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, -- cgit v1.2.3-58-ga151 From f8e0a68babae3f612799178c718ec5358eac41cf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 2 Jun 2021 10:56:25 +0200 Subject: net/smc: avoid possible duplicate dmb unregistration smc_lgr_cleanup() calls smcd_unregister_all_dmbs() as part of the link group termination process. This is a leftover from the times when smc_lgr_cleanup() scheduled a worker to actually free the link group. Nowadays smc_lgr_cleanup() directly calls smc_lgr_free() without any delay so an earlier dmb unregistration is no longer needed. So remove smcd_unregister_all_dmbs() and the call to it. Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0df85a12651e..317bc2c90fab 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1235,20 +1235,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } -static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) -{ - int i; - - for (i = 0; i < SMC_RMBE_SIZES; i++) { - struct smc_buf_desc *buf_desc; - - list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { - buf_desc->len += sizeof(struct smcd_cdc_msg); - smc_ism_unregister_dmb(lgr->smcd, buf_desc); - } - } -} - static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); @@ -1285,7 +1271,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); - smcd_unregister_all_dmbs(lgr); } else { u32 rsn = lgr->llc_termination_rsn; -- cgit v1.2.3-58-ga151 From 5e4a43ceb22a6fd2d372fde923a6a95ef6728fd7 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 2 Jun 2021 10:56:26 +0200 Subject: net/smc: no need to flush smcd_dev's event_wq before destroying it destroy_workqueue() already calls drain_workqueue(), which is a stronger variant of flush_workqueue(). Signed-off-by: Julian Wiedmann Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_ism.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 967712ba52a0..9cb2df289963 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -470,7 +470,6 @@ void smcd_unregister_dev(struct smcd_dev *smcd) mutex_unlock(&smcd_dev_list.mutex); smcd->going_away = 1; smc_smcd_terminate_all(smcd); - flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); device_del(&smcd->dev); -- cgit v1.2.3-58-ga151 From 14623e005a1e74864afca1261a3aa8e6e8017df9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 2 Jun 2021 13:44:24 -0400 Subject: tipc: eliminate redundant fields in struct tipc_sock We eliminate the redundant fields conn_type and conn_instance in struct tipc_sock. On the connecting side, this information is already present in the unused (after the connection is established) part of the pre-allocated header, and on the accepting side, we put it there when the new socket is created. Reviewed-by: Xin Long Tested-by: Hoang Le Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 53af72824c9c..cb2d9fffbc5d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -73,9 +73,6 @@ struct sockaddr_pair { /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table @@ -106,11 +103,11 @@ struct sockaddr_pair { * @expect_ack: whether this TIPC socket is expecting an ack * @nodelay: setsockopt() TIPC_NODELAY setting * @group_is_open: TIPC socket group is fully open (FIXME) + * @published: true if port has one or more associated names + * @conn_addrtype: address type used when establishing connection */ struct tipc_sock { struct sock sk; - u32 conn_type; - u32 conn_instance; u32 max_pkt; u32 maxnagle; u32 portid; @@ -141,6 +138,7 @@ struct tipc_sock { bool nodelay; bool group_is_open; bool published; + u8 conn_addrtype; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -1463,10 +1461,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (atype == TIPC_SERVICE_ADDR) { - tsk->conn_type = ua->sa.type; - tsk->conn_instance = ua->sa.instance; - } + if (atype == TIPC_SERVICE_ADDR) + tsk->conn_addrtype = atype; msg_set_syn(hdr, 1); } @@ -1783,10 +1779,10 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, anc_data[2] = msg_nameupper(msg); break; case TIPC_CONN_MSG: - has_name = (tsk->conn_type != 0); - anc_data[0] = tsk->conn_type; - anc_data[1] = tsk->conn_instance; - anc_data[2] = tsk->conn_instance; + has_name = !!tsk->conn_addrtype; + anc_data[0] = msg_nametype(&tsk->phdr); + anc_data[1] = msg_nameinst(&tsk->phdr); + anc_data[2] = anc_data[1]; break; default: has_name = 0; @@ -2750,8 +2746,9 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { - new_tsock->conn_type = msg_nametype(msg); - new_tsock->conn_instance = msg_nameinst(msg); + new_tsock->conn_addrtype = TIPC_SERVICE_ADDR; + msg_set_nametype(&new_tsock->phdr, msg_nametype(msg)); + msg_set_nameinst(&new_tsock->phdr, msg_nameinst(msg)); } /* @@ -3455,13 +3452,14 @@ void tipc_socket_stop(void) /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) { - u32 peer_node; - u32 peer_port; + u32 peer_node, peer_port; + u32 conn_type, conn_instance; struct nlattr *nest; peer_node = tsk_peer_node(tsk); peer_port = tsk_peer_port(tsk); - + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_CON); if (!nest) return -EMSGSIZE; @@ -3471,12 +3469,12 @@ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port)) goto msg_full; - if (tsk->conn_type != 0) { + if (tsk->conn_addrtype != 0) { if (nla_put_flag(skb, TIPC_NLA_CON_FLAG)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type)) + if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, conn_type)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance)) + if (nla_put_u32(skb, TIPC_NLA_CON_INST, conn_instance)) goto msg_full; } nla_nest_end(skb, nest); @@ -3866,9 +3864,9 @@ bool tipc_sk_filtering(struct sock *sk) } if (!tipc_sk_type_connectionless(sk)) { - type = tsk->conn_type; - lower = tsk->conn_instance; - upper = tsk->conn_instance; + type = msg_nametype(&tsk->phdr); + lower = msg_nameinst(&tsk->phdr); + upper = lower; } if ((_type && _type != type) || (_lower && _lower != lower) || @@ -3933,6 +3931,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) { int i = 0; size_t sz = (dqueues) ? SK_LMAX : SK_LMIN; + u32 conn_type, conn_instance; struct tipc_sock *tsk; struct publication *p; bool tsk_connected; @@ -3953,8 +3952,10 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) if (tsk_connected) { i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk)); i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk)); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_type); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_instance); + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); + i += scnprintf(buf + i, sz - i, " %u", conn_type); + i += scnprintf(buf + i, sz - i, " %u", conn_instance); } i += scnprintf(buf + i, sz - i, " | %u", tsk->published); if (tsk->published) { -- cgit v1.2.3-58-ga151 From 62633c2f17f1f0e6dd6932f990ade9525204ea24 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 2 Jun 2021 13:44:25 -0400 Subject: tipc: refactor function tipc_sk_anc_data_recv() We refactor tipc_sk_anc_data_recv() to make it slightly more comprehensible, but also to facilitate application of some additions to the code in a future commit. Reviewed-by: Xin Long Tested-by: Hoang Le Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 85 +++++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index cb2d9fffbc5d..c635fd27fb38 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1733,67 +1733,58 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { - struct tipc_msg *msg; - u32 anc_data[3]; - u32 err; - u32 dest_type; - int has_name; - int res; + struct tipc_msg *hdr; + u32 data[3] = {0,}; + bool has_addr; + int dlen, rc; if (likely(m->msg_controllen == 0)) return 0; - msg = buf_msg(skb); - /* Optionally capture errored message object(s) */ - err = msg ? msg_errcode(msg) : 0; - if (unlikely(err)) { - anc_data[0] = err; - anc_data[1] = msg_data_sz(msg); - res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data); - if (res) - return res; - if (anc_data[1]) { - if (skb_linearize(skb)) - return -ENOMEM; - msg = buf_msg(skb); - res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], - msg_data(msg)); - if (res) - return res; - } + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + + /* Capture errored message object, if any */ + if (msg_errcode(hdr)) { + if (skb_linearize(skb)) + return -ENOMEM; + hdr = buf_msg(skb); + data[0] = msg_errcode(hdr); + data[1] = dlen; + rc = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, data); + if (rc || !dlen) + return rc; + rc = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, dlen, msg_data(hdr)); + if (rc) + return rc; } - /* Optionally capture message destination object */ - dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; - switch (dest_type) { + /* Capture TIPC_SERVICE_ADDR/RANGE destination address, if any */ + switch (msg_type(hdr)) { case TIPC_NAMED_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_namelower(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = data[1]; break; case TIPC_MCAST_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_nameupper(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = msg_nameupper(hdr); break; case TIPC_CONN_MSG: - has_name = !!tsk->conn_addrtype; - anc_data[0] = msg_nametype(&tsk->phdr); - anc_data[1] = msg_nameinst(&tsk->phdr); - anc_data[2] = anc_data[1]; + has_addr = !!tsk->conn_addrtype; + data[0] = msg_nametype(&tsk->phdr); + data[1] = msg_nameinst(&tsk->phdr); + data[2] = data[1]; break; default: - has_name = 0; - } - if (has_name) { - res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data); - if (res) - return res; + has_addr = false; } - - return 0; + if (!has_addr) + return 0; + return put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, data); } static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk) -- cgit v1.2.3-58-ga151 From 5ef213258ddf38fc8b6de5d7aea3d514ff13f71a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 2 Jun 2021 13:44:26 -0400 Subject: tipc: simplify handling of lookup scope during multicast message reception We introduce a new macro TIPC_ANY_SCOPE to make the handling of the lookup scope value more comprehensible during multicast reception. The (unchanged) rules go as follows: 1) Multicast messages sent from own node are delivered to all matching sockets on the own node, irrespective of their binding scope. 2) Multicast messages sent from other nodes arrive here because they have found TIPC_CLUSTER_SCOPE bindings emanating from this node. Those messages should be delivered to exactly those sockets, but not to local sockets bound with TIPC_NODE_SCOPE, since the latter obviously were not meant to be visible for those senders. 3) Group multicast/broadcast messages are delivered to the sockets with a binding scope matching exactly the lookup scope indicated in the message header, and nobody else. Reviewed-by: Xin Long Tested-by: Hoang Le Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 6 +++--- net/tipc/name_table.h | 4 +++- net/tipc/socket.c | 26 ++++++++++---------------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index fecab516bf41..01396dd1c899 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -673,12 +673,12 @@ exit: * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports) + struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; - u32 scope = ua->scope; + u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); @@ -688,7 +688,7 @@ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { - if (p->scope == scope || (!exact && p->scope < scope)) + if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index c7c9a3ddd420..259f95e3d99c 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -51,6 +51,8 @@ struct tipc_uaddr; #define TIPC_PUBL_SCOPE_NUM (TIPC_NODE_SCOPE + 1) #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ +#define TIPC_ANY_SCOPE 10 /* Both node and cluster scope will match */ + /** * struct publication - info about a published service address or range * @sr: service range represented by this publication @@ -113,7 +115,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk); void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports); + struct list_head *dports); void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes); bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index c635fd27fb38..575a0238deb2 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1200,12 +1200,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct tipc_msg *hdr; struct tipc_uaddr ua; int user, mtyp, hlen; - bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); ua.addrtype = TIPC_SERVICE_RANGE; + /* tipc_skb_peek() increments the head skb's reference counter */ skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { hdr = buf_msg(skb); @@ -1214,6 +1214,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, hlen = skb_headroom(skb) + msg_hdr_sz(hdr); onode = msg_orignode(hdr); ua.sr.type = msg_nametype(hdr); + ua.sr.lower = msg_namelower(hdr); + ua.sr.upper = msg_nameupper(hdr); + if (onode == self) + ua.scope = TIPC_ANY_SCOPE; + else + ua.scope = TIPC_CLUSTER_SCOPE; if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); @@ -1231,20 +1237,10 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, ua.sr.lower = 0; ua.sr.upper = ~0; ua.scope = msg_lookup_scope(hdr); - exact = true; - } else { - /* TIPC_NODE_SCOPE means "any scope" in this context */ - if (onode == self) - ua.scope = TIPC_NODE_SCOPE; - else - ua.scope = TIPC_CLUSTER_SCOPE; - exact = false; - ua.sr.lower = msg_namelower(hdr); - ua.sr.upper = msg_nameupper(hdr); } /* Create destination port list: */ - tipc_nametbl_lookup_mcast_sockets(net, &ua, exact, &dports); + tipc_nametbl_lookup_mcast_sockets(net, &ua, &dports); /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { @@ -1256,13 +1252,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } pr_warn("Failed to clone mcast rcv buffer\n"); } - /* Append to inputq if not already done by other thread */ + /* Append clones to inputq only if skb is still head of arrvq */ spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { skb_queue_splice_tail_init(&tmpq, inputq); - /* Decrease the skb's refcnt as increasing in the - * function tipc_skb_peek - */ + /* Decrement the skb's refcnt */ kfree_skb(__skb_dequeue(arrvq)); } spin_unlock_bh(&inputq->lock); -- cgit v1.2.3-58-ga151 From 9c153d3889767eb347a9b1719cc6f336faccdba9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Jun 2021 13:27:41 -0700 Subject: net: vlan: Avoid using strncpy() Use strscpy_pad() instead of strncpy() which is considered deprecated: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 ++- net/8021q/vlan.h | 3 ++- net/8021q/vlan_dev.c | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fb3d3262dc1a..4cdf8416869d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -638,7 +638,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case GET_VLAN_REALDEV_NAME_CMD: err = 0; - vlan_dev_get_realdev_name(dev, args.u.device2); + vlan_dev_get_realdev_name(dev, args.u.device2, + sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index fa3ad3d4d58c..e3f6ff05a528 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -129,7 +129,8 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev, int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, + size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4db3f0621959..a0367b37512d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -239,9 +239,9 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) return 0; } -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { - strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23); + strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, @@ -360,7 +360,7 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct ifreq ifrr; int err = -EOPNOTSUPP; - strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); + strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { -- cgit v1.2.3-58-ga151 From a29cb6914681a55667436a9eb7a42e28da8cf387 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 Jun 2021 17:51:21 -0700 Subject: net: tcp better handling of reordering then loss cases This patch aims to improve the situation when reordering and loss are ocurring in the same flight of packets. Previously the reordering would first induce a spurious recovery, then the subsequent ACK may undo the cwnd (based on the timestamps e.g.). However the current loss recovery does not proceed to invoke RACK to install a reordering timer. If some packets are also lost, this may lead to a long RTO-based recovery. An example is https://groups.google.com/g/bbr-dev/c/OFHADvJbTEI The solution is to after reverting the recovery, always invoke RACK to either mount the RACK timer to fast retransmit after the reordering window, or restarts the recovery if new loss is identified. Hence it is possible the sender may go from Recovery to Disorder/Open to Recovery again in one ACK. Reported-by: mingkun bian Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cd52ce0a2a85..7d5e59f688de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2816,8 +2816,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, + bool *do_lost) { struct tcp_sock *tp = tcp_sk(sk); @@ -2842,7 +2851,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - return true; + } else { + /* Partial ACK arrived. Force fast retransmit. */ + *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -2866,14 +2877,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2943,17 +2946,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else { - if (tcp_try_undo_partial(sk, prior_snd_una)) - return; - /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_force_fast_retransmit(sk); - } - if (tcp_try_undo_dsack(sk)) { - tcp_try_keep_open(sk); + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; - } + + if (tcp_try_undo_dsack(sk)) + tcp_try_keep_open(sk); + tcp_identify_packet_loss(sk, ack_flag); + if (icsk->icsk_ca_state != TCP_CA_Recovery) { + if (!tcp_time_to_recover(sk, flag)) + return; + /* Undo reverts the recovery state. If loss is evident, + * starts a new recovery (e.g. reordering then loss); + */ + tcp_enter_recovery(sk, ece_ack); + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); -- cgit v1.2.3-58-ga151 From a10541f5d9fa2aab5ff54311473b05ba75b84226 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 3 Jun 2021 22:07:49 +0800 Subject: sch_htb: fix doc warning in htb_add_to_id_tree() Add description for parameters of htb_add_to_id_tree() to fix gcc W=1 warnings: net/sched/sch_htb.c:282: warning: Function parameter or member 'root' not described in 'htb_add_to_id_tree' net/sched/sch_htb.c:282: warning: Function parameter or member 'cl' not described in 'htb_add_to_id_tree' net/sched/sch_htb.c:282: warning: Function parameter or member 'prio' not described in 'htb_add_to_id_tree' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 282614614905..4f9304567dcc 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -273,6 +273,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, /** * htb_add_to_id_tree - adds class to the round robin list + * @root: the root of the tree + * @cl: the class to add + * @prio: the give prio in class * * Routine adds class to the list (actually tree) sorted by classid. * Make sure that class is not already on such list for given prio. -- cgit v1.2.3-58-ga151 From e32ea44c7ae476f4c90e35ab0a29dc8ff082bc11 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Thu, 3 Jun 2021 16:22:11 -0500 Subject: icmp: fix lib conflict with trinity Including and in the dependencies breaks compilation of trinity due to multiple definitions. is only used in to provide the definition of the struct in_addr, but this can be substituted out by using the datatype __be32. Signed-off-by: Andreas Roeseler Signed-off-by: David S. Miller --- include/uapi/linux/icmp.h | 3 +-- net/ipv4/icmp.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index c1da8244c5e1..163c0998aec9 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -154,7 +153,7 @@ struct icmp_ext_echo_iio { struct { struct icmp_ext_echo_ctype3_hdr ctype3_hdr; union { - struct in_addr ipv4_addr; + __be32 ipv4_addr; struct in6_addr ipv6_addr; } ip_addr; } addr; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7b6931a4d775..2e09d62d59e3 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1059,7 +1059,7 @@ static bool icmp_echo(struct sk_buff *skb) if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + sizeof(struct in_addr)) goto send_mal_query; - dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: -- cgit v1.2.3-58-ga151 From 371087aa476ab0ac0072303ac94a3bba2d7b0a1d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:27 -0700 Subject: sock: expose so_timestamp options for mptcp This exports SO_TIMESTAMP_* function for re-use by MPTCP. Without this there is too much copy & paste needed to support this from mptcp setsockopt path. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 0e962d8bc73b..7e0116b1a73f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2743,6 +2743,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); +void sock_set_timestamp(struct sock *sk, int optname, bool valbool); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 958614ea16ed..5b85dd37b562 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -776,6 +776,24 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_timestamp(struct sock *sk, int optname, bool valbool) +{ + switch (optname) { + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + } +} + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -989,16 +1007,10 @@ set_sndbuf: break; case SO_TIMESTAMP_OLD: - __sock_set_timestamps(sk, valbool, false, false); - break; case SO_TIMESTAMP_NEW: - __sock_set_timestamps(sk, valbool, true, false); - break; case SO_TIMESTAMPNS_OLD: - __sock_set_timestamps(sk, valbool, false, true); - break; case SO_TIMESTAMPNS_NEW: - __sock_set_timestamps(sk, valbool, true, true); + sock_set_timestamp(sk, valbool, optname); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: -- cgit v1.2.3-58-ga151 From ced122d90f52eb6ff37272e32941845d46ac64c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:28 -0700 Subject: sock: expose so_timestamping options for mptcp Similar to previous patch: expose SO_TIMESTAMPING helper so we do not have to copy & paste this into the mptcp core. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 71 +++++++++++++++++++++++++++--------------------------- 2 files changed, 38 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 7e0116b1a73f..9b341c2c924f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2744,6 +2744,8 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); +int sock_set_timestamping(struct sock *sk, int optname, int val); + void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 5b85dd37b562..bd887cb075ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -794,6 +794,40 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool) } } +int sock_set_timestamping(struct sock *sk, int optname, int val) +{ + if (val & ~SOF_TIMESTAMPING_MASK) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) + return -EINVAL; + + sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + return 0; +} + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -1012,43 +1046,10 @@ set_sndbuf: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, valbool, optname); break; + case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: - if (val & ~SOF_TIMESTAMPING_MASK) { - ret = -EINVAL; - break; - } - - if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { - if ((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN)) { - ret = -EINVAL; - break; - } - sk->sk_tskey = tcp_sk(sk)->snd_una; - } else { - sk->sk_tskey = 0; - } - } - - if (val & SOF_TIMESTAMPING_OPT_STATS && - !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { - ret = -EINVAL; - break; - } - - sk->sk_tsflags = val; - sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); - - if (val & SOF_TIMESTAMPING_RX_SOFTWARE) - sock_enable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); - else - sock_disable_timestamp(sk, - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + ret = sock_set_timestamping(sk, optname, val); break; case SO_RCVLOWAT: -- cgit v1.2.3-58-ga151 From 9061f24bf82ec2e92dd1e7c10b98b680db023d31 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:29 -0700 Subject: mptcp: sockopt: propagate timestamp request to subflows This adds support for TIMESTAMP(NS) setsockopt. This doesn't make things work yet, because the mptcp receive path doesn't convert the skb timestamps to cmsgs for userspace consumption. receive path cmsg support is added ina followup patch. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a79798189599..3168ad4a9298 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -140,6 +140,43 @@ static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } +static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + sock_set_timestamp(sk, optname, !!val); + break; + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: + sock_set_timestamping(sk, optname, val); + break; + } + + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -164,6 +201,13 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; @@ -251,6 +295,12 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); -- cgit v1.2.3-58-ga151 From 7a009a70ff8adcba3b31dc8922a3671e5a8e1361 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:30 -0700 Subject: mptcp: setsockopt: handle SOL_SOCKET in one place only Move the pre-check to the function that handles all SOL_SOCKET values. At this point there is complete coverage for all values that were accepted by the pre-check. BUSYPOLL functions are accepted but will not have any functionality yet until its clear how the expected mptcp behaviour should look like. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 99 ++++++++++++++++------------------------------------- 1 file changed, 29 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 3168ad4a9298..092d1f635d27 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -304,6 +304,14 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + /* No need to copy: only relevant for msk */ + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: @@ -317,7 +325,24 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return 0; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); + /* SO_OOBINLINE is not supported, let's avoid the related mess + * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + * + * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + * + * SO_PEEK_OFF is unsupported, as it is for plain TCP + * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows + * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + * + * SO_ZEROCOPY is currently unsupported, TODO in sndmsg + * SO_TXTIME is currently unsupported + */ + + return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, @@ -349,72 +374,6 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, static bool mptcp_supported_sockopt(int level, int optname) { - if (level == SOL_SOCKET) { - switch (optname) { - case SO_DEBUG: - case SO_REUSEPORT: - case SO_REUSEADDR: - - /* the following ones need a better implementation, - * but are quite common we want to preserve them - */ - case SO_BINDTODEVICE: - case SO_SNDBUF: - case SO_SNDBUFFORCE: - case SO_RCVBUF: - case SO_RCVBUFFORCE: - case SO_KEEPALIVE: - case SO_PRIORITY: - case SO_LINGER: - case SO_TIMESTAMP_OLD: - case SO_TIMESTAMP_NEW: - case SO_TIMESTAMPNS_OLD: - case SO_TIMESTAMPNS_NEW: - case SO_TIMESTAMPING_OLD: - case SO_TIMESTAMPING_NEW: - case SO_RCVLOWAT: - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - case SO_MARK: - case SO_INCOMING_CPU: - case SO_BINDTOIFINDEX: - case SO_BUSY_POLL: - case SO_PREFER_BUSY_POLL: - case SO_BUSY_POLL_BUDGET: - - /* next ones are no-op for plain TCP */ - case SO_NO_CHECK: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_BSDCOMPAT: - case SO_PASSCRED: - case SO_PASSSEC: - case SO_RXQ_OVFL: - case SO_WIFI_STATUS: - case SO_NOFCS: - case SO_SELECT_ERR_QUEUE: - return true; - } - - /* SO_OOBINLINE is not supported, let's avoid the related mess */ - /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, - * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, - * we must be careful with subflows - */ - /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks - * explicitly the sk_protocol field - */ - /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ - /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ - /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, - * but likely needs careful design - */ - /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ - /* SO_TXTIME is currently unsupported */ - return false; - } if (level == SOL_IP) { switch (optname) { /* should work fine */ @@ -624,12 +583,12 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); - if (!mptcp_supported_sockopt(level, optname)) - return -ENOPROTOOPT; - if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow -- cgit v1.2.3-58-ga151 From 892bfd3ded0ef0f895ed6356d0f85e2009421747 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:31 -0700 Subject: tcp: export timestamp helpers for mptcp MPTCP is builtin, so no need to add EXPORT_SYMBOL()s. It will be used to support SO_TIMESTAMP(NS) ancillary messages in the mptcp receive path. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++++ net/ipv4/tcp.c | 10 ++++------ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d05193cb0d99..e668f1bf780d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -412,6 +412,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss); +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1c1f9e3de72..0e3f0e0e5b51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1738,8 +1738,8 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); @@ -2024,8 +2024,6 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss); static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) @@ -2197,8 +2195,8 @@ out: #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss) +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; -- cgit v1.2.3-58-ga151 From b7f653b297a4b2ed16a11883044077e6bf3e3481 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2021 16:24:32 -0700 Subject: mptcp: receive path cmsg support This adds support for SO_TIMESTAMP(NS). Timestamps are passed to userspace in the same way as for plain tcp sockets. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2bc199549a88..3897d35fd9df 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -39,10 +39,15 @@ struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; + u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +enum { + MPTCP_CMSG_TS = BIT(0), +}; + static struct percpu_counter mptcp_sockets_allocated; static void __mptcp_destroy_sock(struct sock *sk); @@ -272,6 +277,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; + bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); @@ -287,6 +293,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, goto drop; } + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -294,6 +302,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -1757,7 +1766,9 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len, int flags) + size_t len, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; @@ -1777,6 +1788,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } } + if (MPTCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= MPTCP_CMSG_TS; + } + copied += count; if (count < data_len) { @@ -1964,7 +1980,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - int copied = 0; + struct scm_timestamping_internal tss; + int copied = 0, cmsg_flags = 0; int target; long timeo; @@ -1986,7 +2003,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2067,6 +2084,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + if (cmsg_flags && copied >= 0) { + if (cmsg_flags & MPTCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + } + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); -- cgit v1.2.3-58-ga151 From 0efea3c649f0a50d473a4afe2d17c2bbcee639e1 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 4 Jun 2021 09:47:02 +0800 Subject: tipc: Return the correct errno code When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/tipc/link.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index c44b4bfaaee6..5b6181277cc5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -912,7 +912,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, dnode, l->addr, dport, 0, 0); if (!skb) - return -ENOBUFS; + return -ENOMEM; msg_set_dest_droppable(buf_msg(skb), true); TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); skb_queue_tail(&l->wakeupq, skb); @@ -1030,7 +1030,7 @@ void tipc_link_reset(struct tipc_link *l) * * Consumes the buffer chain. * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted - * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS or -ENOMEM */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) @@ -1088,7 +1088,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (!_skb) { kfree_skb(skb); __skb_queue_purge(list); - return -ENOBUFS; + return -ENOMEM; } __skb_queue_tail(transmq, skb); tipc_link_set_skb_retransmit_time(skb, l); -- cgit v1.2.3-58-ga151 From fcb34635854a5a5814227628867ea914a9805384 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 4 Jun 2021 12:37:47 +0200 Subject: net: bridge: mrp: Update ring transitions. According to the standard IEC 62439-2, the number of transitions needs to be counted for each transition 'between' ring state open and ring state closed and not from open state to closed state. Therefore fix this for both ring and interconnect ring. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index cd2b1e424e54..f7012b7d7ce4 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -627,8 +627,7 @@ int br_mrp_set_ring_state(struct net_bridge *br, if (!mrp) return -EINVAL; - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && - state->ring_state != BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state != state->ring_state) mrp->ring_transitions++; mrp->ring_state = state->ring_state; @@ -715,8 +714,7 @@ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) if (!mrp) return -EINVAL; - if (mrp->in_state == BR_MRP_IN_STATE_CLOSED && - state->in_state != BR_MRP_IN_STATE_CLOSED) + if (mrp->in_state != state->in_state) mrp->in_transitions++; mrp->in_state = state->in_state; -- cgit v1.2.3-58-ga151 From ef4b65e53cc77e2b3ca4667b461047ad04fb45fa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 31 May 2021 00:08:09 +0200 Subject: netfilter: nfnetlink: add struct nfgenmsg to struct nfnl_info and use it Update the nfnl_info structure to add a pointer to the nfnetlink header. This simplifies the existing codebase since this header is usually accessed. Update existing clients to use this new field. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + net/netfilter/nf_conntrack_netlink.c | 23 ++++++--------- net/netfilter/nf_tables_api.c | 55 +++++++++++++----------------------- net/netfilter/nfnetlink.c | 2 ++ net/netfilter/nfnetlink_log.c | 5 ++-- net/netfilter/nfnetlink_queue.c | 9 ++---- net/netfilter/nft_compat.c | 17 ++++------- 7 files changed, 42 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 515ce53aa20d..241e005f290a 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -11,6 +11,7 @@ struct nfnl_info { struct net *net; struct sock *sk; const struct nlmsghdr *nlh; + const struct nfgenmsg *nfmsg; struct netlink_ext_ack *extack; }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 220f51f055ab..4e1a9dba7077 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1528,7 +1528,7 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -1541,12 +1541,12 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - nfmsg->nfgen_family, &zone); + family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - nfmsg->nfgen_family, &zone); + family, &zone); else { - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, @@ -1586,8 +1586,7 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -2363,10 +2362,9 @@ static int ctnetlink_new_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; @@ -3259,8 +3257,7 @@ static int ctnetlink_get_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; @@ -3349,8 +3346,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -3601,8 +3597,7 @@ static int ctnetlink_new_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d63d2d8f769c..b2b4e03ce036 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -861,10 +861,9 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; @@ -1059,10 +1058,9 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1254,10 +1252,9 @@ out: static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1627,10 +1624,9 @@ done: static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; struct net *net = info->net; struct nft_table *table; @@ -2355,10 +2351,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; const struct nlattr *attr; @@ -2453,10 +2448,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -3080,10 +3074,9 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; struct net *net = info->net; @@ -3221,13 +3214,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; unsigned int size, i, n, ulen = 0, usize = 0; u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; struct nft_flow_rule *flow; struct nft_userdata *udata; @@ -3459,15 +3451,15 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; - int family = nfmsg->nfgen_family, err = 0; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; struct nft_table *table; struct nft_rule *rule; struct nft_ctx ctx; + int err = 0; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -4050,7 +4042,6 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); struct net *net = info->net; @@ -4078,7 +4069,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, } /* Only accept unspec with dump */ - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; @@ -4171,11 +4162,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; struct nft_expr *expr = NULL; struct net *net = info->net; @@ -4475,7 +4465,6 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); struct net *net = info->net; @@ -4484,7 +4473,7 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_ctx ctx; int err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; @@ -6527,11 +6516,10 @@ err_free_trans: static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; - int family = nfmsg->nfgen_family; struct net *net = info->net; struct nft_table *table; struct nft_object *obj; @@ -6783,10 +6771,9 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct nft_object *obj; @@ -6873,10 +6860,9 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -7304,12 +7290,11 @@ static int nf_tables_newflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; - int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nft_hook *hook, *next; struct net *net = info->net; @@ -7493,10 +7478,9 @@ static int nf_tables_delflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct net *net = info->net; const struct nlattr *attr; @@ -7688,9 +7672,8 @@ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; struct net *net = info->net; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e8dbd8379027..028a1f39318b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -256,6 +256,7 @@ replay: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = extack, }; @@ -491,6 +492,7 @@ replay_abort: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = &extack, }; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 587086b18c36..691ef4cffdd9 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -871,15 +871,14 @@ static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t group_num = ntohs(nfmsg->res_id); + u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { - u_int8_t pf = nfmsg->nfgen_family; + u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f37a575ebd7f..f774de0fc24f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1051,8 +1051,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u16 queue_num = ntohs(nfmsg->res_id); + u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; @@ -1160,8 +1159,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; @@ -1243,8 +1241,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 3144a9ad2f6a..639c337c885b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -625,7 +625,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { - struct nfgenmsg *nfmsg; + u8 family = info->nfmsg->nfgen_family; const char *name, *fmt; struct sk_buff *skb2; int ret = 0, target; @@ -640,9 +640,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(info->nlh); - - switch(nfmsg->nfgen_family) { + switch(family) { case AF_INET: fmt = "ipt_%s"; break; @@ -656,8 +654,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, fmt = "arpt_%s"; break; default: - pr_err("nft_compat: unsupported protocol %d\n", - nfmsg->nfgen_family); + pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } @@ -665,9 +662,8 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, return -EINVAL; rcu_read_unlock(); - try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, - rev, target, &ret), - fmt, name); + try_then_request_module(xt_find_revision(family, name, rev, target, &ret), + fmt, name); if (ret < 0) goto out_put; @@ -682,8 +678,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, - nfmsg->nfgen_family, - name, ret, target) <= 0) { + family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } -- cgit v1.2.3-58-ga151 From e2b750d78b55d783f6ff4a1ab1e96f01e3e2ccfb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 31 May 2021 00:08:10 +0200 Subject: netfilter: nf_tables: remove nft_ctx_init_from_elemattr() Replace nft_ctx_init_from_elemattr() by nft_table_lookup() and set up the context structure right before it is really needed. Moreover, nft_ctx_init_from_elemattr() is setting up the context structure for codepaths where this is not really needed at all. This helper function is also not helping to consolidate code, removing it saves us 4 LoC. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 72 ++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b2b4e03ce036..2fbcb2543795 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4703,28 +4703,6 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table; - - table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); - return PTR_ERR(table); - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_ext *ext) @@ -5182,21 +5160,27 @@ static int nf_tables_getsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; + struct nft_table *table; struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, @@ -5965,8 +5949,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb, struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; @@ -5974,12 +5960,14 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -5987,6 +5975,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) @@ -5994,7 +5984,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb, } if (nft_net->validate_state == NFT_VALIDATE_DO) - return nft_table_validate(net, ctx.table); + return nft_table_validate(net, table); return 0; } @@ -6232,23 +6222,29 @@ static int nf_tables_delsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return nft_set_flush(&ctx, set, genmask); -- cgit v1.2.3-58-ga151 From 670866512f971d6fdb9ef2dad19db97d400d1161 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 31 May 2021 00:08:11 +0200 Subject: netfilter: nf_tables: remove nft_ctx_init_from_setattr() Replace nft_ctx_init_from_setattr() by nft_table_lookup(). This patch also disentangles nf_tables_delset() where NFTA_SET_TABLE is required while nft_ctx_init_from_setattr() allows it to be optional. From the nf_tables_delset() path, this also allows to set up the context structure when it is needed. Removing this helper function saves us 14 LoC, so it is not helping to consolidate code. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 64 +++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2fbcb2543795..6c2000a11c7e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3639,30 +3639,6 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table = NULL; - - if (nla[NFTA_SET_TABLE] != NULL) { - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); - return PTR_ERR(table); - } - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static struct nft_set *nft_set_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { @@ -4044,17 +4020,24 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; + struct nft_table *table = NULL; struct net *net = info->net; const struct nft_set *set; struct sk_buff *skb2; struct nft_ctx ctx; int err; - /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, 0); - if (err < 0) - return err; + if (nla[NFTA_SET_TABLE]) { + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } + } + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -4074,7 +4057,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -4467,28 +4450,29 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - int err; if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - if (nla[NFTA_SET_TABLE] == NULL) - return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; - set = nft_set_lookup_byhandle(ctx.table, attr, genmask); + set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(ctx.table, attr, genmask); + set = nft_set_lookup(table, attr, genmask); } if (IS_ERR(set)) { @@ -4502,6 +4486,8 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nft_delset(&ctx, set); } -- cgit v1.2.3-58-ga151 From 0418b989a467885292c294923dec66951c1c1398 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 2 Jun 2021 12:39:07 +0200 Subject: netfilter: nftables: add nf_ct_pernet() helper function Consolidate call to net_generic(net, nf_conntrack_net_id) in this wrapper function. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 7 +++++++ net/netfilter/nf_conntrack_core.c | 22 +++++++++------------- net/netfilter/nf_conntrack_ecache.c | 8 +++----- net/netfilter/nf_conntrack_expect.c | 12 +++++------- net/netfilter/nf_conntrack_helper.c | 6 ++---- net/netfilter/nf_conntrack_proto.c | 6 ++---- net/netfilter/nf_conntrack_standalone.c | 8 +++----- 7 files changed, 31 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 06dc6db70d18..cc663c68ddc4 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -346,6 +346,13 @@ nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) skb_set_nfct(skb, (unsigned long)ct | info); } +extern unsigned int nf_conntrack_net_id; + +static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) +{ + return net_generic(net, nf_conntrack_net_id); +} + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e0befcf8113a..96ba19fc8155 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -55,8 +55,6 @@ #include "nf_internals.h" -extern unsigned int nf_conntrack_net_id; - __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -87,8 +85,6 @@ static __read_mostly bool nf_conntrack_locks_all; static struct conntrack_gc_work conntrack_gc_work; -extern unsigned int nf_conntrack_net_id; - void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ @@ -1404,7 +1400,7 @@ static void gc_worker(struct work_struct *work) continue; net = nf_ct_net(tmp); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; @@ -1484,7 +1480,7 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); unsigned int ct_count; struct nf_conn *ct; @@ -1556,7 +1552,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); smp_mb__before_atomic(); atomic_dec(&cnet->count); @@ -1614,7 +1610,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, GFP_ATOMIC); local_bh_disable(); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); @@ -2317,7 +2313,7 @@ __nf_ct_unconfirmed_destroy(struct net *net) void nf_ct_unconfirmed_destroy(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); @@ -2333,7 +2329,7 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct iter_data d; might_sleep(); @@ -2367,7 +2363,7 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) down_read(&net_rwsem); for_each_net(net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) == 0) continue; @@ -2449,7 +2445,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); nf_ct_iterate_cleanup(kill_all, net, 0, 0); if (atomic_read(&cnet->count) != 0) @@ -2733,7 +2729,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; int cpu; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 759d87aef95f..296e4a171bd1 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,8 +27,6 @@ #include #include -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) @@ -348,7 +346,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache_dwork)) { @@ -371,7 +369,7 @@ static const struct nf_ct_ext_type event_extend = { void nf_conntrack_ecache_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; cnet->ct_net = &net->ct; @@ -380,7 +378,7 @@ void nf_conntrack_ecache_pernet_init(struct net *net) void nf_conntrack_ecache_pernet_fini(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache_dwork); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index efdd391b3f72..1e851bc2e61a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -43,8 +43,6 @@ unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static unsigned int nf_ct_expect_hashrnd __read_mostly; -extern unsigned int nf_conntrack_net_id; - /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) @@ -58,7 +56,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del_rcu(&exp->hnode); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); @@ -123,7 +121,7 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; @@ -164,7 +162,7 @@ nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; @@ -397,7 +395,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); @@ -468,7 +466,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ac396cc8bfae..ae4488a13c70 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -43,8 +43,6 @@ MODULE_PARM_DESC(nf_conntrack_helper, static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; -extern unsigned int nf_conntrack_net_id; - /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -214,7 +212,7 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); static struct nf_conntrack_helper * nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (!cnet->sysctl_auto_assign_helper) { if (cnet->auto_assign_helper_warned) @@ -560,7 +558,7 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 89e5bac384d7..fbc1fa36d2c2 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -42,8 +42,6 @@ #include #include -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL @@ -446,7 +444,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info; static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); bool fixup_needed = false, retry = true; int err = 0; retry: @@ -531,7 +529,7 @@ retry: static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index aaa55246d0ca..bce93656fad9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -512,9 +512,7 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) u32 nf_conntrack_count(const struct net *net) { - const struct nf_conntrack_net *cnet; - - cnet = net_generic(net, nf_conntrack_net_id); + const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } @@ -1032,7 +1030,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; @@ -1085,7 +1083,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; -- cgit v1.2.3-58-ga151 From ef8ed5ea091bf21648d0c4c1fa4a962d079eab2b Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Thu, 3 Jun 2021 15:12:33 +0300 Subject: netfilter: conntrack: Introduce tcp offload timeout configuration TCP connections may be offloaded from nf conntrack to nf flow table. Offloaded connections are aged after 30 seconds of inactivity. Once aged, ownership is returned to conntrack with a hard coded pickup time of 120 seconds, after which the connection may be deleted. eted. The current aging intervals may be too aggressive for some users. Provide users with the ability to control the nf flow table offload aging and pickup time intervals via sysctl parameter as a pre-step for configuring the nf flow table GC timeout intervals. Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 4 ++++ net/netfilter/nf_conntrack_proto_tcp.c | 5 +++++ net/netfilter/nf_conntrack_standalone.c | 24 ++++++++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index ad0a95c2335e..3a391e27ec60 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -27,6 +27,10 @@ struct nf_tcp_net { u8 tcp_loose; u8 tcp_be_liberal; u8 tcp_max_retrans; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + unsigned int offload_timeout; + unsigned int offload_pickup; +#endif }; enum udp_conntrack { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 34e22416a721..de840fc41a2e 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1441,6 +1441,11 @@ void nf_conntrack_tcp_init_net(struct net *net) * will be started. */ tn->tcp_max_retrans = 3; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + tn->offload_timeout = 30 * HZ; + tn->offload_pickup = 120 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index bce93656fad9..67b0fcd1a787 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -573,6 +573,10 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP, +#endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, @@ -760,6 +764,20 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { + .procname = "nf_flowtable_tcp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = { + .procname = "nf_flowtable_tcp_pickup", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), @@ -969,6 +987,12 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); #undef XASSIGN + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup; +#endif + } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, -- cgit v1.2.3-58-ga151 From 975c57504da1114551fdb3a91ed61dda7739613e Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Thu, 3 Jun 2021 15:12:34 +0300 Subject: netfilter: conntrack: Introduce udp offload timeout configuration UDP connections may be offloaded from nf conntrack to nf flow table. Offloaded connections are aged after 30 seconds of inactivity. Once aged, ownership is returned to conntrack with a hard coded pickup time of 30 seconds, after which the connection may be deleted. eted. The current aging intervals may be too aggressive for some users. Provide users with the ability to control the nf flow table offload aging and pickup time intervals via sysctl parameter as a pre-step for configuring the nf flow table GC timeout intervals. Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 4 ++++ net/netfilter/nf_conntrack_proto_udp.c | 5 +++++ net/netfilter/nf_conntrack_standalone.c | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 3a391e27ec60..c3094b83a525 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -41,6 +41,10 @@ enum udp_conntrack { struct nf_udp_net { unsigned int timeouts[UDP_CT_MAX]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + unsigned int offload_timeout; + unsigned int offload_pickup; +#endif }; struct nf_icmp_net { diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index af402f458ee0..68911fcaa0f1 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -270,6 +270,11 @@ void nf_conntrack_udp_init_net(struct net *net) for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + un->offload_timeout = 30 * HZ; + un->offload_pickup = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 67b0fcd1a787..f57a951c9b5e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -582,6 +582,10 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP, +#endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP @@ -812,6 +816,20 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NFT_FLOW_OFFLOAD) + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { + .procname = "nf_flowtable_udp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = { + .procname = "nf_flowtable_udp_pickup", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), @@ -1081,6 +1099,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup; +#endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); -- cgit v1.2.3-58-ga151 From 1d91d2e1a7f767aa8c11d8507ecf268f787734ec Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Thu, 3 Jun 2021 15:12:35 +0300 Subject: netfilter: flowtable: Set offload timeouts according to proto values Currently the aging period for tcp/udp connections is hard coded to 30 seconds. Aged tcp/udp connections configure a hard coded 120/30 seconds pickup timeout for conntrack. This configuration may be too aggressive or permissive for some users. Dynamically configure the nf flow table GC timeout intervals according to the user defined values. Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 47 +++++++++++++++++++++++++++-------- net/netfilter/nf_flow_table_offload.c | 4 +-- 3 files changed, 41 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 48ef7460ff30..a3647fadf1cc 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -177,6 +177,8 @@ struct flow_offload { #define NF_FLOW_TIMEOUT (30 * HZ) #define nf_flowtable_time_stamp (u32)jiffies +unsigned long flow_offload_get_timeout(struct flow_offload *flow); + static inline __s32 nf_flow_timeout_delta(unsigned int timeout) { return (__s32)(timeout - nf_flowtable_time_stamp); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1d02650dd715..1e50908b1b7e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -178,12 +178,10 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) tcp->seen[1].td_maxwin = 0; } -#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) -#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); unsigned int timeout; @@ -191,12 +189,17 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (!l4proto) return; - if (l4num == IPPROTO_TCP) - timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; - else if (l4num == IPPROTO_UDP) - timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; - else + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_pickup; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_pickup; + } else { return; + } if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) ct->timeout = nfct_time_stamp + timeout; @@ -268,11 +271,35 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { .automatic_shrinking = true, }; +unsigned long flow_offload_get_timeout(struct flow_offload *flow) +{ + const struct nf_conntrack_l4proto *l4proto; + unsigned long timeout = NF_FLOW_TIMEOUT; + struct net *net = nf_ct_net(flow->ct); + int l4num = nf_ct_protonum(flow->ct); + + l4proto = nf_ct_l4proto_find(l4num); + if (!l4proto) + return timeout; + + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_timeout; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_timeout; + } + + return timeout; +} + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, @@ -304,7 +331,7 @@ EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (likely(!nf_flowtable_hw_offload(flow_table))) return; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 528b2f172684..f92006cec94c 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -937,7 +937,7 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, - lastused + NF_FLOW_TIMEOUT); + lastused + flow_offload_get_timeout(offload->flow)); if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { if (stats[0].pkts) @@ -1041,7 +1041,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10)) return; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); -- cgit v1.2.3-58-ga151 From 7b4b2fa37587394fb89fa51a4bea0820a1b37a5d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jun 2021 12:27:06 +0200 Subject: netfilter: annotate nf_tables base hook ops This will allow a followup patch to treat the 'ops->priv' pointer as nft_chain argument without having to first walk the table/chains to check if there is a matching base chain pointer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 +++++++- net/netfilter/nf_tables_api.c | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index f161569fbe2f..3fda1a508733 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -77,12 +77,18 @@ struct nf_hook_state { typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +enum nf_hook_ops_type { + NF_HOOK_OP_UNDEFINED, + NF_HOOK_OP_NF_TABLES, +}; + struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; - u_int8_t pf; + u8 pf; + enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6c2000a11c7e..c9308241b688 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2168,8 +2168,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; - if (nft_is_base_chain(chain)) + if (nft_is_base_chain(chain)) { + basechain->ops.hook_ops_type = NF_HOOK_OP_NF_TABLES; nft_trans_chain_policy(trans) = policy; + } err = nft_chain_add(table, chain); if (err < 0) { -- cgit v1.2.3-58-ga151 From e2cf17d3774c323ef6dab6e9f7c0cfc5e742afd9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jun 2021 12:27:07 +0200 Subject: netfilter: add new hook nfnl subsystem This nfnl subsystem allows to dump the list of all active netfiler hooks, e.g. defrag, conntrack, nf/ip/arp/ip6tables and so on. This helps to see what kind of features are currently enabled in the network stack. Sample output from nft tool using this infra: $ nft list hook ip input family ip hook input { +0000000010 nft_do_chain_inet [nf_tables] # nft table firewalld INPUT +0000000100 nf_nat_ipv4_local_in [nf_nat] +2147483647 ipv4_confirm [nf_conntrack] } Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink.h | 3 +- include/uapi/linux/netfilter/nfnetlink_hook.h | 55 ++++ net/netfilter/Kconfig | 9 + net/netfilter/Makefile | 1 + net/netfilter/nfnetlink.c | 1 + net/netfilter/nfnetlink_hook.c | 375 ++++++++++++++++++++++++++ 6 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/netfilter/nfnetlink_hook.h create mode 100644 net/netfilter/nfnetlink_hook.c (limited to 'net') diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 5bc960f220b3..6cd58cd2a6f0 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -60,7 +60,8 @@ struct nfgenmsg { #define NFNL_SUBSYS_CTHELPER 9 #define NFNL_SUBSYS_NFTABLES 10 #define NFNL_SUBSYS_NFT_COMPAT 11 -#define NFNL_SUBSYS_COUNT 12 +#define NFNL_SUBSYS_HOOK 12 +#define NFNL_SUBSYS_COUNT 13 /* Reserved control nfnetlink messages */ #define NFNL_MSG_BATCH_BEGIN NLMSG_MIN_TYPE diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h new file mode 100644 index 000000000000..912ec60b26b0 --- /dev/null +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _NFNL_HOOK_H_ +#define _NFNL_HOOK_H_ + +enum nfnl_hook_msg_types { + NFNL_MSG_HOOK_GET, + NFNL_MSG_HOOK_MAX, +}; + +/** + * enum nfnl_hook_attributes - netfilter hook netlink attributes + * + * @NFNLA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) + * @NFNLA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) + * @NFNLA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFNLA_HOOK_FUNCTION_NAME: hook function name (NLA_STRING) + * @NFNLA_HOOK_MODULE_NAME: kernel module that registered this hook (NLA_STRING) + * @NFNLA_HOOK_CHAIN_INFO: basechain hook metadata (NLA_NESTED) + */ +enum nfnl_hook_attributes { + NFNLA_HOOK_UNSPEC, + NFNLA_HOOK_HOOKNUM, + NFNLA_HOOK_PRIORITY, + NFNLA_HOOK_DEV, + NFNLA_HOOK_FUNCTION_NAME, + NFNLA_HOOK_MODULE_NAME, + NFNLA_HOOK_CHAIN_INFO, + __NFNLA_HOOK_MAX +}; +#define NFNLA_HOOK_MAX (__NFNLA_HOOK_MAX - 1) + +/** + * enum nfnl_hook_chain_info_attributes - chain description + * + * NFNLA_HOOK_INFO_DESC: nft chain and table name (enum nft_table_attributes) (NLA_NESTED) + * NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32) + */ +enum nfnl_hook_chain_info_attributes { + NFNLA_HOOK_INFO_UNSPEC, + NFNLA_HOOK_INFO_DESC, + NFNLA_HOOK_INFO_TYPE, + __NFNLA_HOOK_INFO_MAX, +}; +#define NFNLA_HOOK_INFO_MAX (__NFNLA_HOOK_INFO_MAX - 1) + +/** + * enum nfnl_hook_chaintype - chain type + * + * @NFNL_HOOK_TYPE_NFTABLES nf_tables base chain + */ +enum nfnl_hook_chaintype { + NFNL_HOOK_TYPE_NFTABLES = 0x1, +}; + +#endif /* _NFNL_HOOK_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 172d74560632..c81321372198 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -19,6 +19,15 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_NETLINK_HOOK + tristate "Netfilter base hook dump support" + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + to list the base netfilter hooks via NFNETLINK. + This is helpful for debugging. + config NETFILTER_NETLINK_ACCT tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e80e010354b1..87112dad1fd4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o +obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 028a1f39318b..7e2c8dd01408 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -68,6 +68,7 @@ static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", + [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", }; static const int nfnl_group2type[NFNLGRP_MAX+1] = { diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c new file mode 100644 index 000000000000..04586dfa2acd --- /dev/null +++ b/net/netfilter/nfnetlink_hook.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021 Red Hat GmbH + * + * Author: Florian Westphal + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = { + [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFNLA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, + [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING, + .len = KSYM_NAME_LEN, }, + [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING, + .len = MODULE_NAME_LEN, }, + [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, }, +}; + +static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +struct nfnl_dump_hook_data { + char devname[IFNAMSIZ]; + unsigned long headv; + u8 hook; +}; + +static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + const struct nf_hook_ops *ops) +{ + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest, *nest2; + struct nft_chain *chain; + int ret = 0; + + if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES) + return 0; + + chain = ops->priv; + if (WARN_ON_ONCE(!chain)) + return 0; + + if (!nft_is_active(net, chain)) + return 0; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, + htonl(NFNL_HOOK_TYPE_NFTABLES)); + if (ret) + goto cancel_nest; + + nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest2) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFTA_CHAIN_TABLE, chain->table->name); + if (ret) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFTA_CHAIN_NAME, chain->name); + if (ret) + goto cancel_nest; + + nla_nest_end(nlskb, nest2); + nla_nest_end(nlskb, nest); + return ret; + +cancel_nest: + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; +} + +static int nfnl_hook_dump_one(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + const struct nf_hook_ops *ops, + unsigned int seq) +{ + u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET); + unsigned int portid = NETLINK_CB(nlskb).portid; + struct nlmsghdr *nlh; + int ret = -EMSGSIZE; +#ifdef CONFIG_KALLSYMS + char sym[KSYM_SYMBOL_LEN]; + char *module_name; +#endif + nlh = nfnl_msg_put(nlskb, portid, seq, event, + NLM_F_MULTI, ops->pf, NFNETLINK_V0, 0); + if (!nlh) + goto nla_put_failure; + +#ifdef CONFIG_KALLSYMS + ret = snprintf(sym, sizeof(sym), "%ps", ops->hook); + if (ret < 0 || ret > (int)sizeof(sym)) + goto nla_put_failure; + + module_name = strstr(sym, " ["); + if (module_name) { + char *end; + + module_name += 2; + end = strchr(module_name, ']'); + if (end) { + *end = 0; + + ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name); + if (ret) + goto nla_put_failure; + } + } + + ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym); + if (ret) + goto nla_put_failure; +#endif + + ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(ops->hooknum)); + if (ret) + goto nla_put_failure; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority)); + if (ret) + goto nla_put_failure; + + ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops); + if (ret) + goto nla_put_failure; + + nlmsg_end(nlskb, nlh); + return 0; +nla_put_failure: + nlmsg_trim(nlskb, nlh); + return ret; +} + +static const struct nf_hook_entries * +nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) +{ + const struct nf_hook_entries *hook_head = NULL; + struct net_device *netdev; + + switch (pf) { + case NFPROTO_IPV4: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6)) + return ERR_PTR(-EINVAL); + break; + case NFPROTO_ARP: +#ifdef CONFIG_NETFILTER_FAMILY_ARP + if (hook >= ARRAY_SIZE(net->nf.hooks_arp)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_arp[hook]); +#endif + break; + case NFPROTO_BRIDGE: +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); +#endif + break; +#if IS_ENABLED(CONFIG_DECNET) + case NFPROTO_DECNET: + if (hook >= ARRAY_SIZE(net->nf.hooks_decnet)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); + break; +#endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_NETDEV: + if (hook != NF_NETDEV_INGRESS) + return ERR_PTR(-EOPNOTSUPP); + + if (!dev) + return ERR_PTR(-ENODEV); + + netdev = dev_get_by_name_rcu(net, dev); + if (!netdev) + return ERR_PTR(-ENODEV); + + return rcu_dereference(netdev->nf_hooks_ingress); +#endif + default: + return ERR_PTR(-EPROTONOSUPPORT); + } + + return hook_head; +} + +static int nfnl_hook_dump(struct sk_buff *nlskb, + struct netlink_callback *cb) +{ + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nfnl_dump_hook_data *ctx = cb->data; + int err, family = nfmsg->nfgen_family; + struct net *net = sock_net(nlskb->sk); + struct nf_hook_ops * const *ops; + const struct nf_hook_entries *e; + unsigned int i = cb->args[0]; + + rcu_read_lock(); + + e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname); + if (!e) + goto done; + + if (IS_ERR(e)) { + cb->seq++; + goto done; + } + + if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries) + cb->seq++; + + ops = nf_hook_entries_get_hook_ops(e); + + for (; i < e->num_hook_entries; i++) { + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], cb->seq); + if (err) + break; + } + +done: + nl_dump_check_consistent(cb, nlmsg_hdr(nlskb)); + rcu_read_unlock(); + cb->args[0] = i; + return nlskb->len; +} + +static int nfnl_hook_dump_start(struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nlattr * const *nla = cb->data; + struct nfnl_dump_hook_data *ctx = NULL; + struct net *net = sock_net(cb->skb->sk); + u8 family = nfmsg->nfgen_family; + char name[IFNAMSIZ] = ""; + const void *head; + u32 hooknum; + + hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM])); + if (hooknum > 255) + return -EINVAL; + + if (family == NFPROTO_NETDEV) { + if (!nla[NFNLA_HOOK_DEV]) + return -EINVAL; + + nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name)); + } + + rcu_read_lock(); + /* Not dereferenced; for consistency check only */ + head = nfnl_hook_entries_head(family, hooknum, net, name); + rcu_read_unlock(); + + if (head && IS_ERR(head)) + return PTR_ERR(head); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + strscpy(ctx->devname, name, sizeof(ctx->devname)); + ctx->headv = (unsigned long)head; + ctx->hook = hooknum; + + cb->seq = 1; + cb->data = ctx; + + return 0; +} + +static int nfnl_hook_dump_stop(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static int nfnl_hook_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + if (!nla[NFNLA_HOOK_HOOKNUM]) + return -EINVAL; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nfnl_hook_dump_start, + .done = nfnl_hook_dump_stop, + .dump = nfnl_hook_dump, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + return -EOPNOTSUPP; +} + +static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = { + [NFNL_MSG_HOOK_GET] = { + .call = nfnl_hook_get, + .type = NFNL_CB_RCU, + .attr_count = NFNLA_HOOK_MAX, + .policy = nfnl_hook_nla_policy + }, +}; + +static const struct nfnetlink_subsystem nfhook_subsys = { + .name = "nfhook", + .subsys_id = NFNL_SUBSYS_HOOK, + .cb_count = NFNL_MSG_HOOK_MAX, + .cb = nfnl_hook_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK); + +static int __init nfnetlink_hook_init(void) +{ + return nfnetlink_subsys_register(&nfhook_subsys); +} + +static void __exit nfnetlink_hook_exit(void) +{ + nfnetlink_subsys_unregister(&nfhook_subsys); +} + +module_init(nfnetlink_hook_init); +module_exit(nfnetlink_hook_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks"); -- cgit v1.2.3-58-ga151 From 4d7efa73fa268bba3b309988dd2d4c3787a17ddf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:33 +0800 Subject: sch_htb: fix doc warning in htb_add_to_wait_tree() Add description for parameters of htb_add_to_wait_tree() to fix gcc W=1 warnings: net/sched/sch_htb.c:308: warning: Function parameter or member 'q' not described in 'htb_add_to_wait_tree' net/sched/sch_htb.c:308: warning: Function parameter or member 'cl' not described in 'htb_add_to_wait_tree' net/sched/sch_htb.c:308: warning: Function parameter or member 'delay' not described in 'htb_add_to_wait_tree' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4f9304567dcc..4eeef342c3c6 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -301,6 +301,9 @@ static void htb_add_to_id_tree(struct rb_root *root, /** * htb_add_to_wait_tree - adds class to the event queue with delay + * @q: the priority event queue + * @cl: the class to add + * @delay: delay in microseconds * * The class is added to priority event queue to indicate that class will * change its mode in cl->pq_key microseconds. Make sure that class is not -- cgit v1.2.3-58-ga151 From 274e5d0e55aa37f2bcee42f618b1923cab0ceabf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:34 +0800 Subject: sch_htb: fix doc warning in htb_next_rb_node() Add description for parameters of htb_next_rb_node() to fix gcc W=1 warnings: net/sched/sch_htb.c:339: warning: Function parameter or member 'n' not described in 'htb_next_rb_node' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4eeef342c3c6..5ad28df6b18c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -337,6 +337,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, /** * htb_next_rb_node - finds next node in binary tree + * @n: the current node in binary tree * * When we are past last key we return NULL. * Average complexity is 2 steps per call. -- cgit v1.2.3-58-ga151 From 996bccc39afba3db1309b8cd845f1f463516050e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:35 +0800 Subject: sch_htb: fix doc warning in htb_add_class_to_row() Add description for parameters of htb_add_class_to_row() to fix gcc W=1 warnings: net/sched/sch_htb.c:351: warning: Function parameter or member 'q' not described in 'htb_add_class_to_row' net/sched/sch_htb.c:351: warning: Function parameter or member 'cl' not described in 'htb_add_class_to_row' net/sched/sch_htb.c:351: warning: Function parameter or member 'mask' not described in 'htb_add_class_to_row' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5ad28df6b18c..97a9df42849e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -349,6 +349,9 @@ static inline void htb_next_rb_node(struct rb_node **n) /** * htb_add_class_to_row - add class to its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is added to row at priorities marked in mask. * It does nothing if mask == 0. -- cgit v1.2.3-58-ga151 From 5f8c6d05f3900a586fc3e7947423cd3b8aafcc33 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:36 +0800 Subject: sch_htb: fix doc warning in htb_remove_class_from_row() Add description for parameters of htb_remove_class_from_row() to fix gcc W=1 warnings: net/sched/sch_htb.c:380: warning: Function parameter or member 'q' not described in 'htb_remove_class_from_row' net/sched/sch_htb.c:380: warning: Function parameter or member 'cl' not described in 'htb_remove_class_from_row' net/sched/sch_htb.c:380: warning: Function parameter or member 'mask' not described in 'htb_remove_class_from_row' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 97a9df42849e..30a53db7eeb6 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -381,6 +381,9 @@ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) /** * htb_remove_class_from_row - removes class from its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is removed from row at priorities marked in mask. * It does nothing if mask == 0. -- cgit v1.2.3-58-ga151 From 876b5fc0c0fb879d42736a6903af23a9ef6b985a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:37 +0800 Subject: sch_htb: fix doc warning in htb_activate_prios() Add description for parameters of htb_activate_prios() to fix gcc W=1 warnings: net/sched/sch_htb.c:407: warning: Function parameter or member 'q' not described in 'htb_activate_prios' net/sched/sch_htb.c:407: warning: Function parameter or member 'cl' not described in 'htb_activate_prios' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 30a53db7eeb6..06f1b4ee88e2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -411,6 +411,8 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, /** * htb_activate_prios - creates active classe's feed chain + * @q: the priority event queue + * @cl: the class to activate * * The class is connected to ancestors and/or appropriate rows * for priorities it is participating on. cl->cmode must be new -- cgit v1.2.3-58-ga151 From 4113be2020a82d503a63917d12860766048182eb Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:38 +0800 Subject: sch_htb: fix doc warning in htb_deactivate_prios() Add description for parameters of htb_deactivate_prios() to fix gcc W=1 warnings: net/sched/sch_htb.c:442: warning: Function parameter or member 'q' not described in 'htb_deactivate_prios' net/sched/sch_htb.c:442: warning: Function parameter or member 'cl' not described in 'htb_deactivate_prios' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 06f1b4ee88e2..869e59be0993 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -448,6 +448,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate_prios - remove class from feed chain + * @q: the priority event queue + * @cl: the class to deactivate * * cl->cmode must represent old mode (before deactivation). It does * nothing if cl->prio_activity == 0. Class is removed from all feed -- cgit v1.2.3-58-ga151 From 1e9559527a9d8cc061e884f6b237754d06711335 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:39 +0800 Subject: sch_htb: fix doc warning in htb_class_mode() Add description for parameters of htb_class_mode() to fix gcc W=1 warnings: net/sched/sch_htb.c:507: warning: Function parameter or member 'cl' not described in 'htb_class_mode' net/sched/sch_htb.c:507: warning: Function parameter or member 'diff' not described in 'htb_class_mode' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 869e59be0993..062ddf88ce22 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -510,6 +510,8 @@ static inline s64 htb_hiwater(const struct htb_class *cl) /** * htb_class_mode - computes and returns current class mode + * @cl: the target class + * @diff: diff time in microseconds * * It computes cl's mode at time cl->t_c+diff and returns it. If mode * is not HTB_CAN_SEND then cl->pq_key is updated to time difference -- cgit v1.2.3-58-ga151 From 4b479e9883ce4d99ffd4eeffc61b6ef70e06ee4f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:40 +0800 Subject: sch_htb: fix doc warning in htb_change_class_mode() Add description for parameters of htb_change_class_mode() to fix gcc W=1 warnings: net/sched/sch_htb.c:533: warning: Function parameter or member 'q' not described in 'htb_change_class_mode' net/sched/sch_htb.c:533: warning: Function parameter or member 'cl' not described in 'htb_change_class_mode' net/sched/sch_htb.c:533: warning: Function parameter or member 'diff' not described in 'htb_change_class_mode' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 062ddf88ce22..875ef6cd2ce0 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -540,6 +540,9 @@ htb_class_mode(struct htb_class *cl, s64 *diff) /** * htb_change_class_mode - changes classe's mode + * @q: the priority event queue + * @cl: the target class + * @diff: diff time in microseconds * * This should be the only way how to change classe's mode under normal * circumstances. Routine will update feed lists linkage, change mode -- cgit v1.2.3-58-ga151 From 8df7e8fff8da0feb3d7d686f7992e323f0cc464a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:41 +0800 Subject: sch_htb: fix doc warning in htb_activate() Add description for parameters of htb_activate() to fix gcc W=1 warnings: net/sched/sch_htb.c:562: warning: Function parameter or member 'q' not described in 'htb_activate' net/sched/sch_htb.c:562: warning: Function parameter or member 'cl' not described in 'htb_activate' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 875ef6cd2ce0..1ee47de6f72c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -575,6 +575,8 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) /** * htb_activate - inserts leaf cl into appropriate active feeds + * @q: the priority event queue + * @cl: the target class * * Routine learns (new) priority of leaf and activates feed chain * for the prio. It can be called on already active leaf safely. -- cgit v1.2.3-58-ga151 From 9a034f25e4721c0f021c33ca0788c7dc13bed290 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:42 +0800 Subject: sch_htb: fix doc warning in htb_deactivate() Add description for parameters of htb_deactivate() to fix gcc W=1 warnings: net/sched/sch_htb.c:578: warning: Function parameter or member 'q' not described in 'htb_deactivate' net/sched/sch_htb.c:578: warning: Function parameter or member 'cl' not described in 'htb_deactivate' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 1ee47de6f72c..9d4c5370257d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -594,6 +594,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate - remove leaf cl from active feeds + * @q: the priority event queue + * @cl: the target class * * Make sure that leaf is active. In the other words it can't be called * with non-active leaf. It also removes class from the drop list. -- cgit v1.2.3-58-ga151 From 0e5c90848a28edc726c6badf6875790830c9428c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:43 +0800 Subject: sch_htb: fix doc warning in htb_charge_class() Add description for parameters of htb_charge_class() to fix gcc W=1 warnings: net/sched/sch_htb.c:663: warning: Function parameter or member 'q' not described in 'htb_charge_class' net/sched/sch_htb.c:663: warning: Function parameter or member 'cl' not described in 'htb_charge_class' net/sched/sch_htb.c:663: warning: Function parameter or member 'level' not described in 'htb_charge_class' net/sched/sch_htb.c:663: warning: Function parameter or member 'skb' not described in 'htb_charge_class' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9d4c5370257d..a6cd3f18ff87 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -675,6 +675,10 @@ static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) /** * htb_charge_class - charges amount "bytes" to leaf and ancestors + * @q: the priority event queue + * @cl: the class to start iterate + * @level: the minimum level to account + * @skb: the socket buffer * * Routine assumes that packet "bytes" long was dequeued from leaf cl * borrowing from "level". It accounts bytes to ceil leaky bucket for -- cgit v1.2.3-58-ga151 From 2c3ee53ea663a7aff97271278efb19d543e0fbe9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:44 +0800 Subject: sch_htb: fix doc warning in htb_do_events() Add description for parameters of htb_do_events() to fix gcc W=1 warnings: net/sched/sch_htb.c:708: warning: Function parameter or member 'q' not described in 'htb_do_events' net/sched/sch_htb.c:708: warning: Function parameter or member 'level' not described in 'htb_do_events' net/sched/sch_htb.c:708: warning: Function parameter or member 'start' not described in 'htb_do_events' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index a6cd3f18ff87..66c330244b9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -728,6 +728,9 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level + * @q: the priority event queue + * @level: which wait_pq in 'q->hlevel' + * @start: start jiffies * * Scans event queue for pending events and applies them. Returns time of * next pending event (0 for no event in pq, q->now for too many events). -- cgit v1.2.3-58-ga151 From 9977d6f56bacc9784654be4d0f4d27b368f57f5b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 5 Jun 2021 18:18:45 +0800 Subject: sch_htb: fix doc warning in htb_lookup_leaf() Add description for parameters of htb_lookup_leaf() to fix gcc W=1 warnings: net/sched/sch_htb.c:773: warning: Function parameter or member 'hprio' not described in 'htb_lookup_leaf' net/sched/sch_htb.c:773: warning: Function parameter or member 'prio' not described in 'htb_lookup_leaf' Signed-off-by: Yu Kuai Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 66c330244b9d..7a69e4e608c3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -799,6 +799,8 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, /** * htb_lookup_leaf - returns next leaf class in DRR order + * @hprio: the current one + * @prio: which prio in class * * Find leaf where current feed pointers points to. */ -- cgit v1.2.3-58-ga151 From cda9de0b8daf2ebfc07d385ef0039fd7860ddf25 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 7 Jun 2021 10:37:41 +0800 Subject: pktgen: add pktgen_handle_all_threads() for the same code The pktgen_{run, reset, stop}_all_threads() has the same code, so add pktgen_handle_all_threads() for it. Signed-off-by: Yejune Deng Signed-off-by: David S. Miller --- net/core/pktgen.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 3fba429f1f57..7e258d255e90 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -467,7 +467,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(struct pktgen_net *pn); static void pktgen_reset_all_threads(struct pktgen_net *pn); -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); +static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); @@ -516,14 +516,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(pn); - + pktgen_stop_all_threads(pn); else if (!strcmp(data, "start")) pktgen_run_all_threads(pn); - else if (!strcmp(data, "reset")) pktgen_reset_all_threads(pn); - else return -EINVAL; @@ -3027,20 +3024,25 @@ static void pktgen_run(struct pktgen_thread *t) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) +static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags) { struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= T_STOP; + t->control |= (flags); mutex_unlock(&pktgen_thread_lock); } +static void pktgen_stop_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_STOP); +} + static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; @@ -3103,16 +3105,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) static void pktgen_run_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_RUN); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_RUN); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3122,16 +3117,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn) static void pktgen_reset_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_REMDEVALL); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_REMDEVALL); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); -- cgit v1.2.3-58-ga151 From 4fb473fe7325181f87d586685d21f27a9b9e25f8 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Sun, 6 Jun 2021 23:33:07 -0700 Subject: atm: [br2864] fix spelling mistakes interrupt should be changed to interrupting. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/atm/br2684.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 3e17a5ecaa94..dd2a8dabed84 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -93,8 +93,8 @@ struct br2684_dev { * This lock should be held for writing any time the list of devices or * their attached vcc's could be altered. It should be held for reading * any time these are being queried. Note that we sometimes need to - * do read-locking under interrupt context, so write locking must block - * the current CPU's interrupts + * do read-locking under interrupting context, so write locking must block + * the current CPU's interrupts. */ static DEFINE_RWLOCK(devs_lock); -- cgit v1.2.3-58-ga151 From 84a57ae96b299eaceacc4301db222ee12563cc96 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 7 Jun 2021 23:01:00 +0800 Subject: netlabel: Fix spelling mistakes Fix some spelling mistakes in comments: Interate ==> Iterate sucess ==> success Signed-off-by: Zheng Yongjun Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_domainhash.c | 2 +- net/netlabel/netlabel_kapi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index dc8c39f51f7d..8158a25972b4 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -929,7 +929,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, * @cb_arg: argument for the callback function * * Description: - * Interate over the domain mapping hash table, skipping the first @skip_bkt + * Iterate over the domain mapping hash table, skipping the first @skip_bkt * buckets and @skip_chain entries. For each entry in the table call * @callback, if @callback returns a negative value stop 'walking' through the * table and return. Updates the values in @skip_bkt and @skip_chain on diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 5e1239cef000..beb0e573266d 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -719,7 +719,7 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) * it in @bitmap. The @offset must be aligned to an unsigned long and will be * updated on return if different from what was requested; if the catmap is * empty at the requested offset and beyond, the @offset is set to (u32)-1. - * Returns zero on sucess, negative values on failure. + * Returns zero on success, negative values on failure. * */ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, -- cgit v1.2.3-58-ga151 From 974d8f86cd60d85f107f86182fb071cea0345387 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 7 Jun 2021 23:01:09 +0800 Subject: ipv4: Fix spelling mistakes Fix some spelling mistakes in comments: Dont ==> Don't timout ==> timeout incomming ==> incoming necesarry ==> necessary substract ==> subtract Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ipv4/fib_lookup.h | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv4/tcp_fastopen.c | 2 +- net/ipv4/tcp_timer.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b58db1ca4bfb..e184bcb19943 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,7 +25,7 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 -/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +/* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 12b564b1ecb4..7b12a40dd465 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1317,7 +1317,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) } /* called from ip_ra_control(), before an RCU grace period, - * we dont need to call synchronize_rcu() here + * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { @@ -1938,7 +1938,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; - /* For an (*,G) entry, we only check that the incomming + /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index af2814c9342a..47c32604d38f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -526,7 +526,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) if (!tfo_da_times) return false; - /* Limit timout to max: 2^6 * initial timeout */ + /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); timeout = multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4ef08079ccfa..56b9d648f054 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -441,7 +441,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * This function gets called when the kernel timer for a TCP packet * of this socket expires. * - * It handles retransmission, timer adjustment and other necesarry measures. + * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ @@ -766,7 +766,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, - * substract one from tp->compressed_ack to keep + * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; -- cgit v1.2.3-58-ga151 From 4fb3ebbf7e086a02afb0aecad0d21cf536b5fa05 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 7 Jun 2021 23:01:18 +0800 Subject: net/ncsi: Fix spelling mistakes Fix some spelling mistakes in comments: constuct ==> construct chanels ==> channels Detination ==> Destination Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ncsi/internal.h | 4 ++-- net/ncsi/ncsi-manage.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 49031f804276..cbbb0de4750a 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -238,7 +238,7 @@ struct ncsi_package { struct ncsi_dev_priv *ndp; /* NCSI device */ spinlock_t lock; /* Protect the package */ unsigned int channel_num; /* Number of channels */ - struct list_head channels; /* List of chanels */ + struct list_head channels; /* List of channels */ struct list_head node; /* Form list of packages */ bool multi_channel; /* Enable multiple channels */ @@ -339,7 +339,7 @@ struct ncsi_cmd_arg { unsigned char type; /* Command in the NCSI packet */ unsigned char id; /* Request ID (sequence number) */ unsigned char package; /* Destination package ID */ - unsigned char channel; /* Detination channel ID or 0x1f */ + unsigned char channel; /* Destination channel ID or 0x1f */ unsigned short payload; /* Command packet payload length */ unsigned int req_flags; /* NCSI request properties */ union { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index ffff8da707b8..ca04b6df1341 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -627,7 +627,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } -/* Find an outstanding VLAN tag and constuct a "Set VLAN Filter - Enable" +/* Find an outstanding VLAN tag and construct a "Set VLAN Filter - Enable" * packet. */ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, -- cgit v1.2.3-58-ga151 From 7f553ff214105f49e973187488ff93ff9c56b0c8 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 7 Jun 2021 23:01:37 +0800 Subject: l2tp: Fix spelling mistakes Fix some spelling mistakes in comments: negociated ==> negotiated dont ==> don't Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 536c30d4dd7d..b3edafa5fba4 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -488,7 +488,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - /* We dont need to clone dst here, it is guaranteed to not disappear. + /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index aea85f91f059..bf35710127dd 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -226,7 +226,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should - * be negociated and handled at the PPP layer. These fields are + * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ -- cgit v1.2.3-58-ga151 From c07aea3ef4d4076f18f567b98ed01e082e02ed51 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 7 Jun 2021 21:02:36 +0200 Subject: mm: add a signature in struct page This is needed by the page_pool to avoid recycling a page not allocated via page_pool. The page->signature field is aliased to page->lru.next and page->compound_head, but it can't be set by mistake because the signature value is a bad pointer, and can't trigger a false positive in PageTail() because the last bit is 0. Co-developed-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/linux/mm.h | 11 ++++++----- include/linux/mm_types.h | 7 +++++++ include/linux/poison.h | 3 +++ net/core/page_pool.c | 6 ++++++ 4 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/mm.h b/include/linux/mm.h index c274f75efcf9..a0434e8c2617 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1668,10 +1668,11 @@ struct address_space *page_mapping(struct page *page); static inline bool page_is_pfmemalloc(const struct page *page) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. */ - return page->index == -1UL; + return (uintptr_t)page->lru.next & BIT(1); } /* @@ -1680,12 +1681,12 @@ static inline bool page_is_pfmemalloc(const struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5aacc1c10a45..ed6862eacb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,13 @@ struct page { unsigned long private; }; struct { /* page_pool used by netstack */ + /** + * @pp_magic: magic value to avoid recycling non + * page_pool allocated pages. + */ + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; /** * @dma_addr: might require a 64-bit value on * 32-bit architectures. diff --git a/include/linux/poison.h b/include/linux/poison.h index aff1c9250c82..d62ef5a6b4e9 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -78,4 +78,7 @@ /********** security/ **********/ #define KEY_DESTROY 0xbd +/********** net/core/page_pool.c **********/ +#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3c4c4c7a0402..e1321bc9d316 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -17,6 +17,7 @@ #include #include #include /* for __put_page() */ +#include #include @@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + page->pp_magic |= PP_SIGNATURE; + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); @@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } + page->pp_magic |= PP_SIGNATURE; pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page->pp_magic = 0; + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ -- cgit v1.2.3-58-ga151 From c420c98982fa9e749c99e022845d5f323d098b72 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 7 Jun 2021 21:02:37 +0200 Subject: skbuff: add a parameter to __skb_frag_unref This is a prerequisite patch, the next one is enabling recycling of skbs and fragments. Add an extra argument on __skb_frag_unref() to handle recycling, and update the current users of the function with that. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- include/linux/skbuff.h | 8 +++++--- net/core/skbuff.c | 4 ++-- net/tls/tls_device.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 324c280cc22c..8b8bff59c8fe 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -2503,7 +2503,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, if (length == 0) { /* don't need this page */ - __skb_frag_unref(frag); + __skb_frag_unref(frag, false); --skb_shinfo(skb)->nr_frags; } else { size = min(length, (unsigned) PAGE_SIZE); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index e35e4d7ef4d1..cea62b8f554c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, fail: while (nr > 0) { nr--; - __skb_frag_unref(skb_shinfo(skb)->frags + nr); + __skb_frag_unref(skb_shinfo(skb)->frags + nr, false); } return 0; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbf820a50a39..7fcfea7e7b21 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3081,10 +3081,12 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool * - * Releases a reference on the paged fragment @frag. + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. */ -static inline void __skb_frag_unref(skb_frag_t *frag) +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { put_page(skb_frag_page(frag)); } @@ -3098,7 +3100,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f]); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ad22870298c..12b7e90dd2b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -664,7 +664,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], false); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -3495,7 +3495,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, false); } /* Reposition in the original skb */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bd9f1567aa39..b932469ee69c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i; for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); } -- cgit v1.2.3-58-ga151 From 6a5bcd84e886a9a91982e515c539529c28acdcc2 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Mon, 7 Jun 2021 21:02:38 +0200 Subject: page_pool: Allow drivers to hint on SKB recycling Up to now several high speed NICs have custom mechanisms of recycling the allocated memory they use for their payloads. Our page_pool API already has recycling capabilities that are always used when we are running in 'XDP mode'. So let's tweak the API and the kernel network stack slightly and allow the recycling to happen even during the standard operation. The API doesn't take into account 'split page' policies used by those drivers currently, but can be extended once we have users for that. The idea is to be able to intercept the packet on skb_release_data(). If it's a buffer coming from our page_pool API recycle it back to the pool for further usage or just release the packet entirely. To achieve that we introduce a bit in struct sk_buff (pp_recycle:1) and a field in struct page (page->pp) to store the page_pool pointer. Storing the information in page->pp allows us to recycle both SKBs and their fragments. We could have skipped the skb bit entirely, since identical information can bederived from struct page. However, in an effort to affect the free path as less as possible, reading a single bit in the skb which is already in cache, is better that trying to derive identical information for the page stored data. The driver or page_pool has to take care of the sync operations on it's own during the buffer recycling since the buffer is, after opting-in to the recycling, never unmapped. Since the gain on the drivers depends on the architecture, we are not enabling recycling by default if the page_pool API is used on a driver. In order to enable recycling the driver must call skb_mark_for_recycle() to store the information we need for recycling in page->pp and enabling the recycling bit, or page_pool_store_mem_info() for a fragment. Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Co-developed-by: Matteo Croce Signed-off-by: Matteo Croce Signed-off-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 33 ++++++++++++++++++++++++++++++--- include/net/page_pool.h | 9 +++++++++ net/core/page_pool.c | 22 ++++++++++++++++++++++ net/core/skbuff.c | 20 ++++++++++++++++---- 4 files changed, 77 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7fcfea7e7b21..b2db9cd9a73f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -667,6 +668,8 @@ typedef unsigned char *sk_buff_data_t; * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @pp_recycle: mark the packet for recycling instead of freeing (implies + * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed @@ -791,10 +794,12 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - pfmemalloc:1; + pfmemalloc:1, + pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif + /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -3088,7 +3093,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - put_page(skb_frag_page(frag)); + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page)) + return; +#endif + put_page(page); } /** @@ -3100,7 +3111,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); } /** @@ -4699,5 +4710,21 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) #endif } +#ifdef CONFIG_PAGE_POOL +static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, + struct page_pool *pp) +{ + skb->pp_recycle = 1; + page_pool_store_mem_info(page, pp); +} +#endif + +static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b4b6de909c93..3dd62dd73027 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -146,6 +146,8 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } +bool page_pool_return_skb_page(struct page *page); + struct page_pool *page_pool_create(const struct page_pool_params *params); #ifdef CONFIG_PAGE_POOL @@ -251,4 +253,11 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); } +/* Store mem_info on struct page and use it while recycling skb frags */ +static inline +void page_pool_store_mem_info(struct page *page, struct page_pool *pp) +{ + page->pp = pp; +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e1321bc9d316..5e4eb45b139c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -628,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + if (unlikely(page->pp_magic != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page->pp = NULL; + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 12b7e90dd2b5..a0b1d4847efe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; - if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } } static void skb_release_data(struct sk_buff *skb) @@ -664,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i], false); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -1046,6 +1050,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -3495,7 +3500,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom, false); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -5285,6 +5290,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; + /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); -- cgit v1.2.3-58-ga151 From 3835a6614ae7ee4840459bf47528a97b1dfc5439 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 8 Jun 2021 08:05:05 +0000 Subject: net: x25: Use list_for_each_entry() to simplify code in x25_link.c Convert list_for_each() to list_for_each_entry() where applicable. This simplifies the code. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- net/x25/x25_link.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 57a81100c5da..5460b9146dd8 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -332,12 +332,9 @@ void x25_link_device_down(struct net_device *dev) struct x25_neigh *x25_get_neigh(struct net_device *dev) { struct x25_neigh *nb, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_neigh_list_lock); - list_for_each(entry, &x25_neigh_list) { - nb = list_entry(entry, struct x25_neigh, node); - + list_for_each_entry(nb, &x25_neigh_list, node) { if (nb->dev == dev) { use = nb; break; -- cgit v1.2.3-58-ga151 From e83332842a46c091992ad06145b5c1b65a08ab05 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 8 Jun 2021 08:13:01 +0000 Subject: net: lapb: Use list_for_each_entry() to simplify code in lapb_iface.c Convert list_for_each() to list_for_each_entry() where applicable. This simplifies the code. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- net/lapb/lapb_iface.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 1078e14f1acf..0971ca48ba15 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -80,11 +80,9 @@ static void __lapb_insert_cb(struct lapb_cb *lapb) static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { - struct list_head *entry; struct lapb_cb *lapb, *use = NULL; - list_for_each(entry, &lapb_list) { - lapb = list_entry(entry, struct lapb_cb, node); + list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; -- cgit v1.2.3-58-ga151 From 96bffe70231c871d1b39ecc44288c96bed66422b Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 8 Jun 2021 13:30:07 +0000 Subject: net: x25: Use list_for_each_entry() to simplify code in x25_forward.c Convert list_for_each() to list_for_each_entry() where applicable. This simplifies the code. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- net/x25/x25_forward.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c index d48ad6d29197..21b30b56e889 100644 --- a/net/x25/x25_forward.c +++ b/net/x25/x25_forward.c @@ -19,7 +19,6 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; - struct list_head *entry; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; @@ -46,8 +45,7 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - x25_frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; @@ -92,15 +90,13 @@ out_no_route: int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; - struct list_head *entry; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { -- cgit v1.2.3-58-ga151 From cf6b5ffdce5a78b2fcb0e53b3a2487c490bcbf7f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 8 Jun 2021 11:40:57 +0200 Subject: netfilter: nft_exthdr: Fix for unsafe packet data read While iterating through an SCTP packet's chunks, skb_header_pointer() is called for the minimum expected chunk header size. If (that part of) the skbuff is non-linear, the following memcpy() may read data past temporary buffer '_sch'. Use skb_copy_bits() instead which does the right thing in this situation. Fixes: 133dc203d77df ("netfilter: nft_exthdr: Support SCTP chunks") Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 1b0579cb62d0..7f705b5c09de 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -327,7 +327,9 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, break; dest[priv->len / NFT_REG32_SIZE] = 0; - memcpy(dest, (char *)sch + priv->offset, priv->len); + if (skb_copy_bits(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) + break; return; } offset += SCTP_PAD4(ntohs(sch->length)); -- cgit v1.2.3-58-ga151 From 2aa8eca6cbb5912aa0c07ebecb846b3d6182415c Mon Sep 17 00:00:00 2001 From: gushengxian Date: Tue, 8 Jun 2021 18:52:57 -0700 Subject: net: appletalk: fix some mistakes in grammar Fix some mistakes in grammar. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ebda397fa95a..8ade5a4ceaf5 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -707,7 +707,7 @@ static int atif_ioctl(int cmd, void __user *arg) /* * Phase 1 is fine on LocalTalk but we don't do - * EtherTalk phase 1. Anyone wanting to add it go ahead. + * EtherTalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -828,7 +828,7 @@ static int atif_ioctl(int cmd, void __user *arg) nr = (struct atalk_netrange *)&(atif->nets); /* * Phase 1 is fine on Localtalk but we don't do - * Ethertalk phase 1. Anyone wanting to add it go ahead. + * Ethertalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -2018,7 +2018,7 @@ module_init(atalk_init); * by the network device layer. * * Ergo, before the AppleTalk module can be removed, all AppleTalk - * sockets be closed from user space. + * sockets should be closed from user space. */ static void __exit atalk_exit(void) { -- cgit v1.2.3-58-ga151 From 152bca090243f2aebbf4c0a2aa723ab610e6f3c4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 5 Jun 2021 12:54:43 +0200 Subject: xfrm: remove description from xfrm_type struct Its set but never read. Reduces size of xfrm_type to 64 bytes on 64bit. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 -- net/ipv4/ah4.c | 1 - net/ipv4/esp4.c | 1 - net/ipv4/esp4_offload.c | 1 - net/ipv4/ipcomp.c | 1 - net/ipv4/xfrm4_tunnel.c | 1 - net/ipv6/ah6.c | 1 - net/ipv6/esp6.c | 1 - net/ipv6/esp6_offload.c | 1 - net/ipv6/ipcomp6.c | 1 - net/ipv6/mip6.c | 2 -- net/ipv6/xfrm6_tunnel.c | 1 - 12 files changed, 14 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6e11db6fa0ab..1aad78c5f2d5 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -388,7 +388,6 @@ void xfrm_flush_gc(void); void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { - char *description; struct module *owner; u8 proto; u8 flags; @@ -410,7 +409,6 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); struct xfrm_type_offload { - char *description; struct module *owner; u8 proto; void (*encap)(struct xfrm_state *, struct sk_buff *pskb); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 36ed85bf2ad5..2d2d08aa787d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -554,7 +554,6 @@ static int ah4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type ah_type = { - .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 35803ab7ac80..f5362b9d75eb 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1198,7 +1198,6 @@ static int esp4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type esp_type = { - .description = "ESP4", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index be019a1fe3af..8e4e9aa12130 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -342,7 +342,6 @@ static const struct net_offload esp4_offload = { }; static const struct xfrm_type_offload esp_type_offload = { - .description = "ESP4 OFFLOAD", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index b42683212c65..2e69e81e1f5d 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -152,7 +152,6 @@ static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ipcomp_type = { - .description = "IPCOMP4", .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp4_init_state, diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index fb0648e7fb32..f4555a88f86b 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -42,7 +42,6 @@ static void ipip_destroy(struct xfrm_state *x) } static const struct xfrm_type ipip_type = { - .description = "IPIP", .owner = THIS_MODULE, .proto = IPPROTO_IPIP, .init_state = ipip_init_state, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 20d492da725a..e9705c256068 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -755,7 +755,6 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ah6_type = { - .description = "AH6", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 393ae2b78e7d..be2c0ac76eaa 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -1243,7 +1243,6 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type esp6_type = { - .description = "ESP6", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 40ed4fcf1cf4..a349d4798077 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -377,7 +377,6 @@ static const struct net_offload esp6_offload = { }; static const struct xfrm_type_offload esp6_type_offload = { - .description = "ESP6 OFFLOAD", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp6_input_tail, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index daef890460b7..491aba66b7ae 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -172,7 +172,6 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ipcomp6_type = { - .description = "IPCOMP6", .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp6_init_state, diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 878fcec14949..bc560e1664aa 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -324,7 +324,6 @@ static void mip6_destopt_destroy(struct xfrm_state *x) } static const struct xfrm_type mip6_destopt_type = { - .description = "MIP6DESTOPT", .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, @@ -456,7 +455,6 @@ static void mip6_rthdr_destroy(struct xfrm_state *x) } static const struct xfrm_type mip6_rthdr_type = { - .description = "MIP6RT", .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f696d46e6910..2b31112c0856 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -291,7 +291,6 @@ static void xfrm6_tunnel_destroy(struct xfrm_state *x) } static const struct xfrm_type xfrm6_tunnel_type = { - .description = "IP6IP6", .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, -- cgit v1.2.3-58-ga151 From 5302560bb49d38bf6e62a47c44e19ef04bd5344d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 8 Jun 2021 16:34:08 +0100 Subject: netfilter: nfnetlink_hook: fix array index out-of-bounds error Currently the array net->nf.hooks_ipv6 is accessed by index hook before hook is sanity checked. Fix this by moving the sanity check to before the array access. Addresses-Coverity: ("Out-of-bounds access") Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_hook.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 04586dfa2acd..58fda6ac663b 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -181,9 +181,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: - hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6)) return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP -- cgit v1.2.3-58-ga151 From d4fb1f954fc7e2044b64b7d690400b99a6d5775c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Jun 2021 22:53:22 +0200 Subject: netfilter: nfnetlink_hook: add depends-on nftables nfnetlink_hook.c: In function 'nfnl_hook_put_nft_chain_info': nfnetlink_hook.c:76:7: error: implicit declaration of 'nft_is_active' This macro is only defined when NF_TABLES is enabled. While its possible to also add an ifdef-guard, the infrastructure is currently not useful without nf_tables. Reported-by: kernel test robot Fixes: 252956528caa ("netfilter: add new hook nfnl subsystem") Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c81321372198..54395266339d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -22,6 +22,7 @@ config NETFILTER_FAMILY_ARP config NETFILTER_NETLINK_HOOK tristate "Netfilter base hook dump support" depends on NETFILTER_ADVANCED + depends on NF_TABLES select NETFILTER_NETLINK help If this option is enabled, the kernel will include support -- cgit v1.2.3-58-ga151 From c5c6accd7b7e10434d6afda4f6a5107c480bb4fb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Jun 2021 23:06:07 +0200 Subject: netfilter: nf_tables: move base hook annotation to init helper coverity scanner says: 2187 if (nft_is_base_chain(chain)) { vvv CID 1505166: Memory - corruptions (UNINIT) vvv Using uninitialized value "basechain". 2188 basechain->ops.hook_ops_type = NF_HOOK_OP_NF_TABLES; ... I don't see how nft_is_base_chain() can evaluate to true while basechain pointer is garbage. However, it seems better to place the NF_HOOK_OP_NF_TABLES annotation in nft_basechain_hook_init() instead. Reported-by: coverity-bot Addresses-Coverity-ID: 1505166 ("Memory - corruptions") Fixes: 65b8b7bfc5284f ("netfilter: annotate nf_tables base hook ops") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c9308241b688..caaff7ab9e73 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1997,11 +1997,12 @@ static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { - ops->pf = family; - ops->hooknum = hook->num; - ops->priority = hook->priority; - ops->priv = chain; - ops->hook = hook->type->hooks[ops->hooknum]; + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; + ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, @@ -2168,10 +2169,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; - if (nft_is_base_chain(chain)) { - basechain->ops.hook_ops_type = NF_HOOK_OP_NF_TABLES; + if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; - } err = nft_chain_add(table, chain); if (err < 0) { -- cgit v1.2.3-58-ga151 From db67f2493431c32935e5b239175df4b0b9cf0171 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Tue, 8 Jun 2021 20:03:17 -0700 Subject: net/x25: fix a mistake in grammar Fix a mistake in grammar. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 1816899499ce..3583354a7d7f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -366,7 +366,7 @@ static void x25_destroy_timer(struct timer_list *t) /* * This is called from user mode and the timers. Thus it protects itself - * against interrupt users but doesn't worry about being called during + * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer -- cgit v1.2.3-58-ga151 From 711d1dee1c86294b43e33202fb1eabd7e524ed9a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Jun 2021 12:54:31 +0300 Subject: devlink: Fix error message in devlink_rate_set_ops_supported() The WARN_ON() macro takes a condition, it doesn't take a message. Use WARN() instead. Fixes: 1897db2ec310 ("devlink: Allow setting tx rate for devlink rate leaf objects") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 5260bdfb2403..3bdb7eac730a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1732,7 +1732,7 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, return false; } } else { - WARN_ON("Unknown type of rate object"); + WARN(1, "Unknown type of rate object"); return false; } -- cgit v1.2.3-58-ga151 From 268551503d66dc0a266fe6034c84a31ab4f3edf7 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 18:11:59 -0700 Subject: vsock/vmci: remove the repeated word "be" Remove the repeated word "be". Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c99bc4ce78e2..e617ed93f06b 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1248,7 +1248,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, vsock_remove_pending(listener, pending); vsock_enqueue_accept(listener, pending); - /* Callers of accept() will be be waiting on the listening socket, not + /* Callers of accept() will be waiting on the listening socket, not * the pending socket. */ listener->sk_data_ready(listener); -- cgit v1.2.3-58-ga151 From 4e03d073afc4f6e5b1f34e58cce7d9942d703488 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 20:09:35 -0700 Subject: af_unix: remove the repeated word "and" Remove the repeated word "and". Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5a31307ceb76..4d4f24cbd86b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1392,7 +1392,7 @@ restart: unix_state_unlock(sk); - /* take ten and and send info to listening sock */ + /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); -- cgit v1.2.3-58-ga151 From 15139bcbb610f54f4362f099ae6bf9b824b97c82 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 22:50:46 -0700 Subject: node.c: fix the use of indefinite article Fix the use of indefinite article. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/tipc/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 81af92954c6c..9947b7dfe1d2 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1214,7 +1214,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. - * It may also be an cloned or malicious peer having + * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. -- cgit v1.2.3-58-ga151 From 326af505ca1fbad6b9b7ba9f36399ceba0b6aba2 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 23:18:53 -0700 Subject: tipc: socket.c: fix the use of copular verb Fix the use of copular verb. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 575a0238deb2..34a97ea36cc8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -662,7 +662,7 @@ static int tipc_release(struct socket *sock) * @skaddr: socket address describing name(s) and desired operation * @alen: size of socket address data structure * - * Name and name sequence binding is indicated using a positive scope value; + * Name and name sequence binding are indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * -- cgit v1.2.3-58-ga151 From f1dcdc075617a2a8a866f4f928a780287a553ed0 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 23:29:58 -0700 Subject: tipc:subscr.c: fix a spelling mistake Fix a spelling mistake. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/tipc/subscr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 8e00d739f03a..05d49ad81290 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -66,7 +66,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for - * @found: the service range we are checning for match + * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. */ -- cgit v1.2.3-58-ga151 From bc831facf8a11e4e615dc67ae790325710bc1979 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 10 Jun 2021 20:48:26 +0800 Subject: net: x25: Use list_for_each_entry() to simplify code in x25_route.c Convert list_for_each() to list_for_each_entry() where applicable. This simplifies the code. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- net/x25/x25_route.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 9fbe4bb38d94..647f325ed867 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -27,14 +27,11 @@ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits) goto out; @@ -78,14 +75,11 @@ static int x25_del_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits && rt->dev == dev) { __x25_remove_route(rt); @@ -141,13 +135,10 @@ struct net_device *x25_dev_get(char *devname) struct x25_route *x25_get_route(struct x25_address *addr) { struct x25_route *rt, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, addr, rt->sigdigits)) { if (!use) use = rt; -- cgit v1.2.3-58-ga151 From 73e42909ef2d1fa554b39bf6ae3eb1546dfc97a5 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 10 Jun 2021 21:03:55 +0800 Subject: atm: Use list_for_each_entry() to simplify code in resources.c Convert list_for_each() to list_for_each_entry() where applicable. This simplifies the code. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- net/atm/resources.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/atm/resources.c b/net/atm/resources.c index 53236986dfe0..2b2d33eeaf20 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -52,10 +52,8 @@ static struct atm_dev *__alloc_atm_dev(const char *type) static struct atm_dev *__atm_dev_lookup(int number) { struct atm_dev *dev; - struct list_head *p; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { if (dev->number == number) { atm_dev_hold(dev); return dev; @@ -215,8 +213,7 @@ int atm_getnames(void __user *buf, int __user *iobuf_len) return -ENOMEM; } tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { *tmp_p++ = dev->number; } mutex_unlock(&atm_dev_mutex); -- cgit v1.2.3-58-ga151 From cb8e2e4300fc17e1028cce554ecf72a9e6161742 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 10 Jun 2021 21:26:03 +0800 Subject: dccp: tfrc: fix doc warnings in tfrc_equation.c Add description for `tfrc_invert_loss_event_rate` to fix the W=1 warnings: net/dccp/ccids/lib/tfrc_equation.c:695: warning: Function parameter or member 'loss_event_rate' not described in 'tfrc_invert_loss_event_rate' Signed-off-by: Baokun Li Reviewed-by: Richard Sailer Signed-off-by: David S. Miller --- net/dccp/ccids/lib/tfrc_equation.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index e2a337fa9ff7..92a8c6bea316 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -688,6 +688,7 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ -- cgit v1.2.3-58-ga151 From 7a7ae1eba24a04fdaf84ef6a11760b5b8db3f723 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Tue, 8 Jun 2021 19:27:46 -0700 Subject: xfrm: policy: fix a spelling mistake Fix a spelling mistake. Signed-off-by: gushengxian Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ce500f847b99..1e24b21457f7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3247,7 +3247,7 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, /* * 0 or more than 0 is returned when validation is succeeded (either bypass - * because of optional transport mode, or next index of the mathced secpath + * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. -- cgit v1.2.3-58-ga151 From 9acf4d3b9ec15f27a7d027c4ae4736c2fb967391 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Jun 2021 12:50:10 +0200 Subject: xfrm: ipv6: add xfrm6_hdr_offset helper This moves the ->hdr_offset indirect call to a new helper. A followup patch can then modify the new function to replace the indirect call by direct calls to the required hdr_offset helper. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4cb0ff4dcf4..6b44b6e738f7 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -77,6 +77,11 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) +{ + return x->type->hdr_offset(x, skb, prevhdr); +} + /* Add encapsulation header. * * The IP header and mutable extension headers will be moved forward to make @@ -92,7 +97,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, @@ -122,7 +127,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, -- cgit v1.2.3-58-ga151 From 37b9e7eb55659b270f0e8aebd98308716d935586 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Jun 2021 12:50:11 +0200 Subject: xfrm: ipv6: move mip6_destopt_offset into xfrm core This helper is relatively small, just move this to the xfrm core and call it directly. Next patch does the same for the ROUTING type. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv6/mip6.c | 49 ------------------------------------------- net/xfrm/xfrm_output.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index bc560e1664aa..fba3b56a7dd2 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -247,54 +247,6 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, return err; } -static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) -{ - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); - int found_rhdr = 0; - - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - - switch (**nexthdr) { - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - found_rhdr = 1; - break; - case NEXTHDR_DEST: - /* - * HAO MUST NOT appear more than once. - * XXX: It is better to try to find by the end of - * XXX: packet if HAO exists. - */ - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { - net_dbg_ratelimited("mip6: hao exists already, override\n"); - return offset; - } - - if (found_rhdr) - return offset; - - break; - default: - return offset; - } - - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - } - - return offset; -} - static int mip6_destopt_init_state(struct xfrm_state *x) { if (x->id.spi) { @@ -332,7 +284,6 @@ static const struct xfrm_type mip6_destopt_type = { .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, - .hdr_offset = mip6_destopt_offset, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 6b44b6e738f7..29959054a535 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -77,8 +77,65 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, + u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); + int found_rhdr = 0; + + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + switch (**nexthdr) { + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: + /* HAO MUST NOT appear more than once. + * XXX: It is better to try to find by the end of + * XXX: packet if HAO exists. + */ + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { + net_dbg_ratelimited("mip6: hao exists already, override\n"); + return offset; + } + + if (found_rhdr) + return offset; + + break; + default: + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + } + + return offset; +} +#endif + static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) { + switch (x->type->proto) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPPROTO_DSTOPTS: + return mip6_destopt_offset(x, skb, prevhdr); +#endif + default: + break; + } + return x->type->hdr_offset(x, skb, prevhdr); } -- cgit v1.2.3-58-ga151 From 848b18fb7fbd2fa5bc4fc2263bb69956fb86120d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Jun 2021 12:50:12 +0200 Subject: xfrm: ipv6: move mip6_rthdr_offset into xfrm core Place the call into the xfrm core. After this all remaining users set the hdr_offset function pointer to the same function which opens the possiblity to remove the indirection. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv6/mip6.c | 48 ------------------------------------------------ net/xfrm/xfrm_output.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index fba3b56a7dd2..aeb35d26e474 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -333,53 +333,6 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) -{ - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); - int found_rhdr = 0; - - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - - switch (**nexthdr) { - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - if (offset + 3 <= packet_len) { - struct ipv6_rt_hdr *rt; - rt = (struct ipv6_rt_hdr *)(nh + offset); - if (rt->type != 0) - return offset; - } - found_rhdr = 1; - break; - case NEXTHDR_DEST: - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) - return offset; - - if (found_rhdr) - return offset; - - break; - default: - return offset; - } - - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - } - - return offset; -} - static int mip6_rthdr_init_state(struct xfrm_state *x) { if (x->id.spi) { @@ -413,7 +366,6 @@ static const struct xfrm_type mip6_rthdr_type = { .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, - .hdr_offset = mip6_rthdr_offset, }; static int __init mip6_init(void) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 29959054a535..1734339b6dd0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -123,6 +123,53 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, return offset; } + +static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, + u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); + int found_rhdr = 0; + + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + switch (**nexthdr) { + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + if (offset + 3 <= packet_len) { + struct ipv6_rt_hdr *rt; + + rt = (struct ipv6_rt_hdr *)(nh + offset); + if (rt->type != 0) + return offset; + } + found_rhdr = 1; + break; + case NEXTHDR_DEST: + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + return offset; + + if (found_rhdr) + return offset; + + break; + default: + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + } + + return offset; +} #endif static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) @@ -131,6 +178,8 @@ static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prev #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_DSTOPTS: return mip6_destopt_offset(x, skb, prevhdr); + case IPPROTO_ROUTING: + return mip6_rthdr_offset(x, skb, prevhdr); #endif default: break; -- cgit v1.2.3-58-ga151 From d1002d2490e3ebc30dd3ba747656cfa90c87e984 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Jun 2021 12:50:13 +0200 Subject: xfrm: remove hdr_offset indirection After previous patches all remaining users set the function pointer to the same function: xfrm6_find_1stfragopt. So remove this function pointer and call ip6_find_1stfragopt directly. Reduces size of xfrm_type to 64 bytes on 64bit platforms. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- net/ipv6/ah6.c | 1 - net/ipv6/esp6.c | 1 - net/ipv6/ipcomp6.c | 1 - net/ipv6/xfrm6_output.c | 7 ------- net/xfrm/xfrm_output.c | 2 +- 6 files changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1aad78c5f2d5..c8890da00b8a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -402,7 +402,6 @@ struct xfrm_type { int (*output)(struct xfrm_state *, struct sk_buff *pskb); int (*reject)(struct xfrm_state *, struct sk_buff *, const struct flowi *); - int (*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **); }; int xfrm_register_type(const struct xfrm_type *type, unsigned short family); @@ -1605,8 +1604,6 @@ __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); -int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, - u8 **prevhdr); #ifdef CONFIG_XFRM void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index e9705c256068..828e62514260 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -762,7 +762,6 @@ static const struct xfrm_type ah6_type = { .destructor = ah6_destroy, .input = ah6_input, .output = ah6_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol ah6_protocol = { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index be2c0ac76eaa..37c4b1726c5e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -1250,7 +1250,6 @@ static const struct xfrm_type esp6_type = { .destructor = esp6_destroy, .input = esp6_input, .output = esp6_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol esp6_protocol = { diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 491aba66b7ae..15f984be3570 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -178,7 +178,6 @@ static const struct xfrm_type ipcomp6_type = { .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol ipcomp6_protocol = { diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8b84d534b19d..57fa27c1cdf9 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -16,13 +16,6 @@ #include #include -int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, - u8 **prevhdr) -{ - return ip6_find_1stfragopt(skb, prevhdr); -} -EXPORT_SYMBOL(xfrm6_find_1stfragopt); - void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 1734339b6dd0..10842d5cf6e1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -185,7 +185,7 @@ static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prev break; } - return x->type->hdr_offset(x, skb, prevhdr); + return ip6_find_1stfragopt(skb, prevhdr); } /* Add encapsulation header. -- cgit v1.2.3-58-ga151 From 3ca5ca83e206eab566830e08664eda415f428374 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Jun 2021 12:50:14 +0200 Subject: xfrm: merge dstopt and routing hdroff functions Both functions are very similar, so merge them into one. The nexthdr is passed as argument to break the loop in the ROUTING case, this is the only header type where slightly different rules apply. While at it, the merged function is realigned with ip6_find_1stfragopt(). That function received bug fixes for an infinite loop, but neither dstopt nor rh parsing functions (copy-pasted from ip6_find_1stfragopt) were changed. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 80 ++++++++++++++------------------------------------ 1 file changed, 22 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 10842d5cf6e1..e14fca1fb003 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -78,24 +78,30 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) } #if IS_ENABLED(CONFIG_IPV6_MIP6) -static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) +static int mip6_rthdr_offset(struct sk_buff *skb, u8 **nexthdr, int type) { - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); + unsigned int offset = sizeof(struct ipv6hdr); + unsigned int packet_len; int found_rhdr = 0; + packet_len = skb_tail_pointer(skb) - nh; *nexthdr = &ipv6_hdr(skb)->nexthdr; - while (offset + 1 <= packet_len) { + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; + switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: + if (type == IPPROTO_ROUTING && offset + 3 <= packet_len) { + struct ipv6_rt_hdr *rt; + + rt = (struct ipv6_rt_hdr *)(nh + offset); + if (rt->type != 0) + return offset; + } found_rhdr = 1; break; case NEXTHDR_DEST: @@ -116,59 +122,18 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, return offset; } - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - } - - return offset; -} - -static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) -{ - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); - int found_rhdr = 0; - - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - switch (**nexthdr) { - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - if (offset + 3 <= packet_len) { - struct ipv6_rt_hdr *rt; - - rt = (struct ipv6_rt_hdr *)(nh + offset); - if (rt->type != 0) - return offset; - } - found_rhdr = 1; - break; - case NEXTHDR_DEST: - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) - return offset; - - if (found_rhdr) - return offset; - - break; - default: - return offset; - } + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) + return -EINVAL; *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); } - return offset; + return -EINVAL; } #endif @@ -177,9 +142,8 @@ static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prev switch (x->type->proto) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_DSTOPTS: - return mip6_destopt_offset(x, skb, prevhdr); case IPPROTO_ROUTING: - return mip6_rthdr_offset(x, skb, prevhdr); + return mip6_rthdr_offset(skb, prevhdr, x->type->proto); #endif default: break; -- cgit v1.2.3-58-ga151 From 4e50025129efabb07714c1f27a80526897da374b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:24 +0300 Subject: net: dsa: generalize overhead for taggers that use both headers and trailers Some really really weird switches just couldn't decide whether to use a normal or a tail tagger, so they just did both. This creates problems for DSA, because we only have the concept of an 'overhead' which can be applied to the headroom or to the tailroom of the skb (like for example during the central TX reallocation procedure), depending on the value of bool tail_tag, but not to both. We need to generalize DSA to cater for these odd switches by transforming the 'overhead / tail_tag' pair into 'needed_headroom / needed_tailroom'. The DSA master's MTU is increased to account for both. The flow dissector code is modified such that it only calls the DSA adjustment callback if the tagger has a non-zero header length. Taggers are trivially modified to declare either needed_headroom or needed_tailroom, based on the tail_tag value that they currently declare. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 21 +++++++++++---------- include/net/dsa.h | 6 +++--- net/core/flow_dissector.c | 2 +- net/dsa/dsa_priv.h | 5 +++++ net/dsa/master.c | 6 ++++-- net/dsa/slave.c | 10 ++++------ net/dsa/tag_ar9331.c | 2 +- net/dsa/tag_brcm.c | 6 +++--- net/dsa/tag_dsa.c | 4 ++-- net/dsa/tag_gswip.c | 2 +- net/dsa/tag_hellcreek.c | 3 +-- net/dsa/tag_ksz.c | 9 +++------ net/dsa/tag_lan9303.c | 2 +- net/dsa/tag_mtk.c | 2 +- net/dsa/tag_ocelot.c | 4 ++-- net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_qca.c | 2 +- net/dsa/tag_rtl4_a.c | 2 +- net/dsa/tag_sja1105.c | 2 +- net/dsa/tag_trailer.c | 3 +-- net/dsa/tag_xrs700x.c | 3 +-- 21 files changed, 49 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 8688009514cc..20baacf2bc5c 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -93,14 +93,15 @@ A tagging protocol may tag all packets with switch tags of the same length, or the tag length might vary (for example packets with PTP timestamps might require an extended switch tag, or there might be one tag length on TX and a different one on RX). Either way, the tagging protocol driver must populate the -``struct dsa_device_ops::overhead`` with the length in octets of the longest -switch frame header. The DSA framework will automatically adjust the MTU of the -master interface to accomodate for this extra size in order for DSA user ports -to support the standard MTU (L2 payload length) of 1500 octets. The ``overhead`` -is also used to request from the network stack, on a best-effort basis, the -allocation of packets with a ``needed_headroom`` or ``needed_tailroom`` -sufficient such that the act of pushing the switch tag on transmission of a -packet does not cause it to reallocate due to lack of memory. +``struct dsa_device_ops::needed_headroom`` and/or ``struct dsa_device_ops::needed_tailroom`` +with the length in octets of the longest switch frame header/trailer. The DSA +framework will automatically adjust the MTU of the master interface to +accommodate for this extra size in order for DSA user ports to support the +standard MTU (L2 payload length) of 1500 octets. The ``needed_headroom`` and +``needed_tailroom`` properties are also used to request from the network stack, +on a best-effort basis, the allocation of packets with enough extra space such +that the act of pushing the switch tag on transmission of a packet does not +cause it to reallocate due to lack of memory. Even though applications are not expected to parse DSA-specific frame headers, the format on the wire of the tagging protocol represents an Application Binary @@ -169,8 +170,8 @@ The job of this method is to prepare the skb in a way that the switch will understand what egress port the packet is for (and not deliver it towards other ports). Typically this is fulfilled by pushing a frame header. Checking for insufficient size in the skb headroom or tailroom is unnecessary provided that -the ``overhead`` and ``tail_tag`` properties were filled out properly, because -DSA ensures there is enough space before calling this method. +the ``needed_headroom`` and ``needed_tailroom`` properties were filled out +properly, because DSA ensures there is enough space before calling this method. The reception of a packet goes through the tagger's ``rcv`` function. The passed ``struct sk_buff *skb`` has ``skb->data`` pointing at diff --git a/include/net/dsa.h b/include/net/dsa.h index e1a2610a0e06..0a10f6fffc3d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -91,7 +91,8 @@ struct dsa_device_ops { * as regular on the master net device. */ bool (*filter)(const struct sk_buff *skb, struct net_device *dev); - unsigned int overhead; + unsigned int needed_headroom; + unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC @@ -100,7 +101,6 @@ struct dsa_device_ops { * its RX filter. */ bool promisc_on_master; - bool tail_tag; }; /* This structure defines the control interfaces that are overlayed by the @@ -926,7 +926,7 @@ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; - int tag_len = ops->overhead; + int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3ed7c98a98e1..c04455981c1e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -944,7 +944,7 @@ bool __skb_flow_dissect(const struct net *net, ops = skb->dev->dsa_ptr->tag_ops; /* Tail taggers don't break flow dissection */ - if (!ops->tail_tag) { + if (!ops->needed_headroom) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 92282de54230..b8b17474b72b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -154,6 +154,11 @@ const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); +static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) +{ + return ops->needed_headroom + ops->needed_tailroom; +} + /* master.c */ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); void dsa_master_teardown(struct net_device *dev); diff --git a/net/dsa/master.c b/net/dsa/master.c index 63adbc21a735..3fc90e36772d 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -346,10 +346,12 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { - int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops; struct dsa_switch *ds = cpu_dp->ds; struct device_link *consumer_link; - int ret; + int mtu, ret; + + mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops); /* The DSA master must use SET_NETDEV_DEV for this to work. */ consumer_link = device_link_add(ds->dev, dev->dev.parent, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d4756b920108..3ca509eb284d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1569,7 +1569,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) mtu_limit = min_t(int, master->max_mtu, dev->max_mtu); old_master_mtu = master->mtu; - new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead; + new_master_mtu = largest_mtu + dsa_tag_protocol_overhead(cpu_dp->tag_ops); if (new_master_mtu > mtu_limit) return -ERANGE; @@ -1605,7 +1605,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) out_port_failed: if (new_master_mtu != old_master_mtu) dsa_port_mtu_change(cpu_dp, old_master_mtu - - cpu_dp->tag_ops->overhead, + dsa_tag_protocol_overhead(cpu_dp->tag_ops), true); out_cpu_failed: if (new_master_mtu != old_master_mtu) @@ -1824,10 +1824,8 @@ void dsa_slave_setup_tagger(struct net_device *slave) const struct dsa_port *cpu_dp = dp->cpu_dp; struct net_device *master = cpu_dp->master; - if (cpu_dp->tag_ops->tail_tag) - slave->needed_tailroom = cpu_dp->tag_ops->overhead; - else - slave->needed_headroom = cpu_dp->tag_ops->overhead; + slave->needed_headroom = cpu_dp->tag_ops->needed_headroom; + slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the master) * by also inheriting the master's needed headroom and tailroom. * The 8021q driver also does this. diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 002cf7f952e2..0efae1a372b3 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -85,7 +85,7 @@ static const struct dsa_device_ops ar9331_netdev_ops = { .proto = DSA_TAG_PROTO_AR9331, .xmit = ar9331_tag_xmit, .rcv = ar9331_tag_rcv, - .overhead = AR9331_HDR_LEN, + .needed_headroom = AR9331_HDR_LEN, }; MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 40e9f3098c8d..0750af951fc9 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -205,7 +205,7 @@ static const struct dsa_device_ops brcm_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM, .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_netdev_ops); @@ -286,7 +286,7 @@ static const struct dsa_device_ops brcm_legacy_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_LEGACY, .xmit = brcm_leg_tag_xmit, .rcv = brcm_leg_tag_rcv, - .overhead = BRCM_LEG_TAG_LEN, + .needed_headroom = BRCM_LEG_TAG_LEN, }; DSA_TAG_DRIVER(brcm_legacy_netdev_ops); @@ -314,7 +314,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_PREPEND, .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_prepend_netdev_ops); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 7e7b7decdf39..a822355afc90 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -303,7 +303,7 @@ static const struct dsa_device_ops dsa_netdev_ops = { .proto = DSA_TAG_PROTO_DSA, .xmit = dsa_xmit, .rcv = dsa_rcv, - .overhead = DSA_HLEN, + .needed_headroom = DSA_HLEN, }; DSA_TAG_DRIVER(dsa_netdev_ops); @@ -346,7 +346,7 @@ static const struct dsa_device_ops edsa_netdev_ops = { .proto = DSA_TAG_PROTO_EDSA, .xmit = edsa_xmit, .rcv = edsa_rcv, - .overhead = EDSA_HLEN, + .needed_headroom = EDSA_HLEN, }; DSA_TAG_DRIVER(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 2f5bd5e338ab..5985dab06ab8 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -103,7 +103,7 @@ static const struct dsa_device_ops gswip_netdev_ops = { .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, - .overhead = GSWIP_RX_HEADER_LEN, + .needed_headroom = GSWIP_RX_HEADER_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index a09805c8e1ab..424130f85f59 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -54,8 +54,7 @@ static const struct dsa_device_ops hellcreek_netdev_ops = { .proto = DSA_TAG_PROTO_HELLCREEK, .xmit = hellcreek_xmit, .rcv = hellcreek_rcv, - .overhead = HELLCREEK_TAG_LEN, - .tail_tag = true, + .needed_tailroom = HELLCREEK_TAG_LEN, }; MODULE_LICENSE("Dual MIT/GPL"); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 4820dbcedfa2..53565f48934c 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -77,8 +77,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ8795, .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz8795_netdev_ops); @@ -149,8 +148,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ9477_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); @@ -183,8 +181,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index aa1318dccaf0..26207ef39ebc 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -125,7 +125,7 @@ static const struct dsa_device_ops lan9303_netdev_ops = { .proto = DSA_TAG_PROTO_LAN9303, .xmit = lan9303_xmit, .rcv = lan9303_rcv, - .overhead = LAN9303_TAG_LEN, + .needed_headroom = LAN9303_TAG_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index f9b2966d1936..cc3ba864ad5b 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -102,7 +102,7 @@ static const struct dsa_device_ops mtk_netdev_ops = { .proto = DSA_TAG_PROTO_MTK, .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, - .overhead = MTK_HDR_LEN, + .needed_headroom = MTK_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 91f0fd1242cd..190f4bfd3bef 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -143,7 +143,7 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; @@ -155,7 +155,7 @@ static const struct dsa_device_ops seville_netdev_ops = { .proto = DSA_TAG_PROTO_SEVILLE, .xmit = seville_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 62a93303bd63..663b74793cfc 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -73,7 +73,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT_8021Q, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 88181b52f480..693bda013065 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -91,7 +91,7 @@ static const struct dsa_device_ops qca_netdev_ops = { .proto = DSA_TAG_PROTO_QCA, .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, - .overhead = QCA_HDR_LEN, + .needed_headroom = QCA_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index cf8ac316f4c7..57c46b4ab2b3 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -124,7 +124,7 @@ static const struct dsa_device_ops rtl4a_netdev_ops = { .proto = DSA_TAG_PROTO_RTL4_A, .xmit = rtl4a_tag_xmit, .rcv = rtl4a_tag_rcv, - .overhead = RTL4_A_HDR_LEN, + .needed_headroom = RTL4_A_HDR_LEN, }; module_dsa_tag_driver(rtl4a_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 50496013cdb7..ff4a81eae16f 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -362,7 +362,7 @@ static const struct dsa_device_ops sja1105_netdev_ops = { .xmit = sja1105_xmit, .rcv = sja1105_rcv, .filter = sja1105_filter, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .flow_dissect = sja1105_flow_dissect, .promisc_on_master = true, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5b97ede56a0f..ba73804340a5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -55,8 +55,7 @@ static const struct dsa_device_ops trailer_netdev_ops = { .proto = DSA_TAG_PROTO_TRAILER, .xmit = trailer_xmit, .rcv = trailer_rcv, - .overhead = 4, - .tail_tag = true, + .needed_tailroom = 4, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index 858cdf9d2913..a31ff7fcb45f 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -56,8 +56,7 @@ static const struct dsa_device_ops xrs700x_netdev_ops = { .proto = DSA_TAG_PROTO_XRS700X, .xmit = xrs700x_xmit, .rcv = xrs700x_rcv, - .overhead = 1, - .tail_tag = true, + .needed_tailroom = 1, }; MODULE_LICENSE("GPL"); -- cgit v1.2.3-58-ga151 From baa3ad08de6d44a40b94ef1a65640b5076755f9d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:25 +0300 Subject: net: dsa: tag_sja1105: stop resetting network and transport headers This makes no sense and is not needed, it is probably a debugging leftover. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index ff4a81eae16f..92e147293acf 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -307,8 +307,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, __skb_vlan_pop(skb, &tci); } skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); vid = tci & VLAN_VID_MASK; source_port = dsa_8021q_rx_source_port(vid); -- cgit v1.2.3-58-ga151 From 233697b3b3f60b17d02ca2a35230aee0ac6f1759 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:27 +0300 Subject: net: dsa: tag_8021q: refactor RX VLAN parsing into a dedicated function The added value of this function is that it can deal with both the case where the VLAN header is in the skb head, as well as in the offload field. This is something I was not able to do using other functions in the network stack. Since both ocelot-8021q and sja1105 need to do the same stuff, let's make it a common service provided by tag_8021q. This is done as refactoring for the new SJA1110 tagger, which partly uses tag_8021q as well (just like SJA1105), and will be the third caller. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 3 +++ net/dsa/tag_8021q.c | 23 +++++++++++++++++++++++ net/dsa/tag_ocelot_8021q.c | 18 ++---------------- net/dsa/tag_sja1105.c | 33 +++++++++++---------------------- 4 files changed, 39 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index cbf2c9b1ee4f..1587961f1a7b 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -50,6 +50,9 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *subvlan); + u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 122ad5833fb1..4aa29f90ecea 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -471,4 +471,27 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *subvlan) +{ + u16 vid, tci; + + skb_push_rcsum(skb, ETH_HLEN); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } + skb_pull_rcsum(skb, ETH_HLEN); + + vid = tci & VLAN_VID_MASK; + + *source_port = dsa_8021q_rx_source_port(vid); + *switch_id = dsa_8021q_rx_switch_id(vid); + *subvlan = dsa_8021q_rx_subvlan(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rcv); + MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 663b74793cfc..85ac85c3af8c 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -41,29 +41,15 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - int src_port, switch_id, qos_class; - u16 vid, tci; + int src_port, switch_id, subvlan; - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - - vid = tci & VLAN_VID_MASK; - src_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) return NULL; skb->offload_fwd_mark = 1; - skb->priority = qos_class; return skb; } diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 92e147293acf..a70625fe64f7 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -275,44 +275,33 @@ static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); } +static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) +{ + u16 tpid = ntohs(eth_hdr(skb)->h_proto); + + return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || + skb_vlan_tag_present(skb); +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { + int source_port, switch_id, subvlan = 0; struct sja1105_meta meta = {0}; - int source_port, switch_id; struct ethhdr *hdr; - u16 tpid, vid, tci; bool is_link_local; - u16 subvlan = 0; - bool is_tagged; bool is_meta; hdr = eth_hdr(skb); - tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || - skb_vlan_tag_present(skb)); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); skb->offload_fwd_mark = 1; - if (is_tagged) { + if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - - vid = tci & VLAN_VID_MASK; - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - subvlan = dsa_8021q_rx_subvlan(vid); + dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of -- cgit v1.2.3-58-ga151 From 617ef8d9377b9aac381c023cd0823da264c2f463 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:28 +0300 Subject: net: dsa: sja1105: make SJA1105_SKB_CB fit a full timestamp In SJA1105, RX timestamps for packets sent to the CPU are transmitted in separate follow-up packets (metadata frames). These contain partial timestamps (24 or 32 bits) which are kept in SJA1105_SKB_CB(skb)->meta_tstamp. Thankfully, SJA1110 improved that, and the RX timestamps are now transmitted in-band with the actual packet, in the timestamp trailer. The RX timestamps are now full-width 64 bits. Because we process the RX DSA tags in the rcv() method in the tagger, but we would like to preserve the DSA code structure in that we populate the skb timestamp in the port_rxtstamp() call which only happens later, the implication is that we must somehow pass the 64-bit timestamp from the rcv() method all the way to port_rxtstamp(). We can use the skb->cb for that. Rename the meta_tstamp from struct sja1105_skb_cb from "meta_tstamp" to "tstamp", and increase its size to 64 bits. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_ptp.c | 2 +- include/linux/dsa/sja1105.h | 2 +- net/dsa/tag_sja1105.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 0bc566b9e958..dea82f8a40c4 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -397,7 +397,7 @@ static long sja1105_rxtstamp_work(struct ptp_clock_info *ptp) *shwt = (struct skb_shared_hwtstamps) {0}; - ts = SJA1105_SKB_CB(skb)->meta_tstamp; + ts = SJA1105_SKB_CB(skb)->tstamp; ts = sja1105_tstamp_reconstruct(ds, ticks, ts); shwt->hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(ts)); diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 1eb84562b311..865a548a6ef2 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -48,7 +48,7 @@ struct sja1105_tagger_data { struct sja1105_skb_cb { struct sk_buff *clone; - u32 meta_tstamp; + u64 tstamp; }; #define SJA1105_SKB_CB(skb) \ diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index a70625fe64f7..11f555dd9566 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -147,7 +147,7 @@ static void sja1105_transfer_meta(struct sk_buff *skb, hdr->h_dest[3] = meta->dmac_byte_3; hdr->h_dest[4] = meta->dmac_byte_4; - SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; + SJA1105_SKB_CB(skb)->tstamp = meta->tstamp; } /* This is a simple state machine which follows the hardware mechanism of -- cgit v1.2.3-58-ga151 From 4913b8ebf8a9c56ce66466b4daa07d7d4678cdd8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:29 +0300 Subject: net: dsa: add support for the SJA1110 native tagging protocol The SJA1110 has improved a few things compared to SJA1105: - To send a control packet from the host port with SJA1105, one needed to program a one-shot "management route" over SPI. This is no longer true with SJA1110, you can actually send "in-band control extensions" in the packets sent by DSA, these are in fact DSA tags which contain the destination port and switch ID. - When receiving a control packet from the switch with SJA1105, the source port and switch ID were written in bytes 3 and 4 of the destination MAC address of the frame (which was a very poor shot at a DSA header). If the control packet also had an RX timestamp, that timestamp was sent in an actual follow-up packet, so there were reordering concerns on multi-core/multi-queue DSA masters, where the metadata frame with the RX timestamp might get processed before the actual packet to which that timestamp belonged (there is no way to pair a packet to its timestamp other than the order in which they were received). On SJA1110, this is no longer true, control packets have the source port, switch ID and timestamp all in the DSA tags. - Timestamps from the switch were partial: to get a 64-bit timestamp as required by PTP stacks, one would need to take the partial 24-bit or 32-bit timestamp from the packet, then read the current PTP time very quickly, and then patch in the high bits of the current PTP time into the captured partial timestamp, to reconstruct what the full 64-bit timestamp must have been. That is awful because packet processing is done in NAPI context, but reading the current PTP time is done over SPI and therefore needs sleepable context. But it also aggravated a few things: - Not only is there a DSA header in SJA1110, but there is a DSA trailer in fact, too. So DSA needs to be extended to support taggers which have both a header and a trailer. Very unconventional - my understanding is that the trailer exists because the timestamps couldn't be prepared in time for putting them in the header area. - Like SJA1105, not all packets sent to the CPU have the DSA tag added to them, only control packets do: * the ones which match the destination MAC filters/traps in MAC_FLTRES1 and MAC_FLTRES0 * the ones which match FDB entries which have TRAP or TAKETS bits set So we could in theory hack something up to request the switch to take timestamps for all packets that reach the CPU, and those would be DSA-tagged and contain the source port / switch ID by virtue of the fact that there needs to be a timestamp trailer provided. BUT: - The SJA1110 does not parse its own DSA tags in a way that is useful for routing in cross-chip topologies, a la Marvell. And the sja1105 driver already supports cross-chip bridging from the SJA1105 days. It does that by automatically setting up the DSA links as VLAN trunks which contain all the necessary tag_8021q RX VLANs that must be communicated between the switches that span the same bridge. So when using tag_8021q on sja1105, it is possible to have 2 switches with ports sw0p0, sw0p1, sw1p0, sw1p1, and 2 VLAN-unaware bridges br0 and br1, and br0 can take sw0p0 and sw1p0, and br1 can take sw0p1 and sw1p1, and forwarding will happen according to the expected rules of the Linux bridge. We like that, and we don't want that to go away, so as a matter of fact, the SJA1110 tagger still needs to support tag_8021q. So the sja1110 tagger is a hybrid between tag_8021q for data packets, and the native hardware support for control packets. On RX, packets have a 13-byte trailer if they contain an RX timestamp. That trailer is padded in such a way that its byte 8 (the start of the "residence time" field - not parsed by Linux because we don't care) is aligned on a 16 byte boundary. So the padding has a variable length between 0 and 15 bytes. The DSA header contains the offset of the beginning of the padding relative to the beginning of the frame (and the end of the padding is obviously the end of the packet minus 13 bytes, the length of the trailer). So we discard it. Packets which don't have a trailer contain the source port and switch ID information in the header (they are "trap-to-host" packets). Packets which have a trailer contain the source port and switch ID in the trailer. On TX, the destination port mask and switch ID is always in the trailer, so we always need to say in the header that a trailer is present. The header needs a custom EtherType and this was chosen as 0xdadc, after 0xdada which is for Marvell and 0xdadb which is for VLANs in VLAN-unaware mode on SJA1105 (and SJA1110 in fact too). Because we use tag_8021q in concert with the native tagging protocol, control packets will have 2 DSA tags. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 1 + drivers/net/dsa/sja1105/sja1105_main.c | 6 +- drivers/net/dsa/sja1105/sja1105_spi.c | 10 ++ drivers/net/dsa/sja1105/sja1105_static_config.c | 1 + drivers/net/dsa/sja1105/sja1105_static_config.h | 1 + include/linux/dsa/sja1105.h | 1 + include/net/dsa.h | 2 + net/dsa/tag_sja1105.c | 221 +++++++++++++++++++++++- 8 files changed, 240 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 4d192331754c..a6d64b27e6a9 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -110,6 +110,7 @@ struct sja1105_info { int max_frame_mem; int num_ports; bool multiple_cascade_ports; + enum dsa_tag_protocol tag_proto; const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 850bbc793369..6e2cfbf605ef 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -667,6 +667,8 @@ static int sja1105_init_general_params(struct sja1105_private *priv) .tpid2 = ETH_P_SJA1105, /* Enable the TTEthernet engine on SJA1110 */ .tte_en = true, + /* Set up the EtherType for control packets on SJA1110 */ + .header_type = ETH_P_SJA1110, }; struct sja1105_general_params_entry *general_params; struct dsa_switch *ds = priv->ds; @@ -2174,7 +2176,9 @@ static enum dsa_tag_protocol sja1105_get_tag_protocol(struct dsa_switch *ds, int port, enum dsa_tag_protocol mp) { - return DSA_TAG_PROTO_SJA1105; + struct sja1105_private *priv = ds->priv; + + return priv->info->tag_proto; } static int sja1105_find_free_subvlan(u16 *subvlan_map, bool pvid) diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index e6c2a37aa617..9156f4cc11f2 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -569,6 +569,7 @@ const struct sja1105_info sja1105e_info = { .static_ops = sja1105e_table_ops, .dyn_ops = sja1105et_dyn_ops, .qinq_tpid = ETH_P_8021Q, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = false, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, @@ -600,6 +601,7 @@ const struct sja1105_info sja1105t_info = { .static_ops = sja1105t_table_ops, .dyn_ops = sja1105et_dyn_ops, .qinq_tpid = ETH_P_8021Q, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = false, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, @@ -631,6 +633,7 @@ const struct sja1105_info sja1105p_info = { .static_ops = sja1105p_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = true, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, @@ -663,6 +666,7 @@ const struct sja1105_info sja1105q_info = { .static_ops = sja1105q_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = true, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, @@ -695,6 +699,7 @@ const struct sja1105_info sja1105r_info = { .static_ops = sja1105r_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = true, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, @@ -729,6 +734,7 @@ const struct sja1105_info sja1105s_info = { .dyn_ops = sja1105pqrs_dyn_ops, .regs = &sja1105pqrs_regs, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1105, .can_limit_mcast_flood = true, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, @@ -762,6 +768,7 @@ const struct sja1105_info sja1110a_info = { .dyn_ops = sja1110_dyn_ops, .regs = &sja1110_regs, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1110, .can_limit_mcast_flood = true, .multiple_cascade_ports = true, .ptp_ts_bits = 32, @@ -808,6 +815,7 @@ const struct sja1105_info sja1110b_info = { .dyn_ops = sja1110_dyn_ops, .regs = &sja1110_regs, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1110, .can_limit_mcast_flood = true, .multiple_cascade_ports = true, .ptp_ts_bits = 32, @@ -854,6 +862,7 @@ const struct sja1105_info sja1110c_info = { .dyn_ops = sja1110_dyn_ops, .regs = &sja1110_regs, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1110, .can_limit_mcast_flood = true, .multiple_cascade_ports = true, .ptp_ts_bits = 32, @@ -900,6 +909,7 @@ const struct sja1105_info sja1110d_info = { .dyn_ops = sja1110_dyn_ops, .regs = &sja1110_regs, .qinq_tpid = ETH_P_8021AD, + .tag_proto = DSA_TAG_PROTO_SJA1110, .can_limit_mcast_flood = true, .multiple_cascade_ports = true, .ptp_ts_bits = 32, diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index eda571819d45..1491b72008f3 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -212,6 +212,7 @@ size_t sja1110_general_params_entry_packing(void *buf, void *entry_ptr, sja1105_packing(buf, &entry->egrmirrdei, 110, 110, size, op); sja1105_packing(buf, &entry->replay_port, 109, 106, size, op); sja1105_packing(buf, &entry->tdmaconfigidx, 70, 67, size, op); + sja1105_packing(buf, &entry->header_type, 64, 49, size, op); sja1105_packing(buf, &entry->tte_en, 16, 16, size, op); return size; } diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h index 9bef51791bff..bce0f5c03d0b 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.h +++ b/drivers/net/dsa/sja1105/sja1105_static_config.h @@ -217,6 +217,7 @@ struct sja1105_general_params_entry { /* SJA1110 only */ u64 tte_en; u64 tdmaconfigidx; + u64 header_type; }; struct sja1105_schedule_entry_points_entry { diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 865a548a6ef2..b02cf7b515ae 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -14,6 +14,7 @@ #define ETH_P_SJA1105 ETH_P_DSA_8021Q #define ETH_P_SJA1105_META 0x0008 +#define ETH_P_SJA1110 0xdadc /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */ #define SJA1105_LINKLOCAL_FILTER_A 0x0180C2000000ull diff --git a/include/net/dsa.h b/include/net/dsa.h index 0a10f6fffc3d..289d68e82da0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -50,6 +50,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 +#define DSA_TAG_PROTO_SJA1110_VALUE 23 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -75,6 +76,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, + DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, }; struct packet_type; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 11f555dd9566..37e1d64e07c6 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,47 @@ #include #include "dsa_priv.h" +/* Is this a TX or an RX header? */ +#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15) + +/* RX header */ +#define SJA1110_RX_HEADER_IS_METADATA BIT(14) +#define SJA1110_RX_HEADER_HOST_ONLY BIT(13) +#define SJA1110_RX_HEADER_HAS_TRAILER BIT(12) + +/* Trap-to-host format (no trailer present) */ +#define SJA1110_RX_HEADER_SRC_PORT(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_HEADER_SWITCH_ID(x) ((x) & GENMASK(3, 0)) + +/* Timestamp format (trailer present) */ +#define SJA1110_RX_HEADER_TRAILER_POS(x) ((x) & GENMASK(11, 0)) + +#define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0)) + +/* TX header */ +#define SJA1110_TX_HEADER_UPDATE_TC BIT(14) +#define SJA1110_TX_HEADER_TAKE_TS BIT(13) +#define SJA1110_TX_HEADER_TAKE_TS_CASC BIT(12) +#define SJA1110_TX_HEADER_HAS_TRAILER BIT(11) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */ +#define SJA1110_TX_HEADER_PRIO(x) (((x) << 7) & GENMASK(10, 7)) +#define SJA1110_TX_HEADER_TSTAMP_ID(x) ((x) & GENMASK(7, 0)) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */ +#define SJA1110_TX_HEADER_TRAILER_POS(x) ((x) & GENMASK(10, 0)) + +#define SJA1110_TX_TRAILER_TSTAMP_ID(x) (((x) << 24) & GENMASK(31, 24)) +#define SJA1110_TX_TRAILER_PRIO(x) (((x) << 21) & GENMASK(23, 21)) +#define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12)) +#define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1)) + +#define SJA1110_HEADER_LEN 4 +#define SJA1110_RX_TRAILER_LEN 13 +#define SJA1110_TX_TRAILER_LEN 4 +#define SJA1110_MAX_PADDING_LEN 15 + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -140,6 +181,50 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static struct sk_buff *sja1110_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + struct ethhdr *eth_hdr; + __be32 *tx_trailer; + __be16 *tx_header; + int trailer_pos; + + /* Transmitting control packets is done using in-band control + * extensions, while data packets are transmitted using + * tag_8021q TX VLANs. + */ + if (likely(!sja1105_is_link_local(skb))) + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); + + skb_push(skb, SJA1110_HEADER_LEN); + + /* Move Ethernet header to the left, making space for DSA tag */ + memmove(skb->data, skb->data + SJA1110_HEADER_LEN, 2 * ETH_ALEN); + + trailer_pos = skb->len; + + /* On TX, skb->data points to skb_mac_header(skb) */ + eth_hdr = (struct ethhdr *)skb->data; + tx_header = (__be16 *)(eth_hdr + 1); + tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN); + + eth_hdr->h_proto = htons(ETH_P_SJA1110); + + *tx_header = htons(SJA1110_HEADER_HOST_TO_SWITCH | + SJA1110_TX_HEADER_HAS_TRAILER | + SJA1110_TX_HEADER_TRAILER_POS(trailer_pos)); + *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) | + SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) | + SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index))); + + return skb; +} + static void sja1105_transfer_meta(struct sk_buff *skb, const struct sja1105_meta *meta) { @@ -283,6 +368,11 @@ static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) skb_vlan_tag_present(skb); } +static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) +{ + return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) @@ -333,6 +423,98 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, + int *source_port, + int *switch_id) +{ + u16 rx_header; + + if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN))) + return NULL; + + /* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly + * what we need because the caller has checked the EtherType (which is + * located 2 bytes back) and we just need a pointer to the header that + * comes afterwards. + */ + rx_header = ntohs(*(__be16 *)skb->data); + + /* Timestamp frame, we have a trailer */ + if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) { + int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header); + u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN; + u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp; + u8 last_byte = rx_trailer[12]; + + /* The timestamp is unaligned, so we need to use packing() + * to get it + */ + packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0); + + *source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte); + *switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte); + + /* skb->len counts from skb->data, while start_of_padding + * counts from the destination MAC address. Right now skb->data + * is still as set by the DSA master, so to trim away the + * padding and trailer we need to account for the fact that + * skb->data points to skb_mac_header(skb) + ETH_HLEN. + */ + pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN); + /* Trap-to-host frame, no timestamp trailer */ + } else { + *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header); + *switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + } + + /* Advance skb->data past the DSA header */ + skb_pull_rcsum(skb, SJA1110_HEADER_LEN); + + /* Remove the DSA header */ + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - SJA1110_HEADER_LEN, + 2 * ETH_ALEN); + + /* With skb->data in its final place, update the MAC header + * so that eth_hdr() continues to works properly. + */ + skb_set_mac_header(skb, -ETH_HLEN); + + return skb; +} + +static struct sk_buff *sja1110_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + int source_port = -1, switch_id = -1, subvlan = 0; + + skb->offload_fwd_mark = 1; + + if (sja1110_skb_has_inband_control_extension(skb)) { + skb = sja1110_rcv_inband_control_extension(skb, &source_port, + &switch_id); + if (!skb) + return NULL; + } + + /* Packets with in-band control extensions might still have RX VLANs */ + if (likely(sja1105_skb_has_tag_8021q(skb))) + dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (!skb->dev) { + netdev_warn(netdev, + "Couldn't decode source port %d and switch id %d\n", + source_port, switch_id); + return NULL; + } + + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + + return skb; +} + static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { @@ -343,6 +525,20 @@ static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, dsa_tag_generic_flow_dissect(skb, proto, offset); } +static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + /* Management frames have 2 DSA tags on RX, so the needed_headroom we + * declared is fine for the generic dissector adjustment procedure. + */ + if (unlikely(sja1105_is_link_local(skb))) + return dsa_tag_generic_flow_dissect(skb, proto, offset); + + /* For the rest, there is a single DSA tag, the tag_8021q one */ + *offset = VLAN_HLEN; + *proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1]; +} + static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, @@ -354,7 +550,28 @@ static const struct dsa_device_ops sja1105_netdev_ops = { .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(sja1105_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105); -module_dsa_tag_driver(sja1105_netdev_ops); +static const struct dsa_device_ops sja1110_netdev_ops = { + .name = "sja1110", + .proto = DSA_TAG_PROTO_SJA1110, + .xmit = sja1110_xmit, + .rcv = sja1110_rcv, + .filter = sja1105_filter, + .flow_dissect = sja1110_flow_dissect, + .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN, + .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN, +}; + +DSA_TAG_DRIVER(sja1110_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110); + +static struct dsa_tag_driver *sja1105_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops), + &DSA_TAG_DRIVER_NAME(sja1110_netdev_ops), +}; + +module_dsa_tag_drivers(sja1105_tag_driver_array); + +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3-58-ga151 From 566b18c8b752f67c4e82f0eb4563dd71f84a8799 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Jun 2021 22:01:31 +0300 Subject: net: dsa: sja1105: implement TX timestamping for SJA1110 The TX timestamping procedure for SJA1105 is a bit unconventional because the transmit procedure itself is unconventional. Control packets (and therefore PTP as well) are transmitted to a specific port in SJA1105 using "management routes" which must be written over SPI to the switch. These are one-shot rules that match by destination MAC address on traffic coming from the CPU port, and select the precise destination port for that packet. So to transmit a packet from NET_TX softirq context, we actually need to defer to a process context so that we can perform that SPI write before we send the packet. The DSA master dev_queue_xmit() runs in process context, and we poll until the switch confirms it took the TX timestamp, then we annotate the skb clone with that TX timestamp. This is why the sja1105 driver does not need an skb queue for TX timestamping. But the SJA1110 is a bit (not much!) more conventional, and you can request 2-step TX timestamping through the DSA header, as well as give the switch a cookie (timestamp ID) which it will give back to you when it has the timestamp. So now we do need a queue for keeping the skb clones until their TX timestamps become available. The interesting part is that the metadata frames from SJA1105 haven't disappeared completely. On SJA1105 they were used as follow-ups which contained RX timestamps, but on SJA1110 they are actually TX completion packets, which contain a variable (up to 32) array of timestamps. Why an array? Because: - not only is the TX timestamp on the egress port being communicated, but also the RX timestamp on the CPU port. Nice, but we don't care about that, so we ignore it. - because a packet could be multicast to multiple egress ports, each port takes its own timestamp, and the TX completion packet contains the individual timestamps on each port. This is unconventional because switches typically have a timestamping FIFO and raise an interrupt, but this one doesn't. So the tagger needs to detect and parse meta frames, and call into the main switch driver, which pairs the timestamps with the skbs in the TX timestamping queue which are waiting for one. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 1 + drivers/net/dsa/sja1105/sja1105_ptp.c | 69 +++++++++++++++++++++++++++++++++++ drivers/net/dsa/sja1105/sja1105_ptp.h | 7 ++++ drivers/net/dsa/sja1105/sja1105_spi.c | 4 ++ include/linux/dsa/sja1105.h | 23 ++++++++++++ net/dsa/tag_sja1105.c | 52 ++++++++++++++++++++++++++ 6 files changed, 156 insertions(+) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 201bca282884..5f3449351668 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -131,6 +131,7 @@ struct sja1105_info { void (*ptp_cmd_packing)(u8 *buf, struct sja1105_ptp_cmd *cmd, enum packing_op op); bool (*rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); + void (*txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); int (*clocking_setup)(struct sja1105_private *priv); const char *name; bool supports_mii[SJA1105_MAX_NUM_PORTS]; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 62fe05b4cb60..691f6dd7e669 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -79,6 +79,7 @@ static int sja1105_change_rxtstamping(struct sja1105_private *priv, priv->tagger_data.stampable_skb = NULL; } ptp_cancel_worker_sync(ptp_data->clock); + skb_queue_purge(&ptp_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); return sja1105_static_config_reload(priv, SJA1105_RX_HWTSTAMPING); @@ -451,6 +452,67 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, return priv->info->rxtstamp(ds, port, skb); } +void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, + enum sja1110_meta_tstamp dir, u64 tstamp) +{ + struct sja1105_private *priv = ds->priv; + struct sja1105_ptp_data *ptp_data = &priv->ptp_data; + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct skb_shared_hwtstamps shwt = {0}; + + /* We don't care about RX timestamps on the CPU port */ + if (dir == SJA1110_META_TSTAMP_RX) + return; + + spin_lock(&ptp_data->skb_txtstamp_queue.lock); + + skb_queue_walk_safe(&ptp_data->skb_txtstamp_queue, skb, skb_tmp) { + if (SJA1105_SKB_CB(skb)->ts_id != ts_id) + continue; + + __skb_unlink(skb, &ptp_data->skb_txtstamp_queue); + skb_match = skb; + + break; + } + + spin_unlock(&ptp_data->skb_txtstamp_queue.lock); + + if (WARN_ON(!skb_match)) + return; + + shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); + skb_complete_tx_timestamp(skb_match, &shwt); +} +EXPORT_SYMBOL_GPL(sja1110_process_meta_tstamp); + +/* In addition to cloning the skb which is done by the common + * sja1105_port_txtstamp, we need to generate a timestamp ID and save the + * packet to the TX timestamping queue. + */ +void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) +{ + struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; + struct sja1105_private *priv = ds->priv; + struct sja1105_ptp_data *ptp_data = &priv->ptp_data; + struct sja1105_port *sp = &priv->ports[port]; + u8 ts_id; + + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + + spin_lock(&sp->data->meta_lock); + + ts_id = sp->data->ts_id; + /* Deal automatically with 8-bit wraparound */ + sp->data->ts_id++; + + SJA1105_SKB_CB(clone)->ts_id = ts_id; + + spin_unlock(&sp->data->meta_lock); + + skb_queue_tail(&ptp_data->skb_txtstamp_queue, clone); +} + /* Called from dsa_skb_tx_timestamp. This callback is just to clone * the skb and have it available in SJA1105_SKB_CB in the .port_deferred_xmit * callback, where we will timestamp it synchronously. @@ -469,6 +531,9 @@ void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) return; SJA1105_SKB_CB(skb)->clone = clone; + + if (priv->info->txtstamp) + priv->info->txtstamp(ds, port, skb); } static int sja1105_ptp_reset(struct dsa_switch *ds) @@ -885,7 +950,10 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) .n_per_out = 1, }; + /* Only used on SJA1105 */ skb_queue_head_init(&ptp_data->skb_rxtstamp_queue); + /* Only used on SJA1110 */ + skb_queue_head_init(&ptp_data->skb_txtstamp_queue); spin_lock_init(&tagger_data->meta_lock); ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev); @@ -910,6 +978,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds) del_timer_sync(&ptp_data->extts_timer); ptp_cancel_worker_sync(ptp_data->clock); + skb_queue_purge(&ptp_data->skb_txtstamp_queue); skb_queue_purge(&ptp_data->skb_rxtstamp_queue); ptp_clock_unregister(ptp_data->clock); ptp_data->clock = NULL; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index bf0c4f1dfed7..3c874bb4c17b 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -75,7 +75,12 @@ struct sja1105_ptp_cmd { struct sja1105_ptp_data { struct timer_list extts_timer; + /* Used only on SJA1105 to reconstruct partial timestamps */ struct sk_buff_head skb_rxtstamp_queue; + /* Used on SJA1110 where meta frames are generated only for + * 2-step TX timestamps + */ + struct sk_buff_head skb_txtstamp_queue; struct ptp_clock_info caps; struct ptp_clock *clock; struct sja1105_ptp_cmd cmd; @@ -124,6 +129,7 @@ int sja1105_ptp_commit(struct dsa_switch *ds, struct sja1105_ptp_cmd *cmd, bool sja1105_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); bool sja1110_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); +void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); #else @@ -189,6 +195,7 @@ static inline int sja1105_ptp_commit(struct dsa_switch *ds, #define sja1105_rxtstamp NULL #define sja1110_rxtstamp NULL +#define sja1110_txtstamp NULL #endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index f7dd86271891..32d00212423c 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -788,6 +788,7 @@ const struct sja1105_info sja1110a_info = { .fdb_del_cmd = sja1105pqrs_fdb_del, .ptp_cmd_packing = sja1105pqrs_ptp_cmd_packing, .rxtstamp = sja1110_rxtstamp, + .txtstamp = sja1110_txtstamp, .clocking_setup = sja1110_clocking_setup, .port_speed = { [SJA1105_SPEED_AUTO] = 0, @@ -836,6 +837,7 @@ const struct sja1105_info sja1110b_info = { .fdb_del_cmd = sja1105pqrs_fdb_del, .ptp_cmd_packing = sja1105pqrs_ptp_cmd_packing, .rxtstamp = sja1110_rxtstamp, + .txtstamp = sja1110_txtstamp, .clocking_setup = sja1110_clocking_setup, .port_speed = { [SJA1105_SPEED_AUTO] = 0, @@ -884,6 +886,7 @@ const struct sja1105_info sja1110c_info = { .fdb_del_cmd = sja1105pqrs_fdb_del, .ptp_cmd_packing = sja1105pqrs_ptp_cmd_packing, .rxtstamp = sja1110_rxtstamp, + .txtstamp = sja1110_txtstamp, .clocking_setup = sja1110_clocking_setup, .port_speed = { [SJA1105_SPEED_AUTO] = 0, @@ -932,6 +935,7 @@ const struct sja1105_info sja1110d_info = { .fdb_del_cmd = sja1105pqrs_fdb_del, .ptp_cmd_packing = sja1105pqrs_ptp_cmd_packing, .rxtstamp = sja1110_rxtstamp, + .txtstamp = sja1110_txtstamp, .clocking_setup = sja1110_clocking_setup, .port_speed = { [SJA1105_SPEED_AUTO] = 0, diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index b02cf7b515ae..b6089b88314c 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -45,11 +45,14 @@ struct sja1105_tagger_data { */ spinlock_t meta_lock; unsigned long state; + u8 ts_id; }; struct sja1105_skb_cb { struct sk_buff *clone; u64 tstamp; + /* Only valid for packets cloned for 2-step TX timestamping */ + u8 ts_id; }; #define SJA1105_SKB_CB(skb) \ @@ -66,4 +69,24 @@ struct sja1105_port { u16 xmit_tpid; }; +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + +#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) + +void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, + enum sja1110_meta_tstamp dir, u64 tstamp); + +#else + +static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, + u8 ts_id, enum sja1110_meta_tstamp dir, + u64 tstamp) +{ +} + +#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ + #endif /* _NET_DSA_SJA1105_H */ diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 37e1d64e07c6..9c2df9ece01b 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -25,6 +25,9 @@ #define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4) #define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0)) +/* Meta frame format (for 2-step TX timestamps) */ +#define SJA1110_RX_HEADER_N_TS(x) (((x) & GENMASK(8, 4)) >> 4) + /* TX header */ #define SJA1110_TX_HEADER_UPDATE_TC BIT(14) #define SJA1110_TX_HEADER_TAKE_TS BIT(13) @@ -43,6 +46,8 @@ #define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12)) #define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1)) +#define SJA1110_META_TSTAMP_SIZE 10 + #define SJA1110_HEADER_LEN 4 #define SJA1110_RX_TRAILER_LEN 13 #define SJA1110_TX_TRAILER_LEN 4 @@ -184,6 +189,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, static struct sk_buff *sja1110_xmit(struct sk_buff *skb, struct net_device *netdev) { + struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; struct dsa_port *dp = dsa_slave_to_port(netdev); u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); @@ -221,6 +227,12 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) | SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) | SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index))); + if (clone) { + u8 ts_id = SJA1105_SKB_CB(clone)->ts_id; + + *tx_header |= htons(SJA1110_TX_HEADER_TAKE_TS); + *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id)); + } return skb; } @@ -423,6 +435,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) +{ + int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + int n_ts = SJA1110_RX_HEADER_N_TS(rx_header); + struct net_device *master = skb->dev; + struct dsa_port *cpu_dp; + u8 *buf = skb->data + 2; + struct dsa_switch *ds; + int i; + + cpu_dp = master->dsa_ptr; + ds = dsa_switch_find(cpu_dp->dst->index, switch_id); + if (!ds) { + net_err_ratelimited("%s: cannot find switch id %d\n", + master->name, switch_id); + return NULL; + } + + for (i = 0; i <= n_ts; i++) { + u8 ts_id, source_port, dir; + u64 tstamp; + + ts_id = buf[0]; + source_port = (buf[1] & GENMASK(7, 4)) >> 4; + dir = (buf[1] & BIT(3)) >> 3; + tstamp = be64_to_cpu(*(__be64 *)(buf + 2)); + + sja1110_process_meta_tstamp(ds, source_port, ts_id, dir, + tstamp); + + buf += SJA1110_META_TSTAMP_SIZE; + } + + /* Discard the meta frame, we've consumed the timestamps it contained */ + return NULL; +} + static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, int *source_port, int *switch_id) @@ -439,6 +488,9 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, */ rx_header = ntohs(*(__be16 *)skb->data); + if (rx_header & SJA1110_RX_HEADER_IS_METADATA) + return sja1110_rcv_meta(skb, rx_header); + /* Timestamp frame, we have a trailer */ if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) { int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header); -- cgit v1.2.3-58-ga151 From 51a1ebc35b46dc322071cfa7fcd4cdcfde0c1aa4 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Fri, 11 Jun 2021 09:33:33 +0800 Subject: net: devres: Correct a grammatical error Correct a grammatical error. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/devres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/devres.c b/net/devres.c index 1f9be2133787..5ccf6ca311dc 100644 --- a/net/devres.c +++ b/net/devres.c @@ -60,7 +60,7 @@ static int netdev_devres_match(struct device *dev, void *this, void *match_data) * @ndev: device to register * * This is a devres variant of register_netdev() for which the unregister - * function will be call automatically when the managing device is + * function will be called automatically when the managing device is * detached. Note: the net_device used must also be resource managed by * the same struct device. */ -- cgit v1.2.3-58-ga151 From a9e29e5511b9e68b64e9031edb7b7f8920ad3de1 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:09:47 +0300 Subject: af_vsock: update functions for connectible socket Prepare af_vsock.c for SEQPACKET support: rename some functions such as setsockopt(), getsockopt(), connect(), recvmsg(), sendmsg() in general manner, because they are shared with stream sockets. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 64 +++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 92a72f0e0d94..7dd8e70d78cd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -604,8 +604,8 @@ out: /**** SOCKET OPERATIONS ****/ -static int __vsock_bind_stream(struct vsock_sock *vsk, - struct sockaddr_vm *addr) +static int __vsock_bind_connectible(struct vsock_sock *vsk, + struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; @@ -685,7 +685,7 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) switch (sk->sk_socket->type) { case SOCK_STREAM: spin_lock_bh(&vsock_table_lock); - retval = __vsock_bind_stream(vsk, addr); + retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; @@ -768,6 +768,11 @@ static struct sock *__vsock_create(struct net *net, return sk; } +static bool sock_type_connectible(u16 type) +{ + return type == SOCK_STREAM; +} + static void __vsock_release(struct sock *sk, int level) { if (sk) { @@ -786,7 +791,7 @@ static void __vsock_release(struct sock *sk, int level) if (vsk->transport) vsk->transport->release(vsk); - else if (sk->sk_type == SOCK_STREAM) + else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); @@ -948,7 +953,7 @@ static int vsock_shutdown(struct socket *sock, int mode) lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; - if (sk->sk_type == SOCK_STREAM) + if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; @@ -961,7 +966,7 @@ static int vsock_shutdown(struct socket *sock, int mode) sk->sk_shutdown |= mode; sk->sk_state_change(sk); - if (sk->sk_type == SOCK_STREAM) { + if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } @@ -1016,7 +1021,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; - } else if (sock->type == SOCK_STREAM) { + } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); @@ -1263,8 +1268,8 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, - int addr_len, int flags) +static int vsock_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) { int err; struct sock *sk; @@ -1414,7 +1419,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(listener); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } @@ -1491,7 +1496,7 @@ static int vsock_listen(struct socket *sock, int backlog) lock_sock(sk); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } @@ -1535,11 +1540,11 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, vsk->buffer_size = val; } -static int vsock_stream_setsockopt(struct socket *sock, - int level, - int optname, - sockptr_t optval, - unsigned int optlen) +static int vsock_connectible_setsockopt(struct socket *sock, + int level, + int optname, + sockptr_t optval, + unsigned int optlen) { int err; struct sock *sk; @@ -1617,10 +1622,10 @@ exit: return err; } -static int vsock_stream_getsockopt(struct socket *sock, - int level, int optname, - char __user *optval, - int __user *optlen) +static int vsock_connectible_getsockopt(struct socket *sock, + int level, int optname, + char __user *optval, + int __user *optlen) { int err; int len; @@ -1688,8 +1693,8 @@ static int vsock_stream_getsockopt(struct socket *sock, return 0; } -static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, - size_t len) +static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk; struct vsock_sock *vsk; @@ -1828,10 +1833,9 @@ out: return err; } - static int -vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk; struct vsock_sock *vsk; @@ -2007,7 +2011,7 @@ static const struct proto_ops vsock_stream_ops = { .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, - .connect = vsock_stream_connect, + .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, @@ -2015,10 +2019,10 @@ static const struct proto_ops vsock_stream_ops = { .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, - .setsockopt = vsock_stream_setsockopt, - .getsockopt = vsock_stream_getsockopt, - .sendmsg = vsock_stream_sendmsg, - .recvmsg = vsock_stream_recvmsg, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; -- cgit v1.2.3-58-ga151 From b3f7fd54881bcba5dc529935f38df649167803b1 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:10:07 +0300 Subject: af_vsock: separate wait data loop Wait loop for data could be shared between SEQPACKET and STREAM sockets, so move it to dedicated function. While moving the code around, let's update an old comment. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 156 +++++++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 72 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7dd8e70d78cd..4269e80b02cd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1833,6 +1833,69 @@ out: return err; } +static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait, + long timeout, + struct vsock_transport_recv_notify_data *recv_data, + size_t target) +{ + const struct vsock_transport *transport; + struct vsock_sock *vsk; + s64 data; + int err; + + vsk = vsock_sk(sk); + err = 0; + transport = vsk->transport; + + while ((data = vsock_stream_has_data(vsk)) == 0) { + prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err != 0 || + (sk->sk_shutdown & RCV_SHUTDOWN) || + (vsk->peer_shutdown & SEND_SHUTDOWN)) { + break; + } + + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + break; + } + + if (recv_data) { + err = transport->notify_recv_pre_block(vsk, target, recv_data); + if (err < 0) + break; + } + + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + break; + } else if (timeout == 0) { + err = -EAGAIN; + break; + } + } + + finish_wait(sk_sleep(sk), wait); + + if (err) + return err; + + /* Internal transport error when checking for available + * data. XXX This should be changed to a connection + * reset in a later change. + */ + if (data < 0) + return -ENOMEM; + + return data; +} + static int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) @@ -1912,85 +1975,34 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, while (1) { - s64 ready; + ssize_t read; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - ready = vsock_stream_has_data(vsk); + err = vsock_wait_data(sk, &wait, timeout, &recv_data, target); + if (err <= 0) + break; - if (ready == 0) { - if (sk->sk_err != 0 || - (sk->sk_shutdown & RCV_SHUTDOWN) || - (vsk->peer_shutdown & SEND_SHUTDOWN)) { - finish_wait(sk_sleep(sk), &wait); - break; - } - /* Don't wait for non-blocking sockets. */ - if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } - - err = transport->notify_recv_pre_block( - vsk, target, &recv_data); - if (err < 0) { - finish_wait(sk_sleep(sk), &wait); - break; - } - release_sock(sk); - timeout = schedule_timeout(timeout); - lock_sock(sk); - - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); - break; - } else if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } - } else { - ssize_t read; - - finish_wait(sk_sleep(sk), &wait); - - if (ready < 0) { - /* Invalid queue pair content. XXX This should - * be changed to a connection reset in a later - * change. - */ - - err = -ENOMEM; - goto out; - } - - err = transport->notify_recv_pre_dequeue( - vsk, target, &recv_data); - if (err < 0) - break; + err = transport->notify_recv_pre_dequeue(vsk, target, + &recv_data); + if (err < 0) + break; - read = transport->stream_dequeue( - vsk, msg, - len - copied, flags); - if (read < 0) { - err = -ENOMEM; - break; - } + read = transport->stream_dequeue(vsk, msg, len - copied, flags); + if (read < 0) { + err = -ENOMEM; + break; + } - copied += read; + copied += read; - err = transport->notify_recv_post_dequeue( - vsk, target, read, - !(flags & MSG_PEEK), &recv_data); - if (err < 0) - goto out; + err = transport->notify_recv_post_dequeue(vsk, target, read, + !(flags & MSG_PEEK), &recv_data); + if (err < 0) + goto out; - if (read >= target || flags & MSG_PEEK) - break; + if (read >= target || flags & MSG_PEEK) + break; - target -= read; - } + target -= read; } if (sk->sk_err) -- cgit v1.2.3-58-ga151 From 19c1b90e1979c3974cd6a3ec0cbb886a84278d84 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:10:21 +0300 Subject: af_vsock: separate receive data loop Some code in receive data loop could be shared between SEQPACKET and STREAM sockets, while another part is type specific, so move STREAM specific data receive logic to '__vsock_stream_recvmsg()' dedicated function, while checks, that will be same for both STREAM and SEQPACKET sockets, stays in 'vsock_connectible_recvmsg()'. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 116 +++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4269e80b02cd..c4f6bfa1e381 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1896,65 +1896,22 @@ static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait, return data; } -static int -vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) { - struct sock *sk; - struct vsock_sock *vsk; + struct vsock_transport_recv_notify_data recv_data; const struct vsock_transport *transport; - int err; - size_t target; + struct vsock_sock *vsk; ssize_t copied; + size_t target; long timeout; - struct vsock_transport_recv_notify_data recv_data; + int err; DEFINE_WAIT(wait); - sk = sock->sk; vsk = vsock_sk(sk); - err = 0; - - lock_sock(sk); - transport = vsk->transport; - if (!transport || sk->sk_state != TCP_ESTABLISHED) { - /* Recvmsg is supposed to return 0 if a peer performs an - * orderly shutdown. Differentiate between that case and when a - * peer has not connected or a local shutdown occurred with the - * SOCK_DONE flag. - */ - if (sock_flag(sk, SOCK_DONE)) - err = 0; - else - err = -ENOTCONN; - - goto out; - } - - if (flags & MSG_OOB) { - err = -EOPNOTSUPP; - goto out; - } - - /* We don't check peer_shutdown flag here since peer may actually shut - * down, but there can be data in the queue that a local socket can - * receive. - */ - if (sk->sk_shutdown & RCV_SHUTDOWN) { - err = 0; - goto out; - } - - /* It is valid on Linux to pass in a zero-length receive buffer. This - * is not an error. We may as well bail out now. - */ - if (!len) { - err = 0; - goto out; - } - /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this @@ -2013,6 +1970,67 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (copied > 0) err = copied; +out: + return err; +} + +static int +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk; + struct vsock_sock *vsk; + const struct vsock_transport *transport; + int err; + + DEFINE_WAIT(wait); + + sk = sock->sk; + vsk = vsock_sk(sk); + err = 0; + + lock_sock(sk); + + transport = vsk->transport; + + if (!transport || sk->sk_state != TCP_ESTABLISHED) { + /* Recvmsg is supposed to return 0 if a peer performs an + * orderly shutdown. Differentiate between that case and when a + * peer has not connected or a local shutdown occurred with the + * SOCK_DONE flag. + */ + if (sock_flag(sk, SOCK_DONE)) + err = 0; + else + err = -ENOTCONN; + + goto out; + } + + if (flags & MSG_OOB) { + err = -EOPNOTSUPP; + goto out; + } + + /* We don't check peer_shutdown flag here since peer may actually shut + * down, but there can be data in the queue that a local socket can + * receive. + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + err = 0; + goto out; + } + + /* It is valid on Linux to pass in a zero-length receive buffer. This + * is not an error. We may as well bail out now. + */ + if (!len) { + err = 0; + goto out; + } + + err = __vsock_stream_recvmsg(sk, msg, len, flags); + out: release_sock(sk); return err; -- cgit v1.2.3-58-ga151 From 9942c192b256bc11cc903f89f4057bc97434dee9 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:10:34 +0300 Subject: af_vsock: implement SEQPACKET receive loop Add receive loop for SEQPACKET. It looks like receive loop for STREAM, but there are differences: 1) It doesn't call notify callbacks. 2) It doesn't care about 'SO_SNDLOWAT' and 'SO_RCVLOWAT' values, because there is no sense for these values in SEQPACKET case. 3) It waits until whole record is received. 4) It processes and sets 'MSG_TRUNC' flag. So to avoid extra conditions for two types of socket inside one loop, two independent functions were created. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- include/net/af_vsock.h | 4 ++++ net/vmw_vsock/af_vsock.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index b1c717286993..4d7cf6b2aca2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -135,6 +135,10 @@ struct vsock_transport { bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); + /* SEQ_PACKET. */ + ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, + int flags); + /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c4f6bfa1e381..87ae26b2e3e1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1974,6 +1974,56 @@ out: return err; } +static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + const struct vsock_transport *transport; + struct vsock_sock *vsk; + ssize_t record_len; + long timeout; + int err = 0; + DEFINE_WAIT(wait); + + vsk = vsock_sk(sk); + transport = vsk->transport; + + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + err = vsock_wait_data(sk, &wait, timeout, NULL, 0); + if (err <= 0) + goto out; + + record_len = transport->seqpacket_dequeue(vsk, msg, flags); + + if (record_len < 0) { + err = -ENOMEM; + goto out; + } + + if (sk->sk_err) { + err = -sk->sk_err; + } else if (sk->sk_shutdown & RCV_SHUTDOWN) { + err = 0; + } else { + /* User sets MSG_TRUNC, so return real length of + * packet. + */ + if (flags & MSG_TRUNC) + err = record_len; + else + err = len - msg_data_left(msg); + + /* Always set MSG_TRUNC if real length of packet is + * bigger than user's buffer. + */ + if (record_len > len) + msg->msg_flags |= MSG_TRUNC; + } + +out: + return err; +} + static int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) @@ -2029,7 +2079,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto out; } - err = __vsock_stream_recvmsg(sk, msg, len, flags); + if (sk->sk_type == SOCK_STREAM) + err = __vsock_stream_recvmsg(sk, msg, len, flags); + else + err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); -- cgit v1.2.3-58-ga151 From fbe70c480796d9052fcc786c76e6b029acb1c7bc Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:10:49 +0300 Subject: af_vsock: implement send logic for SEQPACKET Update current stream enqueue function for SEQPACKET support: 1) Call transport's seqpacket enqueue callback. 2) Return value from enqueue function is whole record length or error for SOCK_SEQPACKET. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ net/vmw_vsock/af_vsock.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 4d7cf6b2aca2..d6745d8b8f3e 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -138,6 +138,8 @@ struct vsock_transport { /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); + int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, + size_t len); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 87ae26b2e3e1..9e0cc07e3caf 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1808,9 +1808,13 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, * responsibility to check how many bytes we were able to send. */ - written = transport->stream_enqueue( - vsk, msg, - len - total_written); + if (sk->sk_type == SOCK_SEQPACKET) { + written = transport->seqpacket_enqueue(vsk, + msg, len - total_written); + } else { + written = transport->stream_enqueue(vsk, + msg, len - total_written); + } if (written < 0) { err = -ENOMEM; goto out_err; @@ -1826,8 +1830,14 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, } out_err: - if (total_written > 0) - err = total_written; + if (total_written > 0) { + /* Return number of written bytes only if: + * 1) SOCK_STREAM socket. + * 2) SOCK_SEQPACKET socket when whole buffer is sent. + */ + if (sk->sk_type == SOCK_STREAM || total_written == len) + err = total_written; + } out: release_sock(sk); return err; -- cgit v1.2.3-58-ga151 From 0798e78b102b79ed9fe4b2beeb18cf0db117c79b Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:11:04 +0300 Subject: af_vsock: rest of SEQPACKET support Add socket ops for SEQPACKET type and .seqpacket_allow() callback to query transports if they support SEQPACKET. Also split path for data check for STREAM and SEQPACKET branches. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ net/vmw_vsock/af_vsock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index d6745d8b8f3e..ab207677e0a8 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -140,6 +140,8 @@ struct vsock_transport { int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); + bool (*seqpacket_allow)(u32 remote_cid); + u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9e0cc07e3caf..21a56f52d683 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -452,6 +452,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_dgram; break; case SOCK_STREAM: + case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || @@ -484,6 +485,14 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (!new_transport || !try_module_get(new_transport->module)) return -ENODEV; + if (sk->sk_type == SOCK_SEQPACKET) { + if (!new_transport->seqpacket_allow || + !new_transport->seqpacket_allow(remote_cid)) { + module_put(new_transport->module); + return -ESOCKTNOSUPPORT; + } + } + ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); @@ -684,6 +693,7 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) switch (sk->sk_socket->type) { case SOCK_STREAM: + case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); @@ -770,7 +780,7 @@ static struct sock *__vsock_create(struct net *net, static bool sock_type_connectible(u16 type) { - return type == SOCK_STREAM; + return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); } static void __vsock_release(struct sock *sk, int level) @@ -849,6 +859,16 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_stream_has_data); +static s64 vsock_has_data(struct vsock_sock *vsk) +{ + struct sock *sk = sk_vsock(vsk); + + if (sk->sk_type == SOCK_SEQPACKET) + return vsk->transport->seqpacket_has_data(vsk); + else + return vsock_stream_has_data(vsk); +} + s64 vsock_stream_has_space(struct vsock_sock *vsk) { return vsk->transport->stream_has_space(vsk); @@ -1857,7 +1877,7 @@ static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait, err = 0; transport = vsk->transport; - while ((data = vsock_stream_has_data(vsk)) == 0) { + while ((data = vsock_has_data(vsk)) == 0) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); if (sk->sk_err != 0 || @@ -2120,6 +2140,27 @@ static const struct proto_ops vsock_stream_ops = { .sendpage = sock_no_sendpage, }; +static const struct proto_ops vsock_seqpacket_ops = { + .family = PF_VSOCK, + .owner = THIS_MODULE, + .release = vsock_release, + .bind = vsock_bind, + .connect = vsock_connect, + .socketpair = sock_no_socketpair, + .accept = vsock_accept, + .getname = vsock_getname, + .poll = vsock_poll, + .ioctl = sock_no_ioctl, + .listen = vsock_listen, + .shutdown = vsock_shutdown, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -2140,6 +2181,9 @@ static int vsock_create(struct net *net, struct socket *sock, case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; + case SOCK_SEQPACKET: + sock->ops = &vsock_seqpacket_ops; + break; default: return -ESOCKTNOSUPPORT; } -- cgit v1.2.3-58-ga151 From 8cb48554ad822fb8553380b4781ea65f1e3ca7bb Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:11:18 +0300 Subject: af_vsock: update comments for stream sockets Replace 'stream' to 'connection oriented' in comments as SEQPACKET is also connection oriented. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 21a56f52d683..67954afef4e1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -415,8 +415,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) /* Assign a transport to a socket and call the .init transport callback. * - * Note: for stream socket this must be called when vsk->remote_addr is set - * (e.g. during the connect() or when a connection request on a listener + * Note: for connection oriented socket this must be called when vsk->remote_addr + * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if @@ -470,10 +470,10 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) return 0; /* transport->release() must be called with sock lock acquired. - * This path can only be taken during vsock_stream_connect(), - * where we have already held the sock lock. - * In the other cases, this function is called on a new socket - * which is not assigned to any transport. + * This path can only be taken during vsock_connect(), where we + * have already held the sock lock. In the other cases, this + * function is called on a new socket which is not assigned to + * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); @@ -658,9 +658,10 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); - /* Remove stream sockets from the unbound list and add them to the hash - * table for easy lookup by its address. The unbound list is simply an - * extra entry at the end of the hash table, a trick used by AF_UNIX. + /* Remove connection oriented sockets from the unbound list and add them + * to the hash table for easy lookup by its address. The unbound list + * is simply an extra entry at the end of the hash table, a trick used + * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); @@ -962,10 +963,10 @@ static int vsock_shutdown(struct socket *sock, int mode) if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; - /* If this is a STREAM socket and it is not connected then bail out - * immediately. If it is a DGRAM socket then we must first kick the - * socket so that it wakes up from any sleeping calls, for example - * recv(), and then afterwards return the error. + /* If this is a connection oriented socket and it is not connected then + * bail out immediately. If it is a DGRAM socket then we must first + * kick the socket so that it wakes up from any sleeping calls, for + * example recv(), and then afterwards return the error. */ sk = sock->sk; @@ -1737,7 +1738,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, transport = vsk->transport; - /* Callers should not provide a destination with stream sockets. */ + /* Callers should not provide a destination with connection oriented + * sockets. + */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; -- cgit v1.2.3-58-ga151 From b93f8877c1f2e3d3dcdec7759c5de3d67777f45d Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:11:31 +0300 Subject: virtio/vsock: set packet's type in virtio_transport_send_pkt_info() There is no need to set type of packet which differs from type of socket, so move passing type of packet from 'info' structure to 'virtio_transport_send_pkt_info()' function. Since at current time only stream type is supported, set it directly in 'virtio_ transport_send_pkt_info()', so callers don't need to set it. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 902cb6dd710b..6503a8370130 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -179,6 +179,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + info->type = VIRTIO_VSOCK_TYPE_STREAM; + t_ops = virtio_transport_get_ops(vsk); if (unlikely(!t_ops)) return -EFAULT; @@ -270,12 +272,10 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) EXPORT_SYMBOL_GPL(virtio_transport_put_credit); static int virtio_transport_send_credit_update(struct vsock_sock *vsk, - int type, struct virtio_vsock_hdr *hdr) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, - .type = type, .vsk = vsk, }; @@ -383,11 +383,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * messages, we set the limit to a high value. TODO: experiment * with different values. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { - virtio_transport_send_credit_update(vsk, - VIRTIO_VSOCK_TYPE_STREAM, - NULL); - } + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + virtio_transport_send_credit_update(vsk, NULL); return total; @@ -496,8 +493,7 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) vvs->buf_alloc = *val; - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); + virtio_transport_send_credit_update(vsk, NULL); } EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); @@ -624,7 +620,6 @@ int virtio_transport_connect(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .vsk = vsk, }; @@ -636,7 +631,6 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_SHUTDOWN, - .type = VIRTIO_VSOCK_TYPE_STREAM, .flags = (mode & RCV_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? @@ -665,7 +659,6 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, .vsk = vsk, @@ -688,7 +681,6 @@ static int virtio_transport_reset(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, .vsk = vsk, }; @@ -1000,7 +992,6 @@ virtio_transport_send_response(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .type = VIRTIO_VSOCK_TYPE_STREAM, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, -- cgit v1.2.3-58-ga151 From c10844c5979992fde734f566357059e4a7c815bc Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:12:08 +0300 Subject: virtio/vsock: simplify credit update function API This function is static and 'hdr' arg was always NULL. Signed-off-by: Arseny Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 6503a8370130..ad0d34d41444 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -271,8 +271,7 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) } EXPORT_SYMBOL_GPL(virtio_transport_put_credit); -static int virtio_transport_send_credit_update(struct vsock_sock *vsk, - struct virtio_vsock_hdr *hdr) +static int virtio_transport_send_credit_update(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, @@ -384,7 +383,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * with different values. */ if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) - virtio_transport_send_credit_update(vsk, NULL); + virtio_transport_send_credit_update(vsk); return total; @@ -493,7 +492,7 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) vvs->buf_alloc = *val; - virtio_transport_send_credit_update(vsk, NULL); + virtio_transport_send_credit_update(vsk); } EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); -- cgit v1.2.3-58-ga151 From 44931195a5412a97c46d299227fbabad4e09010d Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:12:38 +0300 Subject: virtio/vsock: dequeue callback for SOCK_SEQPACKET Callback fetches RW packets from rx queue of socket until whole record is copied(if user's buffer is full, user is not woken up). This is done to not stall sender, because if we wake up user and it leaves syscall, nobody will send credit update for rest of record, and sender will wait for next enter of read syscall at receiver's side. So if user buffer is full, we just send credit update and drop data. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 5 ++ net/vmw_vsock/virtio_transport_common.c | 84 +++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'net') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index dc636b727179..1d9a302cb91d 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -36,6 +36,7 @@ struct virtio_vsock_sock { u32 rx_bytes; u32 buf_alloc; struct list_head rx_queue; + u32 msg_count; }; struct virtio_vsock_pkt { @@ -80,6 +81,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); +ssize_t +virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags); s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ad0d34d41444..1e1df19ec164 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -393,6 +393,78 @@ out: return err; } +static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + int dequeued_len = 0; + size_t user_buf_len = msg_data_left(msg); + bool copy_failed = false; + bool msg_ready = false; + + spin_lock_bh(&vvs->rx_lock); + + if (vvs->msg_count == 0) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + while (!msg_ready) { + pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + + if (!copy_failed) { + size_t pkt_len; + size_t bytes_to_copy; + + pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + bytes_to_copy = min(user_buf_len, pkt_len); + + if (bytes_to_copy) { + int err; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + if (err) { + /* Copy of message failed, set flag to skip + * copy path for rest of fragments. Rest of + * fragments will be freed without copy. + */ + copy_failed = true; + dequeued_len = err; + } else { + user_buf_len -= bytes_to_copy; + } + + spin_lock_bh(&vvs->rx_lock); + } + + if (dequeued_len >= 0) + dequeued_len += pkt_len; + } + + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { + msg_ready = true; + vvs->msg_count--; + } + + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + spin_unlock_bh(&vvs->rx_lock); + + virtio_transport_send_credit_update(vsk); + + return dequeued_len; +} + ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -405,6 +477,18 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); +ssize_t +virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); + int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, -- cgit v1.2.3-58-ga151 From e4b1ef152f53d5ea0cae89f12f241f7293657718 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:12:53 +0300 Subject: virtio/vsock: add SEQPACKET receive logic Update current receive logic for SEQPACKET support: performs check for packet and socket types on receive(if mismatch, then reset connection). Increment EOR counter on receive. Also if buffer of new packet was appended to buffer of last packet in rx queue, update flags of last packet with flags of new packet. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 34 ++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 1e1df19ec164..3a658ff8fccb 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -165,6 +165,14 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); +static u16 virtio_transport_get_type(struct sock *sk) +{ + if (sk->sk_type == SOCK_STREAM) + return VIRTIO_VSOCK_TYPE_STREAM; + else + return VIRTIO_VSOCK_TYPE_SEQPACKET; +} + /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * @@ -987,6 +995,9 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, goto out; } + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + vvs->msg_count++; + /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. @@ -998,13 +1009,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct virtio_vsock_pkt, list); /* If there is space in the last packet queued, we copy the - * new packet in its buffer. + * new packet in its buffer. We avoid this if the last packet + * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is + * delimiter of SEQPACKET record, so 'pkt' is the first packet + * of a new record. */ - if (pkt->len <= last_pkt->buf_len - last_pkt->len) { + if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && + !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) { memcpy(last_pkt->buf + last_pkt->len, pkt->buf, pkt->len); last_pkt->len += pkt->len; free_pkt = true; + last_pkt->hdr.flags |= pkt->hdr.flags; goto out; } } @@ -1170,6 +1186,12 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, return 0; } +static bool virtio_transport_valid_type(u16 type) +{ + return (type == VIRTIO_VSOCK_TYPE_STREAM) || + (type == VIRTIO_VSOCK_TYPE_SEQPACKET); +} + /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ @@ -1195,7 +1217,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, le32_to_cpu(pkt->hdr.buf_alloc), le32_to_cpu(pkt->hdr.fwd_cnt)); - if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1212,6 +1234,12 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, } } + if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { + (void)virtio_transport_reset_no_sock(t, pkt); + sock_put(sk); + goto free_pkt; + } + vsk = vsock_sk(sk); lock_sock(sk); -- cgit v1.2.3-58-ga151 From 9ac841f5e9f261245d9d2841ad123566bd160a6e Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:13:06 +0300 Subject: virtio/vsock: rest of SOCK_SEQPACKET support Small updates to make SOCK_SEQPACKET work: 1) Send SHUTDOWN on socket close for SEQPACKET type. 2) Set SEQPACKET packet type during send. 3) Set 'VIRTIO_VSOCK_SEQ_EOR' bit in flags for last packet of message. 4) Implement data check function for SEQPACKET. 5) Check for max datagram size. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 5 ++++ net/vmw_vsock/virtio_transport_common.c | 41 +++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 1d9a302cb91d..35d7eedb5e8e 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -81,12 +81,17 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); +int +virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); ssize_t virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags); s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); +u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 3a658ff8fccb..23704a6bc437 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -74,6 +74,10 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, err = memcpy_from_msg(pkt->buf, info->msg, len); if (err) goto out; + + if (msg_data_left(info->msg) == 0 && + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } trace_virtio_transport_alloc_pkt(src_cid, src_port, @@ -187,7 +191,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; - info->type = VIRTIO_VSOCK_TYPE_STREAM; + info->type = virtio_transport_get_type(sk_vsock(vsk)); t_ops = virtio_transport_get_ops(vsk); if (unlikely(!t_ops)) @@ -497,6 +501,26 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); +int +virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + spin_lock_bh(&vvs->tx_lock); + + if (len > vvs->peer_buf_alloc) { + spin_unlock_bh(&vvs->tx_lock); + return -EMSGSIZE; + } + + spin_unlock_bh(&vvs->tx_lock); + + return virtio_transport_stream_enqueue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue); + int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -519,6 +543,19 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); +u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + u32 msg_count; + + spin_lock_bh(&vvs->rx_lock); + msg_count = vvs->msg_count; + spin_unlock_bh(&vvs->rx_lock); + + return msg_count; +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); + static s64 virtio_transport_has_space(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; @@ -931,7 +968,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - if (sk->sk_type == SOCK_STREAM) + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) remove_sock = virtio_transport_close(vsk); if (remove_sock) { -- cgit v1.2.3-58-ga151 From 53efbba12cc7ea2aa47d888532fdc1b3b43afef0 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:13:22 +0300 Subject: virtio/vsock: enable SEQPACKET for transport To make transport work with SOCK_SEQPACKET add two things: 1) SOCK_SEQPACKET ops for virtio transport and 'seqpacket_allow()' callback. 2) Handling of SEQPACKET bit: guest tries to negotiate it with vhost, so feature will be enabled only if bit is negotiated with device. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 2700a63ab095..e73ce652bf3c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -62,6 +62,7 @@ struct virtio_vsock { struct virtio_vsock_event event_list[8]; u32 guest_cid; + bool seqpacket_allow; }; static u32 virtio_transport_get_local_cid(void) @@ -443,6 +444,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +static bool virtio_transport_seqpacket_allow(u32 remote_cid); + static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, @@ -469,6 +472,11 @@ static struct virtio_transport virtio_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = virtio_transport_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -485,6 +493,19 @@ static struct virtio_transport virtio_transport = { .send_pkt = virtio_transport_send_pkt, }; +static bool virtio_transport_seqpacket_allow(u32 remote_cid) +{ + struct virtio_vsock *vsock; + bool seqpacket_allow; + + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); + seqpacket_allow = vsock->seqpacket_allow; + rcu_read_unlock(); + + return seqpacket_allow; +} + static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = @@ -608,10 +629,14 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->event_run = true; mutex_unlock(&vsock->event_lock); + if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET)) + vsock->seqpacket_allow = true; + vdev->priv = vsock; rcu_assign_pointer(the_virtio_vsock, vsock); mutex_unlock(&the_virtio_vsock_mutex); + return 0; out: @@ -695,6 +720,7 @@ static struct virtio_device_id id_table[] = { }; static unsigned int features[] = { + VIRTIO_VSOCK_F_SEQPACKET }; static struct virtio_driver virtio_vsock_driver = { -- cgit v1.2.3-58-ga151 From 6e90a57795aa6ab2ab65fd6ac76ee0b245e5988a Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Fri, 11 Jun 2021 14:13:50 +0300 Subject: vsock/loopback: enable SEQPACKET for transport Add SEQPACKET ops for loopback transport and 'seqpacket_allow()' callback. Signed-off-by: Arseny Krasnov Signed-off-by: David S. Miller --- net/vmw_vsock/vsock_loopback.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index a45f7ffca8c5..169a8cf65b39 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -63,6 +63,8 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } +static bool vsock_loopback_seqpacket_allow(u32 remote_cid); + static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, @@ -89,6 +91,11 @@ static struct virtio_transport loopback_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = vsock_loopback_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -105,6 +112,11 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; +static bool vsock_loopback_seqpacket_allow(u32 remote_cid) +{ + return true; +} + static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = -- cgit v1.2.3-58-ga151 From 87c272c618c7197b24fd3acf2d337315bd93b4fa Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 11 Jun 2021 09:45:01 +0200 Subject: net/af_iucv: clean up some forward declarations The forward declarations for the iucv_handler callbacks are causing various compile warnings with gcc-11. Reshuffle the code to get rid of these prototypes. Reported-by: Sven Schnelle Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0fdb389c3390..44453b35c7b7 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -44,6 +44,7 @@ static struct proto iucv_proto = { }; static struct iucv_interface *pr_iucv; +static struct iucv_handler af_iucv_handler; /* special AF_IUCV IPRM messages */ static const u8 iprm_shutdown[8] = @@ -91,28 +92,11 @@ static void iucv_sock_close(struct sock *sk); static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify); -/* Call Back functions */ -static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); -static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); -static void iucv_callback_connack(struct iucv_path *, u8 *); -static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *); -static void iucv_callback_connrej(struct iucv_path *, u8 *); -static void iucv_callback_shutdown(struct iucv_path *, u8 *); - static struct iucv_sock_list iucv_sk_list = { .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), .autobind_name = ATOMIC_INIT(0) }; -static struct iucv_handler af_iucv_handler = { - .path_pending = iucv_callback_connreq, - .path_complete = iucv_callback_connack, - .path_severed = iucv_callback_connrej, - .message_pending = iucv_callback_rx, - .message_complete = iucv_callback_txdone, - .path_quiesced = iucv_callback_shutdown, -}; - static inline void high_nmcpy(unsigned char *dst, char *src) { memcpy(dst, src, 8); @@ -1817,6 +1801,15 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16]) bh_unlock_sock(sk); } +static struct iucv_handler af_iucv_handler = { + .path_pending = iucv_callback_connreq, + .path_complete = iucv_callback_connack, + .path_severed = iucv_callback_connrej, + .message_pending = iucv_callback_rx, + .message_complete = iucv_callback_txdone, + .path_quiesced = iucv_callback_shutdown, +}; + /***************** HiperSockets transport callbacks ********************/ static void afiucv_swap_src_dest(struct sk_buff *skb) { -- cgit v1.2.3-58-ga151 From 8c713dc93ca9a423d6af8849c9254742a1070c37 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 12 Jun 2021 10:20:54 +0200 Subject: rtnetlink: add alloc() method to rtnl_link_ops In order to make rtnetlink ops that can create different kinds of devices, like what we want to add to the WWAN framework, the priv_size and setup parameters aren't quite sufficient. Make this easier to manage by allowing ops to allocate their own netdev via an @alloc method that gets the tb netlink data. Signed-off-by: Johannes Berg Signed-off-by: Sergey Ryazanov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 8 ++++++++ net/core/rtnetlink.c | 19 ++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 479f60ef54c0..384e800665f2 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -37,6 +37,9 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters + * @alloc: netdev allocation function, can be %NULL and is then used + * in place of alloc_netdev_mqs(), in this case @priv_size + * and @setup are unused. Returns a netdev or ERR_PTR(). * @priv_size: sizeof net_device private space * @setup: net_device setup function * @newlink: Function for configuring and registering a new device @@ -63,6 +66,11 @@ struct rtnl_link_ops { const char *kind; size_t priv_size; + struct net_device *(*alloc)(struct nlattr *tb[], + const char *ifname, + unsigned char name_assign_type, + unsigned int num_tx_queues, + unsigned int num_rx_queues); void (*setup)(struct net_device *dev); bool netns_refund; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cd87c7661c72..92c3e43db812 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -376,12 +376,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - /* The check for setup is here because if ops + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ - if (ops->setup && !ops->dellink) + if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -3165,8 +3165,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, return ERR_PTR(-EINVAL); } - dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, - ops->setup, num_tx_queues, num_rx_queues); + if (ops->alloc) { + dev = ops->alloc(tb, ifname, name_assign_type, + num_tx_queues, num_rx_queues); + if (IS_ERR(dev)) + return dev; + } else { + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + } + if (!dev) return ERR_PTR(-ENOMEM); @@ -3399,7 +3408,7 @@ replay: return -EOPNOTSUPP; } - if (!ops->setup) + if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (!ifname[0]) { -- cgit v1.2.3-58-ga151 From 00e77ed8e64d5f271c1f015c7153545980d48a76 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 12 Jun 2021 10:20:55 +0200 Subject: rtnetlink: add IFLA_PARENT_[DEV|DEV_BUS]_NAME In some cases, for example in the upcoming WWAN framework changes, there's no natural "parent netdev", so sometimes dummy netdevs are created or similar. IFLA_PARENT_DEV_NAME is a new attribute intended to contain a device (sysfs, struct device) name that can be used instead when creating a new netdev, if the rtnetlink family implements it. As suggested by Parav Pandit, we also introduce IFLA_PARENT_DEV_BUS_NAME attribute in order to uniquely identify a device on the system (with bus/name pair). ip-link(8) support for the generic parent device attributes will help us avoid code duplication, so no other link type will require a custom code to handle the parent name attribute. E.g. the WWAN interface creation command will looks like this: $ ip link add wwan0-1 parent-dev wwan0 type wwan channel-id 1 So, some future subsystem (or driver) FOO will have an interface creation command that looks like this: $ ip link add foo1-3 parent-dev foo1 type foo bar-id 3 baz-type Y Below is an example of dumping link info of a random device with these new attributes: $ ip --details link show wlp0s20f3 4: wlp0s20f3: mtu 1500 qdisc noqueue state UP mode DORMANT group default qlen 1000 ... parent_bus pci parent_dev 0000:00:14.3 Co-developed-by: Sergey Ryazanov Signed-off-by: Sergey Ryazanov Co-developed-by: Loic Poulain Signed-off-by: Loic Poulain Suggested-by: Sergey Ryazanov Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 7 +++++++ net/core/rtnetlink.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a5a7f0e64865..4882e81514b6 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -341,6 +341,13 @@ enum { IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 92c3e43db812..170e97f3b3c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1821,6 +1821,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; + if (dev->dev.parent && + nla_put_string(skb, IFLA_PARENT_DEV_NAME, + dev_name(dev->dev.parent))) + goto nla_put_failure; + + if (dev->dev.parent && dev->dev.parent->bus && + nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, + dev->dev.parent->bus->name)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3-58-ga151 From 88b710532e53de2466d1033fb1d5125aabf3215a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 12 Jun 2021 10:20:56 +0200 Subject: wwan: add interface creation support Add support to create (and destroy) interfaces via a new rtnetlink kind "wwan". The responsible driver has to use the new wwan_register_ops() to make this possible. Signed-off-by: Johannes Berg Signed-off-by: Sergey Ryazanov Signed-off-by: Loic Poulain Signed-off-by: David S. Miller --- drivers/net/wwan/wwan_core.c | 245 +++++++++++++++++++++++++++++++++++++++++-- include/linux/wwan.h | 24 +++++ include/uapi/linux/wwan.h | 16 +++ net/core/rtnetlink.c | 1 + 4 files changed, 279 insertions(+), 7 deletions(-) create mode 100644 include/uapi/linux/wwan.h (limited to 'net') diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 45a41aee8958..7e728042fc41 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include /* Maximum number of minors in use */ #define WWAN_MAX_MINORS (1 << MINORBITS) @@ -35,10 +37,16 @@ static int wwan_major; * * @id: WWAN device unique ID. * @dev: Underlying device. + * @port_id: Current available port ID to pick. + * @ops: wwan device ops + * @ops_ctxt: context to pass to ops */ struct wwan_device { unsigned int id; struct device dev; + atomic_t port_id; + const struct wwan_ops *ops; + void *ops_ctxt; }; /** @@ -102,7 +110,8 @@ static const struct device_type wwan_dev_type = { static int wwan_dev_parent_match(struct device *dev, const void *parent) { - return (dev->type == &wwan_dev_type && dev->parent == parent); + return (dev->type == &wwan_dev_type && + (dev->parent == parent || dev == parent)); } static struct wwan_device *wwan_dev_get_by_parent(struct device *parent) @@ -116,6 +125,23 @@ static struct wwan_device *wwan_dev_get_by_parent(struct device *parent) return to_wwan_dev(dev); } +static int wwan_dev_name_match(struct device *dev, const void *name) +{ + return dev->type == &wwan_dev_type && + strcmp(dev_name(dev), name) == 0; +} + +static struct wwan_device *wwan_dev_get_by_name(const char *name) +{ + struct device *dev; + + dev = class_find_device(wwan_class, NULL, name, wwan_dev_name_match); + if (!dev) + return ERR_PTR(-ENODEV); + + return to_wwan_dev(dev); +} + /* This function allocates and registers a new WWAN device OR if a WWAN device * already exist for the given parent, it gets a reference and return it. * This function is not exported (for now), it is called indirectly via @@ -180,9 +206,14 @@ static void wwan_remove_dev(struct wwan_device *wwandev) /* WWAN device is created and registered (get+add) along with its first * child port, and subsequent port registrations only grab a reference * (get). The WWAN device must then be unregistered (del+put) along with - * its latest port, and reference simply dropped (put) otherwise. + * its last port, and reference simply dropped (put) otherwise. In the + * same fashion, we must not unregister it when the ops are still there. */ - ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child); + if (wwandev->ops) + ret = 1; + else + ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child); + if (!ret) device_unregister(&wwandev->dev); else @@ -750,26 +781,226 @@ static const struct file_operations wwan_port_fops = { .llseek = noop_llseek, }; +/** + * wwan_register_ops - register WWAN device ops + * @parent: Device to use as parent and shared by all WWAN ports and + * created netdevs + * @ops: operations to register + * @ctxt: context to pass to operations + * + * Returns: 0 on success, a negative error code on failure + */ +int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, + void *ctxt) +{ + struct wwan_device *wwandev; + + if (WARN_ON(!parent || !ops)) + return -EINVAL; + + wwandev = wwan_create_dev(parent); + if (!wwandev) + return -ENOMEM; + + if (WARN_ON(wwandev->ops)) { + wwan_remove_dev(wwandev); + return -EBUSY; + } + + if (!try_module_get(ops->owner)) { + wwan_remove_dev(wwandev); + return -ENODEV; + } + + wwandev->ops = ops; + wwandev->ops_ctxt = ctxt; + + return 0; +} +EXPORT_SYMBOL_GPL(wwan_register_ops); + +/** + * wwan_unregister_ops - remove WWAN device ops + * @parent: Device to use as parent and shared by all WWAN ports and + * created netdevs + */ +void wwan_unregister_ops(struct device *parent) +{ + struct wwan_device *wwandev = wwan_dev_get_by_parent(parent); + bool has_ops; + + if (WARN_ON(IS_ERR(wwandev))) + return; + + has_ops = wwandev->ops; + + /* put the reference obtained by wwan_dev_get_by_parent(), + * we should still have one (that the owner is giving back + * now) due to the ops being assigned, check that below + * and return if not. + */ + put_device(&wwandev->dev); + + if (WARN_ON(!has_ops)) + return; + + module_put(wwandev->ops->owner); + + wwandev->ops = NULL; + wwandev->ops_ctxt = NULL; + wwan_remove_dev(wwandev); +} +EXPORT_SYMBOL_GPL(wwan_unregister_ops); + +static int wwan_rtnl_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + if (!data) + return -EINVAL; + + if (!tb[IFLA_PARENT_DEV_NAME]) + return -EINVAL; + + if (!data[IFLA_WWAN_LINK_ID]) + return -EINVAL; + + return 0; +} + +static struct device_type wwan_type = { .name = "wwan" }; + +static struct net_device *wwan_rtnl_alloc(struct nlattr *tb[], + const char *ifname, + unsigned char name_assign_type, + unsigned int num_tx_queues, + unsigned int num_rx_queues) +{ + const char *devname = nla_data(tb[IFLA_PARENT_DEV_NAME]); + struct wwan_device *wwandev = wwan_dev_get_by_name(devname); + struct net_device *dev; + + if (IS_ERR(wwandev)) + return ERR_CAST(wwandev); + + /* only supported if ops were registered (not just ports) */ + if (!wwandev->ops) { + dev = ERR_PTR(-EOPNOTSUPP); + goto out; + } + + dev = alloc_netdev_mqs(wwandev->ops->priv_size, ifname, name_assign_type, + wwandev->ops->setup, num_tx_queues, num_rx_queues); + + if (dev) { + SET_NETDEV_DEV(dev, &wwandev->dev); + SET_NETDEV_DEVTYPE(dev, &wwan_type); + } + +out: + /* release the reference */ + put_device(&wwandev->dev); + return dev; +} + +static int wwan_rtnl_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent); + u32 link_id = nla_get_u32(data[IFLA_WWAN_LINK_ID]); + int ret; + + if (IS_ERR(wwandev)) + return PTR_ERR(wwandev); + + /* shouldn't have a netdev (left) with us as parent so WARN */ + if (WARN_ON(!wwandev->ops)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (wwandev->ops->newlink) + ret = wwandev->ops->newlink(wwandev->ops_ctxt, dev, + link_id, extack); + else + ret = register_netdevice(dev); + +out: + /* release the reference */ + put_device(&wwandev->dev); + return ret; +} + +static void wwan_rtnl_dellink(struct net_device *dev, struct list_head *head) +{ + struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent); + + if (IS_ERR(wwandev)) + return; + + /* shouldn't have a netdev (left) with us as parent so WARN */ + if (WARN_ON(!wwandev->ops)) + goto out; + + if (wwandev->ops->dellink) + wwandev->ops->dellink(wwandev->ops_ctxt, dev, head); + else + unregister_netdevice(dev); + +out: + /* release the reference */ + put_device(&wwandev->dev); +} + +static const struct nla_policy wwan_rtnl_policy[IFLA_WWAN_MAX + 1] = { + [IFLA_WWAN_LINK_ID] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops wwan_rtnl_link_ops __read_mostly = { + .kind = "wwan", + .maxtype = __IFLA_WWAN_MAX, + .alloc = wwan_rtnl_alloc, + .validate = wwan_rtnl_validate, + .newlink = wwan_rtnl_newlink, + .dellink = wwan_rtnl_dellink, + .policy = wwan_rtnl_policy, +}; + static int __init wwan_init(void) { + int err; + + err = rtnl_link_register(&wwan_rtnl_link_ops); + if (err) + return err; + wwan_class = class_create(THIS_MODULE, "wwan"); - if (IS_ERR(wwan_class)) - return PTR_ERR(wwan_class); + if (IS_ERR(wwan_class)) { + err = PTR_ERR(wwan_class); + goto unregister; + } /* chrdev used for wwan ports */ wwan_major = __register_chrdev(0, 0, WWAN_MAX_MINORS, "wwan_port", &wwan_port_fops); if (wwan_major < 0) { - class_destroy(wwan_class); - return wwan_major; + err = wwan_major; + goto destroy; } return 0; + +destroy: + class_destroy(wwan_class); +unregister: + rtnl_link_unregister(&wwan_rtnl_link_ops); + return err; } static void __exit wwan_exit(void) { __unregister_chrdev(wwan_major, 0, WWAN_MAX_MINORS, "wwan_port"); + rtnl_link_unregister(&wwan_rtnl_link_ops); class_destroy(wwan_class); } diff --git a/include/linux/wwan.h b/include/linux/wwan.h index fa33cc16d931..430a3a0817de 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -7,6 +7,7 @@ #include #include #include +#include /** * enum wwan_port_type - WWAN port types @@ -116,4 +117,27 @@ void wwan_port_txon(struct wwan_port *port); */ void *wwan_port_get_drvdata(struct wwan_port *port); +/** + * struct wwan_ops - WWAN device ops + * @owner: module owner of the WWAN ops + * @priv_size: size of private netdev data area + * @setup: set up a new netdev + * @newlink: register the new netdev + * @dellink: remove the given netdev + */ +struct wwan_ops { + struct module *owner; + unsigned int priv_size; + void (*setup)(struct net_device *dev); + int (*newlink)(void *ctxt, struct net_device *dev, + u32 if_id, struct netlink_ext_ack *extack); + void (*dellink)(void *ctxt, struct net_device *dev, + struct list_head *head); +}; + +int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, + void *ctxt); + +void wwan_unregister_ops(struct device *parent); + #endif /* __WWAN_H */ diff --git a/include/uapi/linux/wwan.h b/include/uapi/linux/wwan.h new file mode 100644 index 000000000000..32a2720b4d11 --- /dev/null +++ b/include/uapi/linux/wwan.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Copyright (C) 2021 Intel Corporation. + */ +#ifndef _UAPI_WWAN_H_ +#define _UAPI_WWAN_H_ + +enum { + IFLA_WWAN_UNSPEC, + IFLA_WWAN_LINK_ID, /* u32 */ + + __IFLA_WWAN_MAX +}; +#define IFLA_WWAN_MAX (__IFLA_WWAN_MAX - 1) + +#endif /* _UAPI_WWAN_H_ */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 170e97f3b3c6..5baa86bca876 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1890,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3-58-ga151 From 6fd06963fa74197103cdbb4b494763127b3f2f34 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Jun 2021 15:21:49 +0200 Subject: xfrm: Fix error reporting in xfrm_state_construct. When memory allocation for XFRMA_ENCAP or XFRMA_COADDR fails, the error will not be reported because the -ENOMEM assignment to the err variable is overwritten before. Fix this by moving these two in front of the function so that memory allocation failures will be reported. Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5a0ef4361e43..817e714dedea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -580,6 +580,20 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, copy_from_user_state(x, p); + if (attrs[XFRMA_ENCAP]) { + x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), + sizeof(*x->encap), GFP_KERNEL); + if (x->encap == NULL) + goto error; + } + + if (attrs[XFRMA_COADDR]) { + x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), + sizeof(*x->coaddr), GFP_KERNEL); + if (x->coaddr == NULL) + goto error; + } + if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); @@ -600,23 +614,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_COMP]))) goto error; - if (attrs[XFRMA_ENCAP]) { - x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), - sizeof(*x->encap), GFP_KERNEL); - if (x->encap == NULL) - goto error; - } - if (attrs[XFRMA_TFCPAD]) x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); - if (attrs[XFRMA_COADDR]) { - x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), - sizeof(*x->coaddr), GFP_KERNEL); - if (x->coaddr == NULL) - goto error; - } - xfrm_mark_get(attrs, &x->mark); xfrm_smark_init(attrs, &x->props.smark); -- cgit v1.2.3-58-ga151 From b5ec0705ffe891910e8e615b2efb1c2b292e917c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 13 Jun 2021 14:46:36 +0100 Subject: ipv6: fib6: remove redundant initialization of variable err The variable err is being initialized with a value that is never read, the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 8f9a83314de7..40f3e4f9f33a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -467,7 +467,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; - int err = -ENOMEM; + int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) -- cgit v1.2.3-58-ga151 From ffa85b73c3c4143a8e8087c0930f6c5a6ead8e9f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 13 Jun 2021 14:43:44 +0000 Subject: mld: avoid unnecessary high order page allocation in mld_newpack() If link mtu is too big, mld_newpack() allocates high-order page. But most mld packets don't need high-order page. So, it might waste unnecessary pages. To avoid this, it makes mld_newpack() try to allocate order-0 page. Suggested-by: Eric Dumazet Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d36ef9d25e73..54ec163fbafa 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1729,22 +1729,25 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { + u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, + 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; - struct sk_buff *skb; - struct mld2_report *pmr; - struct in6_addr addr_buf; - const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu + hlen + tlen; + struct net *net = dev_net(dev); + const struct in6_addr *saddr; + struct in6_addr addr_buf; + struct mld2_report *pmr; + struct sk_buff *skb; + unsigned int size; + struct sock *sk; int err; - u8 ra[8] = { IPPROTO_ICMPV6, 0, - IPV6_TLV_ROUTERALERT, 2, 0, 0, - IPV6_TLV_PADN, 0 }; - /* we assume size > sizeof(ra) here */ + sk = net->ipv6.igmp_sk; + /* we assume size > sizeof(ra) here + * Also try to not allocate high-order pages for big MTU + */ + size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; -- cgit v1.2.3-58-ga151 From c916e8e1ea724db0f7bae36c11aaadc631226321 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 14 Jun 2021 06:31:24 +0200 Subject: net: dsa: dsa_slave_phy_connect(): extend phy's flags with port specific phy flags The current get_phy_flags() is only processed when we connect to a PHY via a designed phy-handle property via phylink_of_phy_connect(), but if we fallback on the internal MDIO bus created by a switch and take the dsa_slave_phy_connect() path then we would not be processing that flag and using it at PHY connection time. Suggested-by: Florian Fainelli Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- net/dsa/slave.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3ca509eb284d..798944aa847a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1749,7 +1749,8 @@ static void dsa_slave_phylink_fixed_state(struct phylink_config *config, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr, + u32 flags) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_switch *ds = dp->ds; @@ -1760,6 +1761,8 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) return -ENODEV; } + slave_dev->phydev->dev_flags |= flags; + return phylink_connect_phy(dp->pl, slave_dev->phydev); } @@ -1804,7 +1807,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ - ret = dsa_slave_phy_connect(slave_dev, dp->index); + ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", -- cgit v1.2.3-58-ga151 From ddee9dbc3d7aec1cd9fdcc671db2dd0016fd0f3d Mon Sep 17 00:00:00 2001 From: Oleksandr Mazur Date: Mon, 14 Jun 2021 16:01:12 +0300 Subject: net: core: devlink: add dropped stats traps field Whenever query statistics is issued for trap, devlink subsystem would also fill-in statistics 'dropped' field. This field indicates the number of packets HW dropped and failed to report to the device driver, and thus - to the devlink subsystem itself. In case if device driver didn't register callback for hard drop statistics querying, 'dropped' field will be omitted and not filled. Signed-off-by: Oleksandr Mazur Signed-off-by: David S. Miller --- include/net/devlink.h | 10 ++++++++++ net/core/devlink.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index eb045f1b5d1d..57b738b78073 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1347,6 +1347,16 @@ struct devlink_ops { const struct devlink_trap_group *group, enum devlink_trap_action action, struct netlink_ext_ack *extack); + /** + * @trap_drop_counter_get: Trap drop counter get function. + * + * Should be used by device drivers to report number of packets + * that have been dropped, and cannot be passed to the devlink + * subsystem by the underlying device. + */ + int (*trap_drop_counter_get)(struct devlink *devlink, + const struct devlink_trap *trap, + u64 *p_drops); /** * @trap_policer_init: Trap policer initialization function. * diff --git a/net/core/devlink.c b/net/core/devlink.c index 3bdb7eac730a..566ddd147633 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7519,8 +7519,9 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, } } -static int devlink_trap_stats_put(struct sk_buff *msg, - struct devlink_stats __percpu *trap_stats) +static int +devlink_trap_group_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; @@ -7548,6 +7549,50 @@ nla_put_failure: return -EMSGSIZE; } +static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item) +{ + struct devlink_stats stats; + struct nlattr *attr; + u64 drops = 0; + int err; + + if (devlink->ops->trap_drop_counter_get) { + err = devlink->ops->trap_drop_counter_get(devlink, + trap_item->trap, + &drops); + if (err) + return err; + } + + devlink_trap_stats_read(trap_item->stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (devlink->ops->trap_drop_counter_get && + nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + stats.rx_packets, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + stats.rx_bytes, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, @@ -7585,7 +7630,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = devlink_trap_stats_put(msg, trap_item->stats); + err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; @@ -7802,7 +7847,7 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, group_item->policer_item->policer->id)) goto nla_put_failure; - err = devlink_trap_stats_put(msg, group_item->stats); + err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; -- cgit v1.2.3-58-ga151 From ec13357263fb672390250fcfaa4c86b6dce66062 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Jun 2021 16:58:19 +0300 Subject: net: flow_dissector: fix RPS on DSA masters After the blamed patch, __skb_flow_dissect() on the DSA master stopped adjusting for the length of the DSA headers. This is because it was told to adjust only if the needed_headroom is zero, aka if there is no DSA header. Of course, the adjustment should be done only if there _is_ a DSA header. Modify the comment too so it is clearer. Fixes: 4e50025129ef ("net: dsa: generalize overhead for taggers that use both headers and trailers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c04455981c1e..2aadbfc5193b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -943,8 +943,8 @@ bool __skb_flow_dissect(const struct net *net, int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; - /* Tail taggers don't break flow dissection */ - if (!ops->needed_headroom) { + /* Only DSA header taggers break flow dissection */ + if (ops->needed_headroom) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else -- cgit v1.2.3-58-ga151 From f9ac779f881c2ec3d1cdcd7fa9d4f9442bf60e80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:14 +0900 Subject: net: Introduce net.ipv4.tcp_migrate_req. This commit adds a new sysctl option: net.ipv4.tcp_migrate_req. If this option is enabled or eBPF program is attached, we will be able to migrate child sockets from a listener to another in the same reuseport group after close() or shutdown() syscalls. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Benjamin Herrenschmidt Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-2-kuniyu@amazon.co.jp --- Documentation/networking/ip-sysctl.rst | 25 +++++++++++++++++++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index a5c250044500..b0436d3a4f11 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -761,6 +761,31 @@ tcp_syncookies - INTEGER network connections you can set this knob to 2 to enable unconditionally generation of syncookies. +tcp_migrate_req - BOOLEAN + The incoming connection is tied to a specific listening socket when + the initial SYN packet is received during the three-way handshake. + When a listener is closed, in-flight request sockets during the + handshake and established sockets in the accept queue are aborted. + + If the listener has SO_REUSEPORT enabled, other listeners on the + same port should have been able to accept such connections. This + option makes it possible to migrate such child sockets to another + listener after close() or shutdown(). + + The BPF_SK_REUSEPORT_SELECT_OR_MIGRATE type of eBPF program should + usually be used to define the policy to pick an alive listener. + Otherwise, the kernel will randomly pick an alive listener only if + this option is enabled. + + Note that migration between listeners with different settings may + crash applications. Let's say migration happens from listener A to + B, and only B has TCP_SAVE_SYN enabled. B cannot read SYN data from + the requests migrated from A. To avoid such a situation, cancel + migration by returning SK_DROP in the type of eBPF program, or + disable this option. + + Default: 0 + tcp_fastopen - INTEGER Enable TCP Fast Open (RFC7413) to send and accept data in the opening SYN packet. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 746c80cd4257..b8620519eace 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -126,6 +126,7 @@ struct netns_ipv4 { u8 sysctl_tcp_syn_retries; u8 sysctl_tcp_synack_retries; u8 sysctl_tcp_syncookies; + u8 sysctl_tcp_migrate_req; int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4fa77f182dcb..6f1e64d49232 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -960,6 +960,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, }, #endif + { + .procname = "tcp_migrate_req", + .data = &init_net.ipv4.sysctl_tcp_migrate_req, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, -- cgit v1.2.3-58-ga151 From 5c040eaf5d1753aafe12989ca712175df0b9c436 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:15 +0900 Subject: tcp: Add num_closed_socks to struct sock_reuseport. As noted in the following commit, a closed listener has to hold the reference to the reuseport group for socket migration. This patch adds a field (num_closed_socks) to struct sock_reuseport to manage closed sockets within the same reuseport group. Moreover, this and the following commits introduce some helper functions to split socks[] into two sections and keep TCP_LISTEN and TCP_CLOSE sockets in each section. Like a double-ended queue, we will place TCP_LISTEN sockets from the front and TCP_CLOSE sockets from the end. TCP_LISTEN----------> <-------TCP_CLOSE +---+---+ --- +---+ --- +---+ --- +---+ | 0 | 1 | ... | i | ... | j | ... | k | +---+---+ --- +---+ --- +---+ --- +---+ i = num_socks - 1 j = max_socks - num_closed_socks k = max_socks - 1 This patch also extends reuseport_add_sock() and reuseport_grow() to support num_closed_socks. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-3-kuniyu@amazon.co.jp --- include/net/sock_reuseport.h | 5 +-- net/core/sock_reuseport.c | 75 +++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 505f1e18e9bf..0e558ca7afbf 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; - u16 max_socks; /* length of socks */ - u16 num_socks; /* elements in socks */ + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + u16 num_closed_socks; /* closed elements in socks */ /* The last synq overflow event timestamp of this * reuse->socks[] group. */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index b065f0a103ed..f478c65a281b 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -18,6 +18,49 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + + return true; +} + static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -72,9 +115,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) } reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; - reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -98,6 +141,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) return NULL; more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; @@ -105,9 +149,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); - for (i = 0; i < reuse->num_socks; ++i) + for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); @@ -158,7 +206,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) return -EBUSY; } - if (reuse->num_socks == reuse->max_socks) { + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -166,10 +214,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } } - reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ - smp_wmb(); - reuse->num_socks++; + __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); @@ -183,7 +228,6 @@ EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; - int i; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, @@ -200,16 +244,11 @@ void reuseport_detach_sock(struct sock *sk) bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); - for (i = 0; i < reuse->num_socks; i++) { - if (reuse->socks[i] == sk) { - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; - reuse->num_socks--; - if (reuse->num_socks == 0) - call_rcu(&reuse->rcu, reuseport_free_rcu); - break; - } - } spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); @@ -274,7 +313,7 @@ struct sock *reuseport_select_sock(struct sock *sk, prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { - /* paired with smp_wmb() in reuseport_add_sock() */ + /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) -- cgit v1.2.3-58-ga151 From 333bb73f620e1a5f2e0b8df2c0d25300fab36d89 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:16 +0900 Subject: tcp: Keep TCP_CLOSE sockets in the reuseport group. When we close a listening socket, to migrate its connections to another listener in the same reuseport group, we have to handle two kinds of child sockets. One is that a listening socket has a reference to, and the other is not. The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the accept queue of their listening socket. So we can pop them out and push them into another listener's queue at close() or shutdown() syscalls. On the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the three-way handshake and not in the accept queue. Thus, we cannot access such sockets at close() or shutdown() syscalls. Accordingly, we have to migrate immature sockets after their listening socket has been closed. Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At that time, if we could select a new listener from the same reuseport group, no connection would be aborted. However, we cannot do that because reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to the reuseport group from closed sockets. This patch allows TCP_CLOSE sockets to remain in the reuseport group and access it while any child socket references them. The point is that reuseport_detach_sock() was called twice from inet_unhash() and sk_destruct(). This patch replaces the first reuseport_detach_sock() with reuseport_stop_listen_sock(), which checks if the reuseport group is capable of migration. If capable, it decrements num_socks, moves the socket backwards in socks[] and increments num_closed_socks. When all connections are migrated, sk_destruct() calls reuseport_detach_sock() to remove the socket from socks[], decrement num_closed_socks, and set NULL to sk_reuseport_cb. By this change, closed or shutdowned sockets can keep sk_reuseport_cb. Consequently, calling listen() after shutdown() can cause EADDRINUSE or EBUSY in inet_csk_bind_conflict() or reuseport_add_sock() which expects such sockets not to have the reuseport group. Therefore, this patch also loosens such validation rules so that a socket can listen again if it has a reuseport group with num_closed_socks more than 0. When such sockets listen again, we handle them in reuseport_resurrect(). If there is an existing reuseport group (reuseport_add_sock() path), we move the socket from the old group to the new one and free the old one if necessary. If there is no existing group (reuseport_alloc() path), we allocate a new reuseport group, detach sk from the old one, and free it if necessary, not to break the current shutdown behaviour: - we cannot carry over the eBPF prog of shutdowned sockets - we cannot attach/detach an eBPF prog to/from listening sockets via shutdowned sockets Note that when the number of sockets gets over U16_MAX, we try to detach a closed socket randomly to make room for the new listening socket in reuseport_grow(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20210612123224.12525-4-kuniyu@amazon.co.jp --- include/net/sock_reuseport.h | 1 + net/core/sock_reuseport.c | 182 ++++++++++++++++++++++++++++++++++++++-- net/ipv4/inet_connection_sock.c | 12 ++- net/ipv4/inet_hashtables.c | 2 +- 4 files changed, 186 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0e558ca7afbf..1333d0cddfbc 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -32,6 +32,7 @@ extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); +void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index f478c65a281b..41fcd55ab5ae 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -17,6 +17,8 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, @@ -61,6 +63,29 @@ static bool __reuseport_detach_sock(struct sock *sk, return true; } +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + + return true; +} + static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -92,6 +117,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. @@ -133,8 +164,23 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; - if (more_socks_size > U16_MAX) + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + return NULL; + } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) @@ -200,7 +246,15 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)); + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; @@ -225,6 +279,65 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } EXPORT_SYMBOL(reuseport_add_sock); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; @@ -233,6 +346,10 @@ void reuseport_detach_sock(struct sock *sk) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * @@ -244,15 +361,49 @@ void reuseport_detach_sock(struct sock *sk) bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); - __reuseport_detach_sock(sk, reuse); + + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); if (reuse->num_socks + reuse->num_closed_socks == 0) call_rcu(&reuse->rcu, reuseport_free_rcu); +out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) @@ -352,9 +503,13 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (sk_unhashed(sk) && sk->sk_reuseport) { - int err = reuseport_alloc(sk, false); + if (sk_unhashed(sk)) { + int err; + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -380,13 +535,24 @@ int reuseport_detach_prog(struct sock *sk) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return sk->sk_reuseport ? -ENOENT : -EINVAL; - old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd472eae4f5c..fa806e9167ec 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -156,14 +164,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c96866a53a66..80aeaf9e6e16 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -697,7 +697,7 @@ void inet_unhash(struct sock *sk) goto unlock; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); ilb->count--; -- cgit v1.2.3-58-ga151 From 1cd62c21572c1df6e7090ea4cabf4cf509616dbb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:17 +0900 Subject: tcp: Add reuseport_migrate_sock() to select a new listener. reuseport_migrate_sock() does the same check done in reuseport_listen_stop_sock(). If the reuseport group is capable of migration, reuseport_migrate_sock() selects a new listener by the child socket hash and increments the listener's sk_refcnt beforehand. Thus, if we fail in the migration, we have to decrement it later. We will support migration by eBPF in the later commits. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20210612123224.12525-5-kuniyu@amazon.co.jp --- include/net/sock_reuseport.h | 3 ++ net/core/sock_reuseport.c | 78 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 1333d0cddfbc..473b0b0fa4ab 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 41fcd55ab5ae..b239f8cd9d39 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -44,7 +44,7 @@ static void __reuseport_add_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; } @@ -434,6 +434,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, return reuse->socks[index]; } +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= num_socks) + i = 0; + if (i == j) + return NULL; + } + + return reuse->socks[i]; +} + /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. @@ -477,19 +494,8 @@ struct sock *reuseport_select_sock(struct sock *sk, select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) { - int i, j; - - i = j = reciprocal_scale(hash, socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { - i++; - if (i >= socks) - i = 0; - if (i == j) - goto out; - } - sk2 = reuse->socks[i]; - } + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: @@ -498,6 +504,50 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). + */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto out; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + nsk = NULL; + +out: + rcu_read_unlock(); + return nsk; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; -- cgit v1.2.3-58-ga151 From 54b92e84193749c9968aff2dd46e3b0f42643e18 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:18 +0900 Subject: tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues. When we call close() or shutdown() for listening sockets, each child socket in the accept queue are freed at inet_csk_listen_stop(). If we can get a new listener by reuseport_migrate_sock() and clone the request by inet_reqsk_clone(), we try to add it into the new listener's accept queue by inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free() to call sock_put() for its listener and free the cloned request. After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets NULL to ireq_opt/pktopts in struct inet_request_sock, but ipv6_opt can be non-NULL. So, we have to set NULL to ipv6_opt of the old request to avoid double free. Note that we do not update req->rsk_listener and instead clone the req to migrate because another path may reference the original request. If we protected it by RCU, we would need to add rcu_read_lock() in many places. Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-6-kuniyu@amazon.co.jp --- net/ipv4/inet_connection_sock.c | 70 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fa806e9167ec..08878ef1bc70 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -695,6 +695,52 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -1036,14 +1082,36 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + reqsk_migrate_reset(req); + } else { + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); -- cgit v1.2.3-58-ga151 From c905dee62232db583b50fe214080b98db623151e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:19 +0900 Subject: tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs. As with the preceding patch, this patch changes reqsk_timer_handler() to call reuseport_migrate_sock() and inet_reqsk_clone() to migrate in-flight requests at retransmitting SYN+ACKs. If we can select a new listener and clone the request, we resume setting the SYN+ACK timer for the new req. If we can set the timer, we call inet_ehash_insert() to unhash the old req and put the new req into ehash. The noteworthy point here is that by unhashing the old req, another CPU processing it may lose the "own_req" race in tcp_v[46]_syn_recv_sock() and drop the final ACK packet. However, the new timer will recover this situation. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-7-kuniyu@amazon.co.jp --- net/ipv4/inet_connection_sock.c | 75 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 08878ef1bc70..f4b771e45ac1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -734,10 +734,22 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, return nreq; } +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + static void reqsk_migrate_reset(struct request_sock *req) { + req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; #endif } @@ -781,15 +793,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -808,6 +844,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -832,10 +869,36 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto drop; + } + + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, -- cgit v1.2.3-58-ga151 From d4f2c86b2b7e2e606e0868b38c8c6c49cc193a8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:20 +0900 Subject: tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK. This patch also changes the code to call reuseport_migrate_sock() and inet_reqsk_clone(), but unlike the other cases, we do not call inet_reqsk_clone() right after reuseport_migrate_sock(). Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener has three kinds of refcnt: (A) for listener itself (B) carried by reuqest_sock (C) sock_hold() in tcp_v[46]_rcv() While processing the req, (A) may disappear by close(listener). Also, (B) can disappear by accept(listener) once we put the req into the accept queue. So, we have to hold another refcnt (C) for the listener to prevent use-after-free. For socket migration, we call reuseport_migrate_sock() to select a listener with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv(). This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv(). Thus we have to take another refcnt (B) for the newly cloned request_sock. In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and try to put the new req into the accept queue. By migrating req after winning the "own_req" race, we can avoid such a worst situation: CPU 1 looks up req1 CPU 2 looks up req1, unhashes it, then CPU 1 loses the race CPU 3 looks up req2, unhashes it, then CPU 2 loses the race ... Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210612123224.12525-8-kuniyu@amazon.co.jp --- net/ipv4/inet_connection_sock.c | 34 +++++++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 20 ++++++++++++++------ net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv6/tcp_ipv6.c | 14 +++++++++++--- 4 files changed, 58 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f4b771e45ac1..0eea878edc30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1114,12 +1114,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4f5b68a90be9..6cb8e269f1ab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2002,13 +2002,21 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. - */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7513ba45553d..f258a4c0da71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4435fa342e7a..4d71464094b3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1664,10 +1664,18 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + sock_hold(sk); } - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { -- cgit v1.2.3-58-ga151 From e061047684af63f2d4f1338ec73140f6e29eb59f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:21 +0900 Subject: bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT. We will call sock_reuseport.prog for socket migration in the next commit, so the eBPF program has to know which listener is closing to select a new listener. We can currently get a unique ID of each listener in the userspace by calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map. This patch makes the pointer of sk available in sk_reuseport_md so that we can get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program. Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f7zc@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-9-kuniyu@amazon.co.jp --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 10 ++++++++++ tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2c1ba70abbf1..f3b72588442b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5416,6 +5416,7 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + __bpf_md_ptr(struct bpf_sock *, sk); }; #define BPF_TAG_SIZE 8 diff --git a/net/core/filter.c b/net/core/filter.c index caa88955562e..f753ab550525 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10172,6 +10172,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -10201,6 +10203,10 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, hash): return size == size_default; + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10273,6 +10279,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2c1ba70abbf1..f3b72588442b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5416,6 +5416,7 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + __bpf_md_ptr(struct bpf_sock *, sk); }; #define BPF_TAG_SIZE 8 -- cgit v1.2.3-58-ga151 From d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 12 Jun 2021 21:32:22 +0900 Subject: bpf: Support socket migration by eBPF. This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to check if the attached eBPF program is capable of migrating sockets. When the eBPF program is attached, we run it for socket migration if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or net.ipv4.tcp_migrate_req is enabled. Currently, the expected_attach_type is not enforced for the BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type(). Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to select a new listener based on the child socket. migrating_sk varies depending on if it is migrating a request in the accept queue or during 3WHS. - accept_queue : sock (ESTABLISHED/SYN_RECV) - 3WHS : request_sock (NEW_SYN_RECV) In the eBPF program, we can select a new listener by BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning SK_DROP. This feature is useful when listeners have different settings at the socket API level or when we want to free resources as soon as possible. - SK_PASS with selected_sk, select it as a new listener - SK_PASS with selected_sk NULL, fallbacks to the random selection - SK_DROP, cancel the migration. There is a noteworthy point. We select a listening socket in three places, but we do not have struct skb at closing a listener or retransmitting a SYN+ACK. On the other hand, some helper functions do not expect skb is NULL (e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb temporarily before running the eBPF program. Suggested-by: Martin KaFai Lau Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/ Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp --- include/linux/bpf.h | 1 + include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/syscall.c | 13 +++++++++++++ net/core/filter.c | 13 ++++++++++++- net/core/sock_reuseport.c | 34 ++++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 15 +++++++++++++++ 7 files changed, 88 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 86dec5001ae2..f309fc1509f2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2048,6 +2048,7 @@ struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; + struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; diff --git a/include/linux/filter.h b/include/linux/filter.h index c5ad7df029ed..688856e0b28a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -996,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { return NULL; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b72588442b..bf9252c7381e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -994,6 +994,8 @@ enum bpf_attach_type { BPF_SK_LOOKUP, BPF_XDP, BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -5416,7 +5418,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50457019da27..dbbc5342f221 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1972,6 +1972,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; + case BPF_PROG_TYPE_SK_REUSEPORT: + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_SK_REUSEPORT_SELECT; + break; } } @@ -2055,6 +2060,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; + case BPF_PROG_TYPE_SK_REUSEPORT: + switch (expected_attach_type) { + case BPF_SK_REUSEPORT_SELECT: + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) diff --git a/net/core/filter.c b/net/core/filter.c index f753ab550525..5b86e47ef079 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10044,11 +10044,13 @@ out: static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; @@ -10057,12 +10059,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; - bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = BPF_PROG_RUN(prog, &reuse_kern); if (action == SK_PASS) @@ -10207,6 +10210,10 @@ sk_reuseport_is_valid_access(int off, int size, info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10283,6 +10290,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; } return insn - insn_buf; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index b239f8cd9d39..de5ee3ae86d5 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -377,13 +377,17 @@ void reuseport_stop_listen_sock(struct sock *sk) { if (sk->sk_protocol == IPPROTO_TCP) { struct sock_reuseport *reuse; + struct bpf_prog *prog; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. */ @@ -488,7 +492,7 @@ struct sock *reuseport_select_sock(struct sock *sk, goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); @@ -519,6 +523,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk, { struct sock_reuseport *reuse; struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; u16 socks; u32 hash; @@ -536,10 +542,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk, smp_rmb(); hash = migrating_sk->sk_hash; - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + goto select_by_hash; + goto out; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); - if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) nsk = NULL; out: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b72588442b..bf9252c7381e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -994,6 +994,8 @@ enum bpf_attach_type { BPF_SK_LOOKUP, BPF_XDP, BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, __MAX_BPF_ATTACH_TYPE }; @@ -5416,7 +5418,20 @@ struct sk_reuseport_md { __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 -- cgit v1.2.3-58-ga151 From 0dca2c7404a938cb10c85d0515cee40ed5348788 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Mon, 14 Jun 2021 14:13:22 +0300 Subject: net/sched: cls_flower: Remove match on n_proto The following flower filters fail to match packets: tc filter add dev eth0 ingress protocol 0x8864 flower \ action simple sdata hi64 tc filter add dev eth0 ingress protocol 802.1q flower \ vlan_ethtype 0x8864 action simple sdata "hi vlan" The protocol 0x8864 (ETH_P_PPP_SES) is a tunnel protocol. As such, it is being dissected by __skb_flow_dissect and it's internal protocol is being set as key->basic.n_proto. IOW, the existence of ETH_P_PPP_SES tunnel is transparent to the callers of __skb_flow_dissect. OTOH, in the filters above, cls_flower configures its key->basic.n_proto to the ETH_P_PPP_SES value configured by the user. Matching on this key fails because of __skb_flow_dissect "transparency" mentioned above. In the following, I would argue that the problem lies with cls_flower, unnessary attempting key->basic.n_proto match. There are 3 close places in fl_set_key in cls_flower setting up mask->basic.n_proto. They are (in reverse order of appearance in the code) due to: (a) No vlan is given: use TCA_FLOWER_KEY_ETH_TYPE parameter (b) One vlan tag is given: use TCA_FLOWER_KEY_VLAN_ETH_TYPE (c) Two vlans are given: use TCA_FLOWER_KEY_CVLAN_ETH_TYPE The match in case (a) is unneeded because flower has no its own eth_type parameter. It was removed by Jamal Hadi Salim in commit 488b41d020fb06428b90289f70a41210718f52b7 in iproute2. For TCA_FLOWER_KEY_ETH_TYPE the userspace uses the generic tc filter protocol field. Therefore the match for the case (a) is done by tc itself. The matches in cases (b), (c) are unneeded because the protocol will appear in and will be matched by flow_dissector_key_vlan.vlan_tpid. Therefore in the best case, key->basic.n_proto will try to repeat vlan key match again. The below patch removes mask->basic.n_proto setting and resets it to 0 in case (c). Signed-off-by: Boris Sukholitko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d7869a984881..2e704c7a105a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1531,14 +1531,13 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto)); + mask->basic.n_proto = cpu_to_be16(0); } else { key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); } } } else { key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); } } -- cgit v1.2.3-58-ga151 From b8f6b0522c298ae9267bd6584e19b942a0636910 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 15 Jun 2021 10:14:44 +0800 Subject: netlabel: Fix memory leak in netlbl_mgmt_add_common Hulk Robot reported memory leak in netlbl_mgmt_add_common. The problem is non-freed map in case of netlbl_domhsh_add() failed. BUG: memory leak unreferenced object 0xffff888100ab7080 (size 96): comm "syz-executor537", pid 360, jiffies 4294862456 (age 22.678s) hex dump (first 32 bytes): 05 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ fe 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 ................ backtrace: [<0000000008b40026>] netlbl_mgmt_add_common.isra.0+0xb2a/0x1b40 [<000000003be10950>] netlbl_mgmt_add+0x271/0x3c0 [<00000000c70487ed>] genl_family_rcv_msg_doit.isra.0+0x20e/0x320 [<000000001f2ff614>] genl_rcv_msg+0x2bf/0x4f0 [<0000000089045792>] netlink_rcv_skb+0x134/0x3d0 [<0000000020e96fdd>] genl_rcv+0x24/0x40 [<0000000042810c66>] netlink_unicast+0x4a0/0x6a0 [<000000002e1659f0>] netlink_sendmsg+0x789/0xc70 [<000000006e43415f>] sock_sendmsg+0x139/0x170 [<00000000680a73d7>] ____sys_sendmsg+0x658/0x7d0 [<0000000065cbb8af>] ___sys_sendmsg+0xf8/0x170 [<0000000019932b6c>] __sys_sendmsg+0xd3/0x190 [<00000000643ac172>] do_syscall_64+0x37/0x90 [<000000009b79d6dc>] entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 63c416887437 ("netlabel: Add network address selectors to the NetLabel/LSM domain mapping") Reported-by: Hulk Robot Signed-off-by: Liu Shixin Signed-off-by: David S. Miller --- net/netlabel/netlabel_mgmt.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index e664ab990941..032b7d7b32c7 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -76,6 +76,7 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { static int netlbl_mgmt_add_common(struct genl_info *info, struct netlbl_audit *audit_info) { + void *pmap = NULL; int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; @@ -175,6 +176,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = addr->s_addr & mask->s_addr; map->list.mask = mask->s_addr; map->list.valid = 1; @@ -183,10 +185,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.cipso = cipsov4; ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -223,6 +223,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; @@ -235,10 +236,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -248,10 +247,12 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) - goto add_free_addrmap; + goto add_free_map; return 0; +add_free_map: + kfree(pmap); add_free_addrmap: kfree(addrmap); add_doi_put_def: -- cgit v1.2.3-58-ga151 From 30ad6a84f60bdaa32ef5091125299d0d96a330fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Jun 2021 16:27:20 +0200 Subject: xfrm: avoid compiler warning when ipv6 is disabled with CONFIG_IPV6=n: xfrm_output.c:140:12: warning: 'xfrm6_hdr_offset' defined but not used Fixes: 9acf4d3b9ec1 ("xfrm: ipv6: add xfrm6_hdr_offset helper") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e14fca1fb003..0b2975ef0668 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -137,6 +137,7 @@ static int mip6_rthdr_offset(struct sk_buff *skb, u8 **nexthdr, int type) } #endif +#if IS_ENABLED(CONFIG_IPV6) static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) { switch (x->type->proto) { @@ -151,6 +152,7 @@ static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prev return ip6_find_1stfragopt(skb, prevhdr); } +#endif /* Add encapsulation header. * -- cgit v1.2.3-58-ga151 From 1b3fc771769c9f9418b23dd5676ab25a215d247d Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 15 Jun 2021 23:06:04 -0700 Subject: inet_diag: add support for tw_mark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Timewait sockets have included mark since approx 4.18. Cc: Eric Dumazet Cc: Jon Maxwell Fixes: 00483690552c ("tcp: Add mark for TIMEWAIT sockets") Signed-off-by: Maciej Żenczykowski Reviewed-by: Eric Dumazet Reviewed-by: Jon Maxwell Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 93474b1bea4e..e65f4ef024a4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - u16 nlmsg_flags) + u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; @@ -444,6 +444,12 @@ static int inet_twsk_diag_fill(struct sock *sk, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + nlmsg_end(skb, nlh); return 0; } @@ -494,7 +500,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); @@ -801,6 +807,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = sk->sk_mark; else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA -- cgit v1.2.3-58-ga151 From 4d1fb7cde0ccc6000cafb72d9079de1504e3cb2a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Jun 2021 20:33:38 -0700 Subject: ethtool: add a stricter length check There has been a few errors in the ethtool reply size calculations, most of those are hard to trigger during basic testing because of skb size rounding up and netdev names being shorter than max. Add a more precise check. This change will affect the value of payload length displayed in case of -EMSGSIZE but that should be okay, "payload length" isn't a well defined term here. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88d8a0243f35..a7346346114f 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -315,9 +315,9 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; + int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; - int reply_len; int ret; ops = ethnl_default_requests[cmd]; @@ -346,15 +346,20 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; - reply_len = ret + ethnl_reply_header_size(); + reply_len = ret; ret = -ENOMEM; - rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), + req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; + hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; + WARN_ONCE(rskb->len - hdr_len > reply_len, + "ethnl cmd %d: calculated reply length %d, but consumed %d\n", + cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); -- cgit v1.2.3-58-ga151 From cdd73cc545c0fb9b1a1f7b209f4f536e7990cff4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 10 Jun 2021 20:20:30 +0200 Subject: netfilter: nft_exthdr: check for IPv6 packet before further processing ipv6_find_hdr() does not validate that this is an IPv6 packet. Add a sanity check for calling ipv6_find_hdr() to make sure an IPv6 packet is passed for parsing. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index f64f0017e9a5..670dd146fb2b 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -42,6 +42,9 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, unsigned int offset = 0; int err; + if (pkt->skb->protocol != htons(ETH_P_IPV6)) + goto err; + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); -- cgit v1.2.3-58-ga151 From 8f518d43f89ae00b9cf5460e10b91694944ca1a8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 10 Jun 2021 20:20:31 +0200 Subject: netfilter: nft_osf: check for TCP packet before further processing The osf expression only supports for TCP packets, add a upfront sanity check to skip packet parsing if this is not a TCP packet. Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Pablo Neira Ayuso Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ac61f708b82d..d82677e83400 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nf_osf_data data; struct tcphdr _tcph; + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { -- cgit v1.2.3-58-ga151 From 52f0f4e178c757b3d356087376aad8bd77271828 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 11 Jun 2021 19:26:56 +0200 Subject: netfilter: nft_tproxy: restrict support to TCP and UDP transport protocols Add unfront check for TCP and UDP packets before performing further processing. Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index accef672088c..5cb4d575d47f 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { + regs->verdict.code = NFT_BREAK; + return; + } + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; @@ -91,7 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, memset(&taddr, 0, sizeof(taddr)); - if (!pkt->tprot_set) { + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } -- cgit v1.2.3-58-ga151 From c2ae34a7deaff463ecafb7db627b77faaca8e159 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 15 Jun 2021 12:50:37 -0500 Subject: net: hsr: don't check sequence number if tag removal is offloaded Don't check the sequence number when deciding when to update time_in in the node table if tag removal is offloaded since the sequence number is part of the tag. This fixes a problem where the times in the node table wouldn't update when 0 appeared to be before or equal to seq_out when tag removal was offloaded. Signed-off-by: George McCollister Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index bb1351c38397..e31949479305 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -397,7 +397,8 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ - if (seq_nr_before(sequence_nr, node->seq_out[port->type])) + if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; -- cgit v1.2.3-58-ga151 From e0e4b8fa533858532f1b9ea9c6a4660d09beb37a Mon Sep 17 00:00:00 2001 From: Guvenc Gulce Date: Wed, 16 Jun 2021 16:52:55 +0200 Subject: net/smc: Add SMC statistics support Add the ability to collect SMC statistics information. Per-cpu variables are used to collect the statistic information for better performance and for reducing concurrency pitfalls. The code that is collecting statistic data is implemented in macros to increase code reuse and readability. Signed-off-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/Makefile | 2 +- net/smc/af_smc.c | 89 ++++++++++++++---- net/smc/smc_core.c | 13 ++- net/smc/smc_rx.c | 8 ++ net/smc/smc_stats.c | 35 ++++++++ net/smc/smc_stats.h | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_tx.c | 16 +++- 7 files changed, 395 insertions(+), 21 deletions(-) create mode 100644 net/smc/smc_stats.c create mode 100644 net/smc/smc_stats.h (limited to 'net') diff --git a/net/smc/Makefile b/net/smc/Makefile index 77e54fe42b1c..99a0186cba5b 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5eff7cccceff..efeaed384769 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -49,6 +49,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_stats.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -508,9 +509,42 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->r0.qp_mtu; } -static void smc_switch_to_fallback(struct smc_sock *smc) +static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, + struct smc_stats_fback *fback_arr) +{ + int cnt; + + for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { + if (fback_arr[cnt].fback_code == smc->fallback_rsn) { + fback_arr[cnt].count++; + break; + } + if (!fback_arr[cnt].fback_code) { + fback_arr[cnt].fback_code = smc->fallback_rsn; + fback_arr[cnt].count++; + break; + } + } +} + +static void smc_stat_fallback(struct smc_sock *smc) +{ + mutex_lock(&smc_stat_fback_rsn); + if (smc->listen_smc) { + smc_stat_inc_fback_rsn_cnt(smc, fback_rsn.srv); + fback_rsn.srv_fback_cnt++; + } else { + smc_stat_inc_fback_rsn_cnt(smc, fback_rsn.clnt); + fback_rsn.clnt_fback_cnt++; + } + mutex_unlock(&smc_stat_fback_rsn); +} + +static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { smc->use_fallback = true; + smc->fallback_rsn = reason_code; + smc_stat_fallback(smc); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; @@ -522,8 +556,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc) /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = reason_code; + smc_switch_to_fallback(smc, reason_code); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -538,6 +571,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, int rc; if (reason_code < 0) { /* error, fallback is not possible */ + this_cpu_inc(smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; @@ -545,6 +579,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { + this_cpu_inc(smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; @@ -992,6 +1027,7 @@ static int __smc_connect(struct smc_sock *smc) if (rc) goto vlan_cleanup; + SMC_STAT_CLNT_SUCC_INC(aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); @@ -1308,6 +1344,7 @@ static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + this_cpu_inc(smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; @@ -1325,8 +1362,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = reason_code; + smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1699,8 +1735,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); smc_listen_out_connected(new_smc); return; } @@ -1778,6 +1813,7 @@ static void smc_listen_work(struct work_struct *work) } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); + SMC_STAT_SERV_SUCC_INC(ini); goto out_free; out_unlock: @@ -1984,18 +2020,19 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; goto out; } } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); - else + } else { rc = smc_tx_sendmsg(smc, msg, len); + SMC_STAT_TX_PAYLOAD(smc, len, rc); + } out: release_sock(sk); return rc; @@ -2030,6 +2067,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: @@ -2194,8 +2232,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2204,18 +2241,22 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val) + if (val) { + SMC_STAT_INC(!smc->conn.lnk, ndly_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val) + if (!val) { + SMC_STAT_INC(!smc->conn.lnk, cork_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_DEFER_ACCEPT: @@ -2338,11 +2379,13 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, goto out; } release_sock(sk); - if (smc->use_fallback) + if (smc->use_fallback) { rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); - else + } else { + SMC_STAT_INC(!smc->conn.lnk, sendpage_cnt); rc = sock_no_sendpage(sock, page, offset, size, flags); + } out: return rc; @@ -2391,6 +2434,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, flags = MSG_DONTWAIT; else flags = 0; + SMC_STAT_INC(!smc->conn.lnk, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: @@ -2514,10 +2558,16 @@ static int __init smc_init(void) if (!smc_close_wq) goto out_alloc_hs_wq; + rc = smc_stats_init(); + if (rc) { + pr_err("%s: smc_stats_init fails with %d\n", __func__, rc); + goto out_alloc_wqs; + } + rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); - goto out_alloc_wqs; + goto out_smc_stat; } rc = smc_llc_init(); @@ -2569,6 +2619,8 @@ out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); +out_smc_stat: + smc_stats_exit(); out_alloc_wqs: destroy_workqueue(smc_close_wq); out_alloc_hs_wq: @@ -2591,6 +2643,7 @@ static void __exit smc_exit(void) smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_hs_wq); + smc_stats_exit(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 317bc2c90fab..d69f58f670a1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -33,6 +33,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" +#include "smc_stats.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -2029,6 +2030,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + bool is_dgraded = false; struct mutex *lock; /* lock buffer list */ int sk_buf_size; @@ -2056,6 +2058,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { + SMC_STAT_RMB_SIZE(is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(is_smcd, is_rmb); memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } @@ -2067,9 +2071,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (PTR_ERR(buf_desc) == -ENOMEM) break; - if (IS_ERR(buf_desc)) + if (IS_ERR(buf_desc)) { + if (!is_dgraded) { + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(is_smcd, is_rmb); + } continue; + } + SMC_STAT_RMB_ALLOC(is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(is_smcd, is_rmb, bufsize); buf_desc->used = 1; mutex_lock(lock); list_add(&buf_desc->list, buf_list); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index fcfac59f8b72..ce1ae39923b1 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -21,6 +21,7 @@ #include "smc_cdc.h" #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" +#include "smc_stats.h" /* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). @@ -227,6 +228,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, conn->urg_state == SMC_URG_READ) return -EINVAL; + SMC_STAT_INC(!conn->lnk, urg_data_cnt); if (conn->urg_state == SMC_URG_VALID) { if (!(flags & MSG_PEEK)) smc->conn.urg_state = SMC_URG_READ; @@ -303,6 +305,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + readable = atomic_read(&conn->bytes_to_rcv); + if (readable >= conn->rmb_desc->len) + SMC_STAT_RMB_RX_FULL(!conn->lnk); + + if (len < readable) + SMC_STAT_RMB_RX_SIZE_SMALL(!conn->lnk); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c new file mode 100644 index 000000000000..76e938388520 --- /dev/null +++ b/net/smc/smc_stats.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC statistics netlink routines + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ +#include +#include +#include +#include +#include "smc_stats.h" + +/* serialize fallback reason statistic gathering */ +DEFINE_MUTEX(smc_stat_fback_rsn); +struct smc_stats __percpu *smc_stats; /* per cpu counters for SMC */ +struct smc_stats_reason fback_rsn; + +int __init smc_stats_init(void) +{ + memset(&fback_rsn, 0, sizeof(fback_rsn)); + smc_stats = alloc_percpu(struct smc_stats); + if (!smc_stats) + return -ENOMEM; + + return 0; +} + +void smc_stats_exit(void) +{ + free_percpu(smc_stats); +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h new file mode 100644 index 000000000000..928372114cf1 --- /dev/null +++ b/net/smc/smc_stats.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Macros for SMC statistics + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ + +#ifndef NET_SMC_SMC_STATS_H_ +#define NET_SMC_SMC_STATS_H_ +#include +#include +#include +#include +#include + +#include "smc_clc.h" + +#define SMC_MAX_FBACK_RSN_CNT 30 + +extern struct smc_stats __percpu *smc_stats; /* per cpu counters for SMC */ +extern struct smc_stats_reason fback_rsn; +extern struct mutex smc_stat_fback_rsn; + +enum { + SMC_BUF_8K, + SMC_BUF_16K, + SMC_BUF_32K, + SMC_BUF_64K, + SMC_BUF_128K, + SMC_BUF_256K, + SMC_BUF_512K, + SMC_BUF_1024K, + SMC_BUF_G_1024K, + SMC_BUF_MAX, +}; + +struct smc_stats_fback { + int fback_code; + u16 count; +}; + +struct smc_stats_reason { + struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; + struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; + u64 srv_fback_cnt; + u64 clnt_fback_cnt; +}; + +struct smc_stats_rmbcnt { + u64 buf_size_small_peer_cnt; + u64 buf_size_small_cnt; + u64 buf_full_peer_cnt; + u64 buf_full_cnt; + u64 reuse_cnt; + u64 alloc_cnt; + u64 dgrade_cnt; +}; + +struct smc_stats_memsize { + u64 buf[SMC_BUF_MAX]; +}; + +struct smc_stats_tech { + struct smc_stats_memsize tx_rmbsize; + struct smc_stats_memsize rx_rmbsize; + struct smc_stats_memsize tx_pd; + struct smc_stats_memsize rx_pd; + struct smc_stats_rmbcnt rmb_tx; + struct smc_stats_rmbcnt rmb_rx; + u64 clnt_v1_succ_cnt; + u64 clnt_v2_succ_cnt; + u64 srv_v1_succ_cnt; + u64 srv_v2_succ_cnt; + u64 sendpage_cnt; + u64 urg_data_cnt; + u64 splice_cnt; + u64 cork_cnt; + u64 ndly_cnt; + u64 rx_bytes; + u64 tx_bytes; + u64 rx_cnt; + u64 tx_cnt; +}; + +struct smc_stats { + struct smc_stats_tech smc[2]; + u64 clnt_hshake_err_cnt; + u64 srv_hshake_err_cnt; +}; + +#define SMC_STAT_PAYLOAD_SUB(_tech, key, _len, _rc) \ +do { \ + typeof(_tech) t = (_tech); \ + typeof(_len) l = (_len); \ + int _pos = fls64((l) >> 13); \ + typeof(_rc) r = (_rc); \ + int m = SMC_BUF_MAX - 1; \ + this_cpu_inc((*smc_stats).smc[t].key ## _cnt); \ + if (r <= 0) \ + break; \ + _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*smc_stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*smc_stats).smc[t].key ## _bytes, r); \ +} \ +while (0) + +#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(SMC_TYPE_D, tx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(SMC_TYPE_R, tx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(SMC_TYPE_D, rx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(SMC_TYPE_R, rx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RMB_SIZE_SUB(_tech, k, _len) \ +do { \ + typeof(_len) _l = (_len); \ + typeof(_tech) t = (_tech); \ + int _pos = fls((_l) >> 13); \ + int m = SMC_BUF_MAX - 1; \ + _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*smc_stats).smc[t].k ## _rmbsize.buf[_pos]); \ +} \ +while (0) + +#define SMC_STAT_RMB_SUB(type, t, key) \ + this_cpu_inc((*smc_stats).smc[t].rmb ## _ ## key.type ## _cnt) + +#define SMC_STAT_RMB_SIZE(_is_smcd, _is_rx, _len) \ +do { \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + typeof(_len) l = (_len); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_D, rx, l); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_D, tx, l); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_R, rx, l); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_R, tx, l); \ +} \ +while (0) + +#define SMC_STAT_RMB(type, _is_smcd, _is_rx) \ +do { \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(type, SMC_TYPE_D, rx); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(type, SMC_TYPE_D, tx); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(type, SMC_TYPE_R, rx); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(type, SMC_TYPE_R, tx); \ +} \ +while (0) + +#define SMC_STAT_BUF_REUSE(is_smcd, is_rx) \ + SMC_STAT_RMB(reuse, is_smcd, is_rx) + +#define SMC_STAT_RMB_ALLOC(is_smcd, is_rx) \ + SMC_STAT_RMB(alloc, is_smcd, is_rx) + +#define SMC_STAT_RMB_DOWNGRADED(is_smcd, is_rx) \ + SMC_STAT_RMB(dgrade, is_smcd, is_rx) + +#define SMC_STAT_RMB_TX_PEER_FULL(is_smcd) \ + SMC_STAT_RMB(buf_full_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_FULL(is_smcd) \ + SMC_STAT_RMB(buf_full, is_smcd, false) + +#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(is_smcd) \ + SMC_STAT_RMB(buf_size_small_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_SIZE_SMALL(is_smcd) \ + SMC_STAT_RMB(buf_size_small, is_smcd, false) + +#define SMC_STAT_RMB_RX_SIZE_SMALL(is_smcd) \ + SMC_STAT_RMB(buf_size_small, is_smcd, true) + +#define SMC_STAT_RMB_RX_FULL(is_smcd) \ + SMC_STAT_RMB(buf_full, is_smcd, true) + +#define SMC_STAT_INC(is_smcd, type) \ +do { \ + if ((is_smcd)) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ + else \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \ +} \ +while (0) + +#define SMC_STAT_CLNT_SUCC_INC(_aclc) \ +do { \ + typeof(_aclc) acl = (_aclc); \ + bool is_v2 = (acl->hdr.version == SMC_V2); \ + bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \ +} \ +while (0) + +#define SMC_STAT_SERV_SUCC_INC(_ini) \ +do { \ + typeof(_ini) i = (_ini); \ + bool is_v2 = (i->smcd_version & SMC_V2); \ + bool is_smcd = (i->is_smcd); \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \ +} \ +while (0) + +int smc_stats_init(void) __init; +void smc_stats_exit(void); + +#endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4532c16bf85e..a043544d715f 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -27,6 +27,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" +#include "smc_stats.h" #define SMC_TX_WORK_DELAY 0 #define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ @@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk) /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) + SMC_STAT_RMB_TX_FULL(!smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); @@ -151,6 +154,15 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) goto out_err; } + if (len > conn->sndbuf_desc->len) + SMC_STAT_RMB_TX_SIZE_SMALL(!conn->lnk); + + if (len > conn->peer_rmbe_size) + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(!conn->lnk); + + if (msg->msg_flags & MSG_OOB) + SMC_STAT_INC(!conn->lnk, urg_data_cnt); + while (msg_data_left(msg)) { if (sk->sk_state == SMC_INIT) return -ENOTCONN; @@ -419,8 +431,10 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); - if (rmbespace <= 0) + if (rmbespace <= 0) { + SMC_STAT_RMB_TX_PEER_FULL(!conn->lnk); return 0; + } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); -- cgit v1.2.3-58-ga151 From 8c40602b4be17571dfd75102f4f1e690311c5210 Mon Sep 17 00:00:00 2001 From: Guvenc Gulce Date: Wed, 16 Jun 2021 16:52:56 +0200 Subject: net/smc: Add netlink support for SMC statistics Add the netlink function which collects the statistics information and delivers it to the userspace. Signed-off-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 69 ++++++++++++ net/smc/smc_netlink.c | 6 + net/smc/smc_stats.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_stats.h | 1 + 4 files changed, 355 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 3e68da07fba2..f32f11b30963 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -47,6 +47,7 @@ enum { SMC_NETLINK_GET_LGR_SMCD, SMC_NETLINK_GET_DEV_SMCD, SMC_NETLINK_GET_DEV_SMCR, + SMC_NETLINK_GET_STATS, }; /* SMC_GENL_FAMILY top level attributes */ @@ -58,6 +59,7 @@ enum { SMC_GEN_LGR_SMCD, /* nest */ SMC_GEN_DEV_SMCD, /* nest */ SMC_GEN_DEV_SMCR, /* nest */ + SMC_GEN_STATS, /* nest */ __SMC_GEN_MAX, SMC_GEN_MAX = __SMC_GEN_MAX - 1 }; @@ -159,4 +161,71 @@ enum { SMC_NLA_DEV_MAX = __SMC_NLA_DEV_MAX - 1 }; +/* SMC_NLA_STATS_T_TX(RX)_RMB_SIZE nested attributes */ +/* SMC_NLA_STATS_TX(RX)PLOAD_SIZE nested attributes */ +enum { + SMC_NLA_STATS_PLOAD_PAD, + SMC_NLA_STATS_PLOAD_8K, /* u64 */ + SMC_NLA_STATS_PLOAD_16K, /* u64 */ + SMC_NLA_STATS_PLOAD_32K, /* u64 */ + SMC_NLA_STATS_PLOAD_64K, /* u64 */ + SMC_NLA_STATS_PLOAD_128K, /* u64 */ + SMC_NLA_STATS_PLOAD_256K, /* u64 */ + SMC_NLA_STATS_PLOAD_512K, /* u64 */ + SMC_NLA_STATS_PLOAD_1024K, /* u64 */ + SMC_NLA_STATS_PLOAD_G_1024K, /* u64 */ + __SMC_NLA_STATS_PLOAD_MAX, + SMC_NLA_STATS_PLOAD_MAX = __SMC_NLA_STATS_PLOAD_MAX - 1 +}; + +/* SMC_NLA_STATS_T_TX(RX)_RMB_STATS nested attributes */ +enum { + SMC_NLA_STATS_RMB_PAD, + SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, /* u64 */ + SMC_NLA_STATS_RMB_SIZE_SM_CNT, /* u64 */ + SMC_NLA_STATS_RMB_FULL_PEER_CNT, /* u64 */ + SMC_NLA_STATS_RMB_FULL_CNT, /* u64 */ + SMC_NLA_STATS_RMB_REUSE_CNT, /* u64 */ + SMC_NLA_STATS_RMB_ALLOC_CNT, /* u64 */ + SMC_NLA_STATS_RMB_DGRADE_CNT, /* u64 */ + __SMC_NLA_STATS_RMB_MAX, + SMC_NLA_STATS_RMB_MAX = __SMC_NLA_STATS_RMB_MAX - 1 +}; + +/* SMC_NLA_STATS_SMCD_TECH and _SMCR_TECH nested attributes */ +enum { + SMC_NLA_STATS_T_PAD, + SMC_NLA_STATS_T_TX_RMB_SIZE, /* nest */ + SMC_NLA_STATS_T_RX_RMB_SIZE, /* nest */ + SMC_NLA_STATS_T_TXPLOAD_SIZE, /* nest */ + SMC_NLA_STATS_T_RXPLOAD_SIZE, /* nest */ + SMC_NLA_STATS_T_TX_RMB_STATS, /* nest */ + SMC_NLA_STATS_T_RX_RMB_STATS, /* nest */ + SMC_NLA_STATS_T_CLNT_V1_SUCC, /* u64 */ + SMC_NLA_STATS_T_CLNT_V2_SUCC, /* u64 */ + SMC_NLA_STATS_T_SRV_V1_SUCC, /* u64 */ + SMC_NLA_STATS_T_SRV_V2_SUCC, /* u64 */ + SMC_NLA_STATS_T_SENDPAGE_CNT, /* u64 */ + SMC_NLA_STATS_T_SPLICE_CNT, /* u64 */ + SMC_NLA_STATS_T_CORK_CNT, /* u64 */ + SMC_NLA_STATS_T_NDLY_CNT, /* u64 */ + SMC_NLA_STATS_T_URG_DATA_CNT, /* u64 */ + SMC_NLA_STATS_T_RX_BYTES, /* u64 */ + SMC_NLA_STATS_T_TX_BYTES, /* u64 */ + SMC_NLA_STATS_T_RX_CNT, /* u64 */ + SMC_NLA_STATS_T_TX_CNT, /* u64 */ + __SMC_NLA_STATS_T_MAX, + SMC_NLA_STATS_T_MAX = __SMC_NLA_STATS_T_MAX - 1 +}; + +/* SMC_GEN_STATS attributes */ +enum { + SMC_NLA_STATS_PAD, + SMC_NLA_STATS_SMCD_TECH, /* nest */ + SMC_NLA_STATS_SMCR_TECH, /* nest */ + SMC_NLA_STATS_CLNT_HS_ERR_CNT, /* u64 */ + SMC_NLA_STATS_SRV_HS_ERR_CNT, /* u64 */ + __SMC_NLA_STATS_MAX, + SMC_NLA_STATS_MAX = __SMC_NLA_STATS_MAX - 1 +}; #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 140419a19dbf..30e30b23370f 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -19,6 +19,7 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_ib.h" +#include "smc_stats.h" #include "smc_netlink.h" #define SMC_CMD_MAX_ATTR 1 @@ -55,6 +56,11 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smcr_nl_get_device, }, + { + .cmd = SMC_NETLINK_GET_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_stats, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index 76e938388520..72119d3d8558 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -12,6 +12,10 @@ #include #include #include +#include +#include +#include +#include "smc_netlink.h" #include "smc_stats.h" /* serialize fallback reason statistic gathering */ @@ -33,3 +37,278 @@ void smc_stats_exit(void) { free_percpu(smc_stats); } + +static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_rmbcnt *stats_rmb_cnt; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TX_RMB_STATS) + stats_rmb_cnt = &stats->smc[tech].rmb_tx; + else + stats_rmb_cnt = &stats->smc[tech].rmb_rx; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, + stats_rmb_cnt->reuse_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, + stats_rmb_cnt->buf_size_small_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, + stats_rmb_cnt->buf_size_small_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, + stats_rmb_cnt->buf_full_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, + stats_rmb_cnt->buf_full_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, + stats_rmb_cnt->alloc_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, + stats_rmb_cnt->dgrade_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_memsize *stats_pload; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) + stats_pload = &stats->smc[tech].tx_pd; + else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) + stats_pload = &stats->smc[tech].rx_pd; + else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) + stats_pload = &stats->smc[tech].tx_rmbsize; + else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) + stats_pload = &stats->smc[tech].rx_rmbsize; + else + goto errout; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, + stats_pload->buf[SMC_BUF_8K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, + stats_pload->buf[SMC_BUF_16K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, + stats_pload->buf[SMC_BUF_32K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, + stats_pload->buf[SMC_BUF_64K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, + stats_pload->buf[SMC_BUF_128K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, + stats_pload->buf[SMC_BUF_256K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, + stats_pload->buf[SMC_BUF_512K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, + stats_pload->buf[SMC_BUF_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, + stats_pload->buf[SMC_BUF_G_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, + struct smc_stats *stats, int tech) +{ + struct smc_stats_tech *smc_tech; + struct nlattr *attrs; + + smc_tech = &stats->smc[tech]; + if (tech == SMC_TYPE_D) + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); + else + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); + + if (!attrs) + goto errout; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_SIZE)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, + smc_tech->clnt_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, + smc_tech->clnt_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, + smc_tech->srv_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, + smc_tech->srv_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, + smc_tech->rx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, + smc_tech->tx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, + smc_tech->rx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, + smc_tech->tx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, + smc_tech->sendpage_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, + smc_tech->cork_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, + smc_tech->ndly_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, + smc_tech->splice_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, + smc_tech->urg_data_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +int smc_nl_get_stats(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_stats *stats; + struct nlattr *attrs; + int cpu, i, size; + void *nlh; + u64 *src; + u64 *sum; + + if (cb_ctx->pos[0]) + goto errmsg; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_STATS); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_STATS); + if (!attrs) + goto errnest; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + goto erralloc; + size = sizeof(*stats) / sizeof(u64); + for_each_possible_cpu(cpu) { + src = (u64 *)per_cpu_ptr(smc_stats, cpu); + sum = (u64 *)stats; + for (i = 0; i < size; i++) + *(sum++) += *(src++); + } + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) + goto errattr; + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, + stats->clnt_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, + stats->srv_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + kfree(stats); + return skb->len; + +errattr: + kfree(stats); +erralloc: + nla_nest_cancel(skb, attrs); +errnest: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 928372114cf1..84baaca59eaf 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -247,6 +247,7 @@ do { \ } \ while (0) +int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); int smc_stats_init(void) __init; void smc_stats_exit(void); -- cgit v1.2.3-58-ga151 From f0dd7bf5e33066e554442c509ef6351728b95b51 Mon Sep 17 00:00:00 2001 From: Guvenc Gulce Date: Wed, 16 Jun 2021 16:52:57 +0200 Subject: net/smc: Add netlink support for SMC fallback statistics Add support to collect more detailed SMC fallback reason statistics and provide these statistics to user space on the netlink interface. Signed-off-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 14 ++++++++ net/smc/smc_netlink.c | 5 +++ net/smc/smc_netlink.h | 2 +- net/smc/smc_stats.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_stats.h | 1 + 5 files changed, 113 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index f32f11b30963..0f7f87c70baf 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -48,6 +48,7 @@ enum { SMC_NETLINK_GET_DEV_SMCD, SMC_NETLINK_GET_DEV_SMCR, SMC_NETLINK_GET_STATS, + SMC_NETLINK_GET_FBACK_STATS, }; /* SMC_GENL_FAMILY top level attributes */ @@ -60,6 +61,7 @@ enum { SMC_GEN_DEV_SMCD, /* nest */ SMC_GEN_DEV_SMCR, /* nest */ SMC_GEN_STATS, /* nest */ + SMC_GEN_FBACK_STATS, /* nest */ __SMC_GEN_MAX, SMC_GEN_MAX = __SMC_GEN_MAX - 1 }; @@ -228,4 +230,16 @@ enum { __SMC_NLA_STATS_MAX, SMC_NLA_STATS_MAX = __SMC_NLA_STATS_MAX - 1 }; + +/* SMC_GEN_FBACK_STATS attributes */ +enum { + SMC_NLA_FBACK_STATS_PAD, + SMC_NLA_FBACK_STATS_TYPE, /* u8 */ + SMC_NLA_FBACK_STATS_SRV_CNT, /* u64 */ + SMC_NLA_FBACK_STATS_CLNT_CNT, /* u64 */ + SMC_NLA_FBACK_STATS_RSN_CODE, /* u32 */ + SMC_NLA_FBACK_STATS_RSN_CNT, /* u16 */ + __SMC_NLA_FBACK_STATS_MAX, + SMC_NLA_FBACK_STATS_MAX = __SMC_NLA_FBACK_STATS_MAX - 1 +}; #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 30e30b23370f..6fb6f96c1d17 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -61,6 +61,11 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smc_nl_get_stats, }, + { + .cmd = SMC_NETLINK_GET_FBACK_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_fback_stats, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index 3477265cba6c..5ce2c0a89ccd 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -18,7 +18,7 @@ extern struct genl_family smc_gen_nl_family; struct smc_nl_dmp_ctx { - int pos[2]; + int pos[3]; }; static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index 72119d3d8558..b3d279d29c52 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -312,3 +312,95 @@ errnest: errmsg: return skb->len; } + +static int smc_nl_get_fback_details(struct sk_buff *skb, + struct netlink_callback *cb, int pos, + bool is_srv) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int cnt_reported = cb_ctx->pos[2]; + struct smc_stats_fback *trgt_arr; + struct nlattr *attrs; + int rc = 0; + void *nlh; + + if (is_srv) + trgt_arr = &fback_rsn.srv[0]; + else + trgt_arr = &fback_rsn.clnt[0]; + if (!trgt_arr[pos].fback_code) + return -ENODATA; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_FBACK_STATS); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) + goto errattr; + if (!cnt_reported) { + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, + fback_rsn.srv_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, + fback_rsn.clnt_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + cnt_reported = 1; + } + + if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, + trgt_arr[pos].fback_code)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, + trgt_arr[pos].count)) + goto errattr; + + cb_ctx->pos[2] = cnt_reported; + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return rc; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int rc_srv = 0, rc_clnt = 0, k; + int skip_serv = cb_ctx->pos[1]; + int snum = cb_ctx->pos[0]; + bool is_srv = true; + + mutex_lock(&smc_stat_fback_rsn); + for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { + if (k < snum) + continue; + if (!skip_serv) { + rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); + if (rc_srv && rc_srv != ENODATA) + break; + } else { + skip_serv = 0; + } + rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); + if (rc_clnt && rc_clnt != ENODATA) { + skip_serv = 1; + break; + } + if (rc_clnt == ENODATA && rc_srv == ENODATA) + break; + } + mutex_unlock(&smc_stat_fback_rsn); + cb_ctx->pos[1] = skip_serv; + cb_ctx->pos[0] = k; + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 84baaca59eaf..7c35b22d9e29 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -248,6 +248,7 @@ do { \ while (0) int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); int smc_stats_init(void) __init; void smc_stats_exit(void); -- cgit v1.2.3-58-ga151 From 194730a9beb52d2b030ea45e12d94868d4a0e6fd Mon Sep 17 00:00:00 2001 From: Guvenc Gulce Date: Wed, 16 Jun 2021 16:52:58 +0200 Subject: net/smc: Make SMC statistics network namespace aware Make the gathered SMC statistics network namespace aware, for each namespace collect an own set of statistic information. Signed-off-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/net_namespace.h | 4 ++ include/net/netns/smc.h | 16 +++++++ net/smc/af_smc.c | 65 ++++++++++++++++---------- net/smc/smc_core.c | 10 ++-- net/smc/smc_rx.c | 6 +-- net/smc/smc_stats.c | 47 +++++++++++-------- net/smc/smc_stats.h | 111 ++++++++++++++++++++++++-------------------- net/smc/smc_tx.c | 12 +++-- 8 files changed, 163 insertions(+), 108 deletions(-) create mode 100644 include/net/netns/smc.h (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fa5887143f0d..befc5b93f311 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -170,6 +171,9 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; +#if IS_ENABLED(CONFIG_SMC) + struct netns_smc smc; +#endif } __randomize_layout; #include diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h new file mode 100644 index 000000000000..ea8a9cf2619b --- /dev/null +++ b/include/net/netns/smc.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_SMC_H__ +#define __NETNS_SMC_H__ +#include +#include + +struct smc_stats_rsn; +struct smc_stats; +struct netns_smc { + /* per cpu counters for SMC */ + struct smc_stats __percpu *smc_stats; + /* protect fback_rsn */ + struct mutex mutex_fback_rsn; + struct smc_stats_rsn *fback_rsn; +}; +#endif diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index efeaed384769..e41fdac606d4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -529,15 +529,17 @@ static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, static void smc_stat_fallback(struct smc_sock *smc) { - mutex_lock(&smc_stat_fback_rsn); + struct net *net = sock_net(&smc->sk); + + mutex_lock(&net->smc.mutex_fback_rsn); if (smc->listen_smc) { - smc_stat_inc_fback_rsn_cnt(smc, fback_rsn.srv); - fback_rsn.srv_fback_cnt++; + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); + net->smc.fback_rsn->srv_fback_cnt++; } else { - smc_stat_inc_fback_rsn_cnt(smc, fback_rsn.clnt); - fback_rsn.clnt_fback_cnt++; + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); + net->smc.fback_rsn->clnt_fback_cnt++; } - mutex_unlock(&smc_stat_fback_rsn); + mutex_unlock(&net->smc.mutex_fback_rsn); } static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) @@ -568,10 +570,11 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { + struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ - this_cpu_inc(smc_stats->clnt_hshake_err_cnt); + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; @@ -579,7 +582,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { - this_cpu_inc(smc_stats->clnt_hshake_err_cnt); + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; @@ -1027,7 +1030,7 @@ static int __smc_connect(struct smc_sock *smc) if (rc) goto vlan_cleanup; - SMC_STAT_CLNT_SUCC_INC(aclc); + SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); @@ -1343,8 +1346,9 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + struct net *net = sock_net(newsmcsk); - this_cpu_inc(smc_stats->srv_hshake_err_cnt); + this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; @@ -1813,7 +1817,7 @@ static void smc_listen_work(struct work_struct *work) } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); - SMC_STAT_SERV_SUCC_INC(ini); + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: @@ -2242,7 +2246,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (val) { - SMC_STAT_INC(!smc->conn.lnk, ndly_cnt); + SMC_STAT_INC(smc, ndly_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); } @@ -2253,7 +2257,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (!val) { - SMC_STAT_INC(!smc->conn.lnk, cork_cnt); + SMC_STAT_INC(smc, cork_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); } @@ -2383,7 +2387,7 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); } else { - SMC_STAT_INC(!smc->conn.lnk, sendpage_cnt); + SMC_STAT_INC(smc, sendpage_cnt); rc = sock_no_sendpage(sock, page, offset, size, flags); } @@ -2434,7 +2438,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, flags = MSG_DONTWAIT; else flags = 0; - SMC_STAT_INC(!smc->conn.lnk, splice_cnt); + SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: @@ -2523,6 +2527,16 @@ static void __net_exit smc_net_exit(struct net *net) smc_pnet_net_exit(net); } +static __net_init int smc_net_stat_init(struct net *net) +{ + return smc_stats_init(net); +} + +static void __net_exit smc_net_stat_exit(struct net *net) +{ + smc_stats_exit(net); +} + static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, @@ -2530,6 +2544,11 @@ static struct pernet_operations smc_net_ops = { .size = sizeof(struct smc_net), }; +static struct pernet_operations smc_net_stat_ops = { + .init = smc_net_stat_init, + .exit = smc_net_stat_exit, +}; + static int __init smc_init(void) { int rc; @@ -2538,6 +2557,10 @@ static int __init smc_init(void) if (rc) return rc; + rc = register_pernet_subsys(&smc_net_stat_ops); + if (rc) + return rc; + smc_ism_init(); smc_clc_init(); @@ -2558,16 +2581,10 @@ static int __init smc_init(void) if (!smc_close_wq) goto out_alloc_hs_wq; - rc = smc_stats_init(); - if (rc) { - pr_err("%s: smc_stats_init fails with %d\n", __func__, rc); - goto out_alloc_wqs; - } - rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); - goto out_smc_stat; + goto out_alloc_wqs; } rc = smc_llc_init(); @@ -2619,8 +2636,6 @@ out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); -out_smc_stat: - smc_stats_exit(); out_alloc_wqs: destroy_workqueue(smc_close_wq); out_alloc_hs_wq: @@ -2643,11 +2658,11 @@ static void __exit smc_exit(void) smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_hs_wq); - smc_stats_exit(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); + unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d69f58f670a1..cd0d7c908b2a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2058,8 +2058,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { - SMC_STAT_RMB_SIZE(is_smcd, is_rmb, bufsize); - SMC_STAT_BUF_REUSE(is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } @@ -2074,13 +2074,13 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (IS_ERR(buf_desc)) { if (!is_dgraded) { is_dgraded = true; - SMC_STAT_RMB_DOWNGRADED(is_smcd, is_rmb); + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); } continue; } - SMC_STAT_RMB_ALLOC(is_smcd, is_rmb); - SMC_STAT_RMB_SIZE(is_smcd, is_rmb, bufsize); + SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; mutex_lock(lock); list_add(&buf_desc->list, buf_list); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index ce1ae39923b1..170b733bc736 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -228,7 +228,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, conn->urg_state == SMC_URG_READ) return -EINVAL; - SMC_STAT_INC(!conn->lnk, urg_data_cnt); + SMC_STAT_INC(smc, urg_data_cnt); if (conn->urg_state == SMC_URG_VALID) { if (!(flags & MSG_PEEK)) smc->conn.urg_state = SMC_URG_READ; @@ -307,10 +307,10 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, readable = atomic_read(&conn->bytes_to_rcv); if (readable >= conn->rmb_desc->len) - SMC_STAT_RMB_RX_FULL(!conn->lnk); + SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); if (len < readable) - SMC_STAT_RMB_RX_SIZE_SMALL(!conn->lnk); + SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index b3d279d29c52..614013e3b574 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -18,24 +18,28 @@ #include "smc_netlink.h" #include "smc_stats.h" -/* serialize fallback reason statistic gathering */ -DEFINE_MUTEX(smc_stat_fback_rsn); -struct smc_stats __percpu *smc_stats; /* per cpu counters for SMC */ -struct smc_stats_reason fback_rsn; - -int __init smc_stats_init(void) +int smc_stats_init(struct net *net) { - memset(&fback_rsn, 0, sizeof(fback_rsn)); - smc_stats = alloc_percpu(struct smc_stats); - if (!smc_stats) - return -ENOMEM; - + net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + if (!net->smc.fback_rsn) + goto err_fback; + net->smc.smc_stats = alloc_percpu(struct smc_stats); + if (!net->smc.smc_stats) + goto err_stats; + mutex_init(&net->smc.mutex_fback_rsn); return 0; + +err_stats: + kfree(net->smc.fback_rsn); +err_fback: + return -ENOMEM; } -void smc_stats_exit(void) +void smc_stats_exit(struct net *net) { - free_percpu(smc_stats); + kfree(net->smc.fback_rsn); + if (net->smc.smc_stats) + free_percpu(net->smc.smc_stats); } static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, @@ -256,6 +260,7 @@ int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); struct smc_stats *stats; struct nlattr *attrs; int cpu, i, size; @@ -279,7 +284,7 @@ int smc_nl_get_stats(struct sk_buff *skb, goto erralloc; size = sizeof(*stats) / sizeof(u64); for_each_possible_cpu(cpu) { - src = (u64 *)per_cpu_ptr(smc_stats, cpu); + src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); sum = (u64 *)stats; for (i = 0; i < size; i++) *(sum++) += *(src++); @@ -318,6 +323,7 @@ static int smc_nl_get_fback_details(struct sk_buff *skb, bool is_srv) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); int cnt_reported = cb_ctx->pos[2]; struct smc_stats_fback *trgt_arr; struct nlattr *attrs; @@ -325,9 +331,9 @@ static int smc_nl_get_fback_details(struct sk_buff *skb, void *nlh; if (is_srv) - trgt_arr = &fback_rsn.srv[0]; + trgt_arr = &net->smc.fback_rsn->srv[0]; else - trgt_arr = &fback_rsn.clnt[0]; + trgt_arr = &net->smc.fback_rsn->clnt[0]; if (!trgt_arr[pos].fback_code) return -ENODATA; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -342,11 +348,11 @@ static int smc_nl_get_fback_details(struct sk_buff *skb, goto errattr; if (!cnt_reported) { if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, - fback_rsn.srv_fback_cnt, + net->smc.fback_rsn->srv_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, - fback_rsn.clnt_fback_cnt, + net->smc.fback_rsn->clnt_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; cnt_reported = 1; @@ -375,12 +381,13 @@ errmsg: int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); int rc_srv = 0, rc_clnt = 0, k; int skip_serv = cb_ctx->pos[1]; int snum = cb_ctx->pos[0]; bool is_srv = true; - mutex_lock(&smc_stat_fback_rsn); + mutex_lock(&net->smc.mutex_fback_rsn); for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { if (k < snum) continue; @@ -399,7 +406,7 @@ int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) if (rc_clnt == ENODATA && rc_srv == ENODATA) break; } - mutex_unlock(&smc_stat_fback_rsn); + mutex_unlock(&net->smc.mutex_fback_rsn); cb_ctx->pos[1] = skip_serv; cb_ctx->pos[0] = k; return skb->len; diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 7c35b22d9e29..84b7ecd8c05c 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -21,10 +21,6 @@ #define SMC_MAX_FBACK_RSN_CNT 30 -extern struct smc_stats __percpu *smc_stats; /* per cpu counters for SMC */ -extern struct smc_stats_reason fback_rsn; -extern struct mutex smc_stat_fback_rsn; - enum { SMC_BUF_8K, SMC_BUF_16K, @@ -43,7 +39,7 @@ struct smc_stats_fback { u16 count; }; -struct smc_stats_reason { +struct smc_stats_rsn { struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; u64 srv_fback_cnt; @@ -92,122 +88,135 @@ struct smc_stats { u64 srv_hshake_err_cnt; }; -#define SMC_STAT_PAYLOAD_SUB(_tech, key, _len, _rc) \ +#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ do { \ + typeof(_smc_stats) stats = (_smc_stats); \ typeof(_tech) t = (_tech); \ typeof(_len) l = (_len); \ int _pos = fls64((l) >> 13); \ typeof(_rc) r = (_rc); \ int m = SMC_BUF_MAX - 1; \ - this_cpu_inc((*smc_stats).smc[t].key ## _cnt); \ + this_cpu_inc((*stats).smc[t].key ## _cnt); \ if (r <= 0) \ break; \ _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ - this_cpu_inc((*smc_stats).smc[t].key ## _pd.buf[_pos]); \ - this_cpu_add((*smc_stats).smc[t].key ## _bytes, r); \ + this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*stats).smc[t].key ## _bytes, r); \ } \ while (0) #define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ do { \ typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ typeof(length) _len = (length); \ typeof(rcode) _rc = (rcode); \ bool is_smcd = !__smc->conn.lnk; \ if (is_smcd) \ - SMC_STAT_PAYLOAD_SUB(SMC_TYPE_D, tx, _len, _rc); \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \ else \ - SMC_STAT_PAYLOAD_SUB(SMC_TYPE_R, tx, _len, _rc); \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \ } \ while (0) #define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ do { \ typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ typeof(length) _len = (length); \ typeof(rcode) _rc = (rcode); \ bool is_smcd = !__smc->conn.lnk; \ if (is_smcd) \ - SMC_STAT_PAYLOAD_SUB(SMC_TYPE_D, rx, _len, _rc); \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \ else \ - SMC_STAT_PAYLOAD_SUB(SMC_TYPE_R, rx, _len, _rc); \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \ } \ while (0) -#define SMC_STAT_RMB_SIZE_SUB(_tech, k, _len) \ +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ do { \ typeof(_len) _l = (_len); \ typeof(_tech) t = (_tech); \ int _pos = fls((_l) >> 13); \ int m = SMC_BUF_MAX - 1; \ _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ - this_cpu_inc((*smc_stats).smc[t].k ## _rmbsize.buf[_pos]); \ + this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ } \ while (0) -#define SMC_STAT_RMB_SUB(type, t, key) \ - this_cpu_inc((*smc_stats).smc[t].rmb ## _ ## key.type ## _cnt) +#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ + this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) -#define SMC_STAT_RMB_SIZE(_is_smcd, _is_rx, _len) \ +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ do { \ + struct net *_net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ typeof(_is_smcd) is_d = (_is_smcd); \ typeof(_is_rx) is_r = (_is_rx); \ typeof(_len) l = (_len); \ if ((is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_D, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ if ((is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_D, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ if (!(is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_R, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ if (!(is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(SMC_TYPE_R, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ } \ while (0) -#define SMC_STAT_RMB(type, _is_smcd, _is_rx) \ +#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \ do { \ + struct net *net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \ typeof(_is_smcd) is_d = (_is_smcd); \ typeof(_is_rx) is_r = (_is_rx); \ if ((is_d) && (is_r)) \ - SMC_STAT_RMB_SUB(type, SMC_TYPE_D, rx); \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \ if ((is_d) && !(is_r)) \ - SMC_STAT_RMB_SUB(type, SMC_TYPE_D, tx); \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \ if (!(is_d) && (is_r)) \ - SMC_STAT_RMB_SUB(type, SMC_TYPE_R, rx); \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \ if (!(is_d) && !(is_r)) \ - SMC_STAT_RMB_SUB(type, SMC_TYPE_R, tx); \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \ } \ while (0) -#define SMC_STAT_BUF_REUSE(is_smcd, is_rx) \ - SMC_STAT_RMB(reuse, is_smcd, is_rx) +#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, reuse, is_smcd, is_rx) -#define SMC_STAT_RMB_ALLOC(is_smcd, is_rx) \ - SMC_STAT_RMB(alloc, is_smcd, is_rx) +#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, alloc, is_smcd, is_rx) -#define SMC_STAT_RMB_DOWNGRADED(is_smcd, is_rx) \ - SMC_STAT_RMB(dgrade, is_smcd, is_rx) +#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx) -#define SMC_STAT_RMB_TX_PEER_FULL(is_smcd) \ - SMC_STAT_RMB(buf_full_peer, is_smcd, false) +#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false) -#define SMC_STAT_RMB_TX_FULL(is_smcd) \ - SMC_STAT_RMB(buf_full, is_smcd, false) +#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, false) -#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(is_smcd) \ - SMC_STAT_RMB(buf_size_small_peer, is_smcd, false) +#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false) -#define SMC_STAT_RMB_TX_SIZE_SMALL(is_smcd) \ - SMC_STAT_RMB(buf_size_small, is_smcd, false) +#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, false) -#define SMC_STAT_RMB_RX_SIZE_SMALL(is_smcd) \ - SMC_STAT_RMB(buf_size_small, is_smcd, true) +#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, true) -#define SMC_STAT_RMB_RX_FULL(is_smcd) \ - SMC_STAT_RMB(buf_full, is_smcd, true) +#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, true) -#define SMC_STAT_INC(is_smcd, type) \ +#define SMC_STAT_INC(_smc, type) \ do { \ + typeof(_smc) __smc = _smc; \ + bool is_smcd = !(__smc)->conn.lnk; \ + struct net *net = sock_net(&(__smc)->sk); \ + struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \ if ((is_smcd)) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ else \ @@ -215,11 +224,12 @@ do { \ } \ while (0) -#define SMC_STAT_CLNT_SUCC_INC(_aclc) \ +#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \ do { \ typeof(_aclc) acl = (_aclc); \ bool is_v2 = (acl->hdr.version == SMC_V2); \ bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \ if (is_v2 && is_smcd) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ else if (is_v2 && !is_smcd) \ @@ -231,11 +241,12 @@ do { \ } \ while (0) -#define SMC_STAT_SERV_SUCC_INC(_ini) \ +#define SMC_STAT_SERV_SUCC_INC(net, _ini) \ do { \ typeof(_ini) i = (_ini); \ bool is_v2 = (i->smcd_version & SMC_V2); \ bool is_smcd = (i->is_smcd); \ + typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ if (is_v2 && is_smcd) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ else if (is_v2 && !is_smcd) \ @@ -249,7 +260,7 @@ while (0) int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); -int smc_stats_init(void) __init; -void smc_stats_exit(void); +int smc_stats_init(struct net *net); +void smc_stats_exit(struct net *net); #endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index a043544d715f..075c4f4b41cf 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -47,7 +47,7 @@ static void smc_tx_write_space(struct sock *sk) /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { if (test_bit(SOCK_NOSPACE, &sock->flags)) - SMC_STAT_RMB_TX_FULL(!smc->conn.lnk); + SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); @@ -155,13 +155,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } if (len > conn->sndbuf_desc->len) - SMC_STAT_RMB_TX_SIZE_SMALL(!conn->lnk); + SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); if (len > conn->peer_rmbe_size) - SMC_STAT_RMB_TX_PEER_SIZE_SMALL(!conn->lnk); + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); if (msg->msg_flags & MSG_OOB) - SMC_STAT_INC(!conn->lnk, urg_data_cnt); + SMC_STAT_INC(smc, urg_data_cnt); while (msg_data_left(msg)) { if (sk->sk_state == SMC_INIT) @@ -432,7 +432,9 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) { - SMC_STAT_RMB_TX_PEER_FULL(!conn->lnk); + struct smc_sock *smc = container_of(conn, struct smc_sock, + conn); + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); -- cgit v1.2.3-58-ga151 From 5acc44f39458f43dac9724cefa4da29847cfe997 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 11 Jun 2021 19:06:45 +0200 Subject: netfilter: nft_exthdr: Search chunks in SCTP packets only Since user space does not generate a payload dependency, plain sctp chunk matches cause searching in non-SCTP packets, too. Avoid this potential mis-interpretation of packet data by checking pkt->tprot. Fixes: 133dc203d77df ("netfilter: nft_exthdr: Support SCTP chunks") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 7f705b5c09de..9cf86be2cff4 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -312,6 +312,9 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; + if (pkt->tprot != IPPROTO_SCTP) + goto err; + do { sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); if (!sch || !sch->length) @@ -334,7 +337,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, } offset += SCTP_PAD4(ntohs(sch->length)); } while (offset < pkt->skb->len); - +err: if (priv->flags & NFT_EXTHDR_F_PRESENT) nft_reg_store8(dest, false); else -- cgit v1.2.3-58-ga151 From 06e95f0a2aa24d480cbc0c3bd18ca49e1c85f868 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 11 Jun 2021 19:08:26 +0200 Subject: netfilter: nft_extdhr: Drop pointless check of tprot_set Pablo says, tprot_set is only there to detect if tprot was set to IPPROTO_IP as that evaluates to zero. Therefore, code asserting a different value in tprot does not need to check tprot_set. Fixes: 935b7f6430188 ("netfilter: nft_exthdr: add TCP option matching") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 9cf86be2cff4..4f583d2e220e 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -164,7 +164,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, { struct tcphdr *tcph; - if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + if (pkt->tprot != IPPROTO_TCP) return NULL; tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); -- cgit v1.2.3-58-ga151 From 836382dc24717af203ce06703530528827086955 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Jun 2021 22:25:05 +0200 Subject: netfilter: nf_tables: add last expression Add a new optional expression that tells you when last matching on a given rule / set element element has happened. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + include/uapi/linux/netfilter/nf_tables.h | 15 ++++++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_last.c | 87 ++++++++++++++++++++++++++++++++ 5 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/nft_last.c (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 46c8d5bb5d8d..0fa5a6d98a00 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -16,6 +16,7 @@ extern struct nft_expr_type nft_range_type; extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; +extern struct nft_expr_type nft_last_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 19715e2679d1..e94d1fa554cb 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1195,6 +1195,21 @@ enum nft_counter_attributes { }; #define NFTA_COUNTER_MAX (__NFTA_COUNTER_MAX - 1) +/** + * enum nft_last_attributes - nf_tables last expression netlink attributes + * + * @NFTA_LAST_SET: last update has been set, zero means never updated (NLA_U32) + * @NFTA_LAST_MSECS: milliseconds since last update (NLA_U64) + */ +enum nft_last_attributes { + NFTA_LAST_UNSPEC, + NFTA_LAST_SET, + NFTA_LAST_MSECS, + NFTA_LAST_PAD, + __NFTA_LAST_MAX +}; +#define NFTA_LAST_MAX (__NFTA_LAST_MAX - 1) + /** * enum nft_log_attributes - nf_tables log expression netlink attributes * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 87112dad1fd4..049890e00a3d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -74,7 +74,7 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 7780342e2f2d..866cfba04d6c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -268,6 +268,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_meta_type, &nft_rt_type, &nft_exthdr_type, + &nft_last_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c new file mode 100644 index 000000000000..913ac45167f2 --- /dev/null +++ b/net/netfilter/nft_last.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_last_priv { + unsigned long last_jiffies; + unsigned int last_set; +}; + +static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { + [NFTA_LAST_SET] = { .type = NLA_U32 }, + [NFTA_LAST_MSECS] = { .type = NLA_U64 }, +}; + +static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + u64 last_jiffies; + int err; + + if (tb[NFTA_LAST_MSECS]) { + err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); + if (err < 0) + return err; + + priv->last_jiffies = jiffies + (unsigned long)last_jiffies; + priv->last_set = 1; + } + + return 0; +} + +static void nft_last_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + + priv->last_jiffies = jiffies; + priv->last_set = 1; +} + +static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + __be64 msecs; + + if (time_before(jiffies, priv->last_jiffies)) + priv->last_set = 0; + + if (priv->last_set) + msecs = nf_jiffies64_to_msecs(jiffies - priv->last_jiffies); + else + msecs = 0; + + if (nla_put_be32(skb, NFTA_LAST_SET, htonl(priv->last_set)) || + nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_last_ops = { + .type = &nft_last_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), + .eval = nft_last_eval, + .init = nft_last_init, + .dump = nft_last_dump, +}; + +struct nft_expr_type nft_last_type __read_mostly = { + .name = "last", + .ops = &nft_last_ops, + .policy = nft_last_policy, + .maxattr = NFTA_LAST_MAX, + .flags = NFT_EXPR_STATEFUL, + .owner = THIS_MODULE, +}; -- cgit v1.2.3-58-ga151 From 55d96f72e8ddc0a294e0b9c94016edbb699537e1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 17 Jun 2021 16:02:07 +0800 Subject: net: sched: fix error return code in tcf_del_walker() When nla_put_u32() fails, 'ret' could be 0, it should return error code in tcf_del_walker(). Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller --- net/sched/act_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f6d5755d669e..d17a66aab8ee 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -381,7 +381,8 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, } mutex_unlock(&idrinfo->lock); - if (nla_put_u32(skb, TCA_FCNT, n_i)) + ret = nla_put_u32(skb, TCA_FCNT, n_i); + if (ret) goto nla_put_failure; nla_nest_end(skb, nest); -- cgit v1.2.3-58-ga151 From 62eec0d73393a136b4523952cecbda1438f1f1b9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Jun 2021 22:06:19 +0200 Subject: netfilter: conntrack: pass hook state to log functions The packet logger backend is unable to provide the incoming (or outgoing) interface name because that information isn't available. Pass the hook state, it contains the network namespace, the protocol family, the network interfaces and other things. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 20 ++++++++++++-------- net/netfilter/nf_conntrack_proto.c | 16 +++++++++------- net/netfilter/nf_conntrack_proto_dccp.c | 14 +++++++------- net/netfilter/nf_conntrack_proto_icmp.c | 7 +++---- net/netfilter/nf_conntrack_proto_icmpv6.c | 3 +-- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 23 ++++++++++++----------- net/netfilter/nf_conntrack_proto_udp.c | 6 ++---- 8 files changed, 47 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 96f9cf81f46b..1f47bef51722 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -159,22 +159,26 @@ unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -__printf(3, 4) __cold +__printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...); -__printf(5, 6) __cold +__printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, - struct net *net, - u16 pf, u8 protonum, + const struct nf_hook_state *state, + u8 protonum, const char *fmt, ...); #else -static inline __printf(5, 6) __cold -void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, - u16 pf, u8 protonum, const char *fmt, ...) {} -static inline __printf(3, 4) __cold +static inline __printf(4, 5) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_hook_state *state, + u8 protonum, + const char *fmt, ...) {} +static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index be14e0bea4c8..55647409a9be 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -45,12 +45,13 @@ static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL -__printf(5, 6) +__printf(4, 5) void nf_l4proto_log_invalid(const struct sk_buff *skb, - struct net *net, - u16 pf, u8 protonum, + const struct nf_hook_state *state, + u8 protonum, const char *fmt, ...) { + struct net *net = state->net; struct va_format vaf; va_list args; @@ -62,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_proto_%d: %pV ", protonum, &vaf); + nf_log_packet(net, state->pf, 0, skb, state->in, state->out, + NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); -__printf(3, 4) +__printf(4, 5) void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...) { struct va_format vaf; @@ -85,7 +87,7 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_l4proto_log_invalid(skb, state, nf_ct_protonum(ct), "%pV", &vaf); va_end(args); } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 4f33307fa3cf..c1557d47ccd1 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -382,7 +382,8 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh) + const struct dccp_hdr *dh, + const struct nf_hook_state *hook_state) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; @@ -414,7 +415,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); + nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); return false; } @@ -464,8 +465,7 @@ static bool dccp_error(const struct dccp_hdr *dh, } return false; out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_DCCP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); return true; } @@ -488,7 +488,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, return -NF_ACCEPT; type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) + if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) return -NF_ACCEPT; if (type == DCCP_PKT_RESET && @@ -543,11 +543,11 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); + nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); + nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 4efd8741c105..b38b7164acd5 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -170,12 +170,12 @@ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { if (state->pf == AF_INET) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI4 != inner %pI4", &outer_daddr->ip, &ct_daddr->ip); } else if (state->pf == AF_INET6) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI6 != inner %pI6", &outer_daddr->ip6, &ct_daddr->ip6); @@ -197,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index facd8c64ec4e..61e3b05cf02c 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -126,8 +126,7 @@ static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMPV6, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fb8dc02e502f..2394238d01c9 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -351,7 +351,7 @@ static bool sctp_error(struct sk_buff *skb, } return false; out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg); + nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index de840fc41a2e..f7e8baf59b51 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -446,14 +446,15 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static bool tcp_in_window(const struct nf_conn *ct, - struct ip_ct_tcp *state, +static bool tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph) + const struct tcphdr *tcph, + const struct nf_hook_state *hook_state) { + struct ip_ct_tcp *state = &ct->proto.tcp; struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; @@ -670,7 +671,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tn->tcp_be_liberal) res = true; if (!res) { - nf_ct_l4proto_log_invalid(skb, ct, + nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? @@ -710,7 +711,7 @@ static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ @@ -970,7 +971,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, + nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", index, dir, tcp_conntrack_names[old_state]); @@ -995,7 +996,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); + nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -1010,7 +1011,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); + nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -1035,7 +1036,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); + nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } @@ -1079,8 +1080,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, break; } - if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th)) { + if (!tcp_in_window(ct, dir, index, + skb, dataoff, th, state)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 68911fcaa0f1..698fee49e732 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -38,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, @@ -130,8 +129,7 @@ static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDPLITE, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, -- cgit v1.2.3-58-ga151 From f654fae47e83e56b454fbbfd0af0a4f232e356d6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Jun 2021 09:58:05 +0200 Subject: xsk: Fix broken Tx ring validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix broken Tx ring validation for AF_XDP. The commit under the Fixes tag, fixed an off-by-one error in the validation but introduced another error. Descriptors are now let through even if they straddle a chunk boundary which they are not allowed to do in aligned mode. Worse is that they are let through even if they straddle the end of the umem itself, tricking the kernel to read data outside the allowed umem region which might or might not be mapped at all. Fix this by reintroducing the old code, but subtract the length by one to fix the off-by-one error that the original patch was addressing. The test chunk != chunk_end makes sure packets do not straddle chunk boundraries. Note that packets of zero length are allowed in the interface, therefore the test if the length is non-zero. Fixes: ac31565c2193 ("xsk: Fix for xp_aligned_validate_desc() when len == chunk_size") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Reviewed-by: Xuan Zhuo Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210618075805.14412-1-magnus.karlsson@gmail.com --- net/xdp/xsk_queue.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9d2a89d793c0..9ae13cccfb28 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -128,12 +128,15 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 chunk; - - if (desc->len > pool->chunk_size) - return false; + u64 chunk, chunk_end; chunk = xp_aligned_extract_addr(pool, desc->addr); + if (likely(desc->len)) { + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len - 1); + if (chunk != chunk_end) + return false; + } + if (chunk >= pool->addrs_cnt) return false; -- cgit v1.2.3-58-ga151 From 8b532109bf885b7b59b93487bc4672eb6d071b78 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Thu, 17 Jun 2021 19:16:44 +0200 Subject: seg6: add support for SRv6 End.DT46 Behavior IETF RFC 8986 [1] includes the definition of SRv6 End.DT4, End.DT6, and End.DT46 Behaviors. The current SRv6 code in the Linux kernel only implements End.DT4 and End.DT6 which can be used respectively to support IPv4-in-IPv6 and IPv6-in-IPv6 VPNs. With End.DT4 and End.DT6 it is not possible to create a single SRv6 VPN tunnel to carry both IPv4 and IPv6 traffic. The proposed End.DT46 implementation is meant to support the decapsulation of IPv4 and IPv6 traffic coming from a single SRv6 tunnel. The implementation of the SRv6 End.DT46 Behavior in the Linux kernel greatly simplifies the setup and operations of SRv6 VPNs. The SRv6 End.DT46 Behavior leverages the infrastructure of SRv6 End.DT{4,6} Behaviors implemented so far, because it makes use of a VRF device in order to force the routing lookup into the associated routing table. To make the End.DT46 work properly, it must be guaranteed that the routing table used for routing lookup operations is bound to one and only one VRF during the tunnel creation. Such constraint has to be enforced by enabling the VRF strict_mode sysctl parameter, i.e.: $ sysctl -wq net.vrf.strict_mode=1 Note that the same approach is used for the SRv6 End.DT4 Behavior and for the End.DT6 Behavior in VRF mode. The command used to instantiate an SRv6 End.DT46 Behavior is straightforward, i.e.: $ ip -6 route add 2001:db8::1 encap seg6local action End.DT46 vrftable 100 dev vrf100. [1] https://www.rfc-editor.org/rfc/rfc8986.html#name-enddt46-decapsulation-and-s ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Performance and impact of SRv6 End.DT46 Behavior on the SRv6 Networking ======================================================================= This patch aims to add the SRv6 End.DT46 Behavior with minimal impact on the performance of SRv6 End.DT4 and End.DT6 Behaviors. In order to verify this, we tested the performance of the newly introduced SRv6 End.DT46 Behavior and compared it with the performance of SRv6 End.DT{4,6} Behaviors, considering both the patched kernel and the kernel before applying the End.DT46 patch (referred to as vanilla kernel). In details, the following decapsulation scenarios were considered: 1.a) IPv6 traffic in SRv6 End.DT46 Behavior on patched kernel; 1.b) IPv4 traffic in SRv6 End.DT46 Behavior on patched kernel; 2.a) SRv6 End.DT6 Behavior (VRF mode) on patched kernel; 2.b) SRv6 End.DT4 Behavior on patched kernel; 3.a) SRv6 End.DT6 Behavior (VRF mode) on vanilla kernel (without the End.DT46 patch); 3.b) SRv6 End.DT4 Behavior on vanilla kernel (without the End.DT46 patch). All tests were performed on a testbed deployed on the CloudLab [2] facilities. We considered IPv{4,6} traffic handled by a single core (at 2.4 GHz on a Xeon(R) CPU E5-2630 v3) on kernel 5.13-rc1 using packets of size ~ 100 bytes. Scenario (1.a): average 684.70 kpps; std. dev. 0.7 kpps; Scenario (1.b): average 711.69 kpps; std. dev. 1.2 kpps; Scenario (2.a): average 690.70 kpps; std. dev. 1.2 kpps; Scenario (2.b): average 722.22 kpps; std. dev. 1.7 kpps; Scenario (3.a): average 690.02 kpps; std. dev. 2.6 kpps; Scenario (3.b): average 721.91 kpps; std. dev. 1.2 kpps; Considering the results for the patched kernel (1.a, 1.b, 2.a, 2.b) we observe that the performance degradation incurred in using End.DT46 rather than End.DT6 and End.DT4 respectively for IPv6 and IPv4 traffic is minimal, around 0.9% and 1.5%. Such very minimal performance degradation is the price to be paid if one prefers to use a single tunnel capable of handling both types of traffic (IPv4 and IPv6). Comparing the results for End.DT4 and End.DT6 under the patched and the vanilla kernel (2.a, 2.b, 3.a, 3.b) we observe that the introduction of the End.DT46 patch has no impact on the performance of End.DT4 and End.DT6. [2] https://www.cloudlab.us Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/seg6_local.h | 2 + net/ipv6/seg6_local.c | 94 +++++++++++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 5ae3ace84de0..332b18f318f8 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -64,6 +64,8 @@ enum { SEG6_LOCAL_ACTION_END_AM = 14, /* custom BPF action */ SEG6_LOCAL_ACTION_END_BPF = 15, + /* decap and lookup of DA in v4 or v6 table */ + SEG6_LOCAL_ACTION_END_DT46 = 16, __SEG6_LOCAL_ACTION_MAX, }; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 4ff38cb08f4b..60bf3b877957 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -87,10 +87,10 @@ struct seg6_end_dt_info { int vrf_ifindex; int vrf_table; - /* tunneled packet proto and family (IPv4 or IPv6) */ - __be16 proto; + /* tunneled packet family (IPv4 or IPv6). + * Protocol and header length are inferred from family. + */ u16 family; - int hdrlen; }; struct pcpu_seg6_local_counters { @@ -521,19 +521,6 @@ static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg, info->net = net; info->vrf_ifindex = vrf_ifindex; - switch (family) { - case AF_INET: - info->proto = htons(ETH_P_IP); - info->hdrlen = sizeof(struct iphdr); - break; - case AF_INET6: - info->proto = htons(ETH_P_IPV6); - info->hdrlen = sizeof(struct ipv6hdr); - break; - default: - return -EINVAL; - } - info->family = family; info->mode = DT_VRF_MODE; @@ -622,22 +609,44 @@ error: } static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, u16 family) { struct seg6_end_dt_info *info = &slwt->dt_info; struct net_device *vrf; + __be16 protocol; + int hdrlen; vrf = end_dt_get_vrf_rcu(skb, info); if (unlikely(!vrf)) goto drop; - skb->protocol = info->proto; + switch (family) { + case AF_INET: + protocol = htons(ETH_P_IP); + hdrlen = sizeof(struct iphdr); + break; + case AF_INET6: + protocol = htons(ETH_P_IPV6); + hdrlen = sizeof(struct ipv6hdr); + break; + case AF_UNSPEC: + fallthrough; + default: + goto drop; + } + + if (unlikely(info->family != AF_UNSPEC && info->family != family)) { + pr_warn_once("seg6local: SRv6 End.DT* family mismatch"); + goto drop; + } + + skb->protocol = protocol; skb_dst_drop(skb); - skb_set_transport_header(skb, info->hdrlen); + skb_set_transport_header(skb, hdrlen); - return end_dt_vrf_rcv(skb, info->family, vrf); + return end_dt_vrf_rcv(skb, family, vrf); drop: kfree_skb(skb); @@ -656,7 +665,7 @@ static int input_action_end_dt4(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -739,7 +748,7 @@ static int input_action_end_dt6(struct sk_buff *skb, goto legacy_mode; /* DT6_VRF_MODE */ - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET6); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -767,6 +776,36 @@ drop: return -EINVAL; } +#ifdef CONFIG_NET_L3_MASTER_DEV +static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack); +} + +static int input_action_end_dt46(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + unsigned int off = 0; + int nexthdr; + + nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL); + if (unlikely(nexthdr < 0)) + goto drop; + + switch (nexthdr) { + case IPPROTO_IPIP: + return input_action_end_dt4(skb, slwt); + case IPPROTO_IPV6: + return input_action_end_dt6(skb, slwt); + } + +drop: + kfree_skb(skb); + return -EINVAL; +} +#endif + /* push an SRH on top of the current one */ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -968,6 +1007,17 @@ static struct seg6_action_desc seg6_action_table[] = { #endif .input = input_action_end_dt6, }, + { + .action = SEG6_LOCAL_ACTION_END_DT46, + .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, +#ifdef CONFIG_NET_L3_MASTER_DEV + .input = input_action_end_dt46, + .slwt_ops = { + .build_state = seg6_end_dt46_build, + }, +#endif + }, { .action = SEG6_LOCAL_ACTION_END_B6, .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), -- cgit v1.2.3-58-ga151 From 752e906732c69412087f716e93baa0330cb7cce3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:07 -0700 Subject: mptcp: add csum_enabled in mptcp_sock This patch added a new member named csum_enabled in struct mptcp_sock, used a dummy mptcp_is_checksum_enabled() helper to initialize it. Also added a new member named mptcpi_csum_enabled in struct mptcp_info to expose the csum_enabled flag. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 1 + net/mptcp/mptcp_diag.c | 1 + net/mptcp/protocol.c | 1 + net/mptcp/protocol.h | 2 ++ 4 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 8eb3c0844bff..7b05f7102321 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -105,6 +105,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + __u8 mptcpi_csum_enabled; }; /* diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f16d9b5ee978..8f88ddeab6a2 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -144,6 +144,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 993095089990..2caca0dc2c1c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2453,6 +2453,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_init(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 89f6b73783d5..1fc6693e257e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -234,6 +234,7 @@ struct mptcp_sock { bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ + bool csum_enabled; spinlock_t join_list_lock; struct sock *ack_hint; struct work_struct work; @@ -525,6 +526,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); +static inline int mptcp_is_checksum_enabled(struct net *net) { return false; } void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -- cgit v1.2.3-58-ga151 From d0cc298745f5abb3c43319cb9485daf3471d6f94 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:08 -0700 Subject: mptcp: generate the data checksum This patch added a new member named csum in struct mptcp_ext, implemented a new function named mptcp_generate_data_checksum(). Generate the data checksum in mptcp_sendmsg_frag, save it in mpext->csum. Note that we must generate the csum for zero window probe, too. Do the csum update incrementally, to avoid multiple csum computation when the data is appended to existing skb. Note that in a later patch we will skip unneeded csum related operation. Changes not included here to keep the delta small. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 1 + net/mptcp/protocol.c | 18 +++++++++++++++++- net/mptcp/protocol.h | 7 +++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 83f23774b908..23bbd439e115 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -23,6 +23,7 @@ struct mptcp_ext { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2caca0dc2c1c..f0da067301f6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1308,6 +1308,18 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); } +/* note: this always recompute the csum on the whole skb, even + * if we just appended a single frag. More status info needed + */ +static void mptcp_update_data_checksum(struct sk_buff *skb, int added) +{ + struct mptcp_ext *mpext = mptcp_get_ext(skb); + __wsum csum = ~csum_unfold(mpext->csum); + int offset = skb->len - added; + + mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1402,10 +1414,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mpext->frozen = 1; - ret = 0; + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); tcp_push_pending_frames(ssk); + return 0; } out: + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 1fc6693e257e..4913ac7b6d19 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -336,6 +336,13 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } +struct csum_pseudo_header { + __be64 data_seq; + __be32 subflow_seq; + __be16 data_len; + __sum16 csum; +}; + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, -- cgit v1.2.3-58-ga151 From 06fe1719aa501e3b574b1b2b3a7ad2ddac5fb9cb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:09 -0700 Subject: mptcp: add csum_reqd in mptcp_out_options This patch added a new member csum_reqd in struct mptcp_out_options and struct mptcp_subflow_request_sock. Initialized it with the helper function mptcp_is_checksum_enabled(). In mptcp_write_options, if this field is enabled, send out the MP_CAPABLE suboption with the MPTCP_CAP_CHECKSUM_REQD flag. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 5 +++-- net/mptcp/options.c | 11 +++++++++-- net/mptcp/protocol.h | 3 ++- net/mptcp/subflow.c | 1 + 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 23bbd439e115..33af68eea96f 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -64,8 +64,9 @@ struct mptcp_out_options { struct mptcp_rm_list rm_list; u8 join_id; u8 backup; - u8 reset_reason:4; - u8 reset_transient:1; + u8 reset_reason:4, + reset_transient:1, + csum_reqd:1; u32 nonce; u64 thmac; u32 token; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6b825fb3fa83..bb3a1f3b6e99 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -380,6 +380,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { @@ -435,6 +436,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; @@ -465,6 +467,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK @@ -789,6 +792,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; + opts->csum_reqd = subflow_req->csum_reqd; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -1123,7 +1127,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { - u8 len; + u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) len = TCPOLEN_MPTCP_MPC_SYN; @@ -1134,9 +1138,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, else len = TCPOLEN_MPTCP_MPC_ACK; + if (opts->csum_reqd) + flag |= MPTCP_CAP_CHECKSUM_REQD; + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, - MPTCP_CAP_HMAC_SHA256); + flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4913ac7b6d19..09e94726e030 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -347,7 +347,8 @@ struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + csum_reqd : 1; u8 local_id; u8 remote_id; u64 local_key; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 33956337c46b..45acab63c387 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -108,6 +108,7 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } -- cgit v1.2.3-58-ga151 From c94b1f96dcfb2e5bd072b10f3429ccf28778ad58 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:10 -0700 Subject: mptcp: send out checksum for MP_CAPABLE with data If the checksum is enabled, send out the data checksum with the MP_CAPABLE suboption with data. In mptcp_established_options_mp, save the data checksum in opts->ext_copy.csum. In mptcp_write_options, adjust the option length and send it out with the MP_CAPABLE suboption. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bb3a1f3b6e99..b4da08db1221 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -439,6 +439,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; + u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than @@ -474,10 +475,16 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ - if (data_len > 0) - *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); - else + if (data_len > 0) { + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + if (opts->csum_reqd) { + opts->ext_copy.csum = mpext->csum; + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + *size = ALIGN(len, 4); + } else { *size = TCPOLEN_MPTCP_MPC_ACK; + } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, @@ -1122,6 +1129,25 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + struct csum_pseudo_header header; + __wsum csum; + + /* cfr RFC 8684 3.3.1.: + * the data sequence number used in the pseudo-header is + * always the 64-bit value, irrespective of what length is used in the + * DSS option itself. + */ + header.data_seq = cpu_to_be64(mpext->data_seq); + header.subflow_seq = htonl(mpext->subflow_seq); + header.data_len = htons(mpext->data_len); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + return (__force u16)csum_fold(csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { @@ -1129,14 +1155,17 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, OPTION_MPTCP_MPC_ACK) & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; - if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; - else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - else if (opts->ext_copy.data_len) + } else if (opts->ext_copy.data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; - else + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } else { len = TCPOLEN_MPTCP_MPC_ACK; + } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; @@ -1159,8 +1188,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (!opts->ext_copy.data_len) goto mp_capable_done; - put_unaligned_be32(opts->ext_copy.data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + mptcp_make_csum(&opts->ext_copy), ptr); + } else { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } ptr += 1; } -- cgit v1.2.3-58-ga151 From c5b39e26d0036423be09c39ad142e91a2d5d278b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:11 -0700 Subject: mptcp: send out checksum for DSS In mptcp_write_options, if the checksum is enabled, adjust the option length and send out the data checksum with DSS suboption. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b4da08db1221..1468774f1f87 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -478,6 +478,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { + /* we need to propagate more info to csum the pseudo hdr */ + opts->ext_copy.data_seq = mpext->data_seq; + opts->ext_copy.subflow_seq = mpext->subflow_seq; opts->ext_copy.csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } @@ -545,18 +548,21 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; u64 ack_seq; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { - unsigned int map_size; + unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; - map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + if (mpext) { + if (opts->csum_reqd) + map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; - remaining -= map_size; - dss_size = map_size; - if (mpext) opts->ext_copy = *mpext; + } + remaining -= map_size; + dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; @@ -1346,6 +1352,9 @@ mp_capable_done: flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); @@ -1365,8 +1374,13 @@ mp_capable_done: ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } } } -- cgit v1.2.3-58-ga151 From c863225b79426459feca2ef5b0cc2f07e8e68771 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:12 -0700 Subject: mptcp: add sk parameter for mptcp_get_options This patch added a new parameter name sk in mptcp_get_options(). Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 5 +++-- net/mptcp/protocol.h | 3 ++- net/mptcp/subflow.c | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1468774f1f87..ae69059583a7 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -323,7 +323,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); @@ -1024,7 +1025,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return; } - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 09e94726e030..a7ed0b8eb9bc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -586,7 +586,8 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 45acab63c387..aa6b307b27c8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -151,7 +151,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -248,7 +248,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) return -EINVAL; @@ -395,7 +395,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { if (!mp_opt.mp_capable) { MPTCP_INC_STATS(sock_net(sk), @@ -639,7 +639,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_capable) { fallback = true; goto create_child; @@ -649,7 +649,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); -- cgit v1.2.3-58-ga151 From 0625118115cf2ee8e435bf86d1c1f0bfdee9d7c8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:13 -0700 Subject: mptcp: add csum_reqd in mptcp_options_received This patch added a new flag csum_reqd in struct mptcp_options_received, if the flag MPTCP_CAP_CHECKSUM_REQD is set in the receiving MP_CAPABLE suboption, set this flag. In mptcp_sk_clone and subflow_finish_connect, if the csum_reqd flag is set, enable the msk->csum_enabled flag. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 7 ++++--- net/mptcp/protocol.c | 2 ++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index ae69059583a7..2e2551590ecd 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -71,11 +71,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, * "If a checksum is not present when its use has been * negotiated, the receiver MUST close the subflow with a RST as * it is considered broken." - * - * We don't implement DSS checksum - fall back to TCP. */ if (flags & MPTCP_CAP_CHECKSUM_REQD) - break; + mp_opt->csum_reqd = 1; mp_opt->mp_capable = 1; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { @@ -327,6 +325,8 @@ void mptcp_get_options(const struct sock *sk, const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; @@ -342,6 +342,7 @@ void mptcp_get_options(const struct sock *sk, mp_opt->dss = 0; mp_opt->mp_prio = 0; mp_opt->reset = 0; + mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f0da067301f6..b6e5c0930533 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2810,6 +2810,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); + if (mp_opt->csum_reqd) + WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a7ed0b8eb9bc..66e5063ac6c9 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -133,6 +133,7 @@ struct mptcp_options_received { rm_addr : 1, mp_prio : 1, echo : 1, + csum_reqd : 1, backup : 1; u32 token; u32 nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index aa6b307b27c8..9b82ce635c6e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -405,6 +405,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } + if (mp_opt.csum_reqd) + WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; -- cgit v1.2.3-58-ga151 From 208e8f66926c5d73e3f359385c1dd49dbc48d067 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:14 -0700 Subject: mptcp: receive checksum for MP_CAPABLE with data This patch added a new member named csum in struct mptcp_options_received. When parsing the MP_CAPABLE with data, if the checksum is enabled, adjust the expected_opsize. If the receiving option length matches the length with the data checksum, get the checksum value and save it in mp_opt->csum. And in mptcp_incoming_options, pass it to mpext->csum. We always parse any csum/nocsum combination and delay the presence check to later code, to allow reset if missing. Additionally, in the TX path, use the newly introduce ext field to avoid MPTCP csum recomputation on TCP retransmission and unneeded csum update on when setting the data fin_flag. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 3 ++- net/mptcp/options.c | 35 ++++++++++++++++++++++++++--------- net/mptcp/protocol.h | 3 +++ 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 33af68eea96f..d61bbbf11979 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -32,7 +32,8 @@ struct mptcp_ext { mpc_map:1, frozen:1, reset_transient:1; - u8 reset_reason:4; + u8 reset_reason:4, + csum_reqd:1; }; #define MPTCP_RM_IDS_MAX 8 diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2e2551590ecd..8cbc75868969 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb, else expected_opsize = TCPOLEN_MPTCP_MPC_SYN; } - if (opsize != expected_opsize) + + /* Cfr RFC 8684 Section 3.3.0: + * If a checksum is present but its use had + * not been negotiated in the MP_CAPABLE handshake, the receiver MUST + * close the subflow with a RST, as it is not behaving as negotiated. + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST, as it is considered + * broken + * We parse even option with mismatching csum presence, so that + * later in subflow_data_ready we can trigger the reset. + */ + if (opsize != expected_opsize && + (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || + opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ @@ -66,11 +79,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." - * - * Section 3.3.0: - * "If a checksum is not present when its use has been - * negotiated, the receiver MUST close the subflow with a RST as - * it is considered broken." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->csum_reqd = 1; @@ -84,7 +92,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } - if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used @@ -96,9 +104,14 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum_reqd = 1; + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, - mp_opt->rcvr_key, mp_opt->data_len); + mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: @@ -1118,6 +1131,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; + mpext->csum_reqd = mp_opt.csum_reqd; + + if (mpext->csum_reqd) + mpext->csum = mp_opt.csum; } } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 66e5063ac6c9..76194babc754 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -68,6 +68,8 @@ #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) + /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_HMAC_LEN 20 @@ -124,6 +126,7 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u16 mp_capable : 1, mp_join : 1, fastclose : 1, -- cgit v1.2.3-58-ga151 From 390b95a5fb84e7999eedb021382c96d1500e01fc Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:15 -0700 Subject: mptcp: receive checksum for DSS In mptcp_parse_option, adjust the expected_opsize, and always parse the data checksum value from the receiving DSS regardless of csum presence. Then save it in mp_opt->csum. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8cbc75868969..1aec01686c1a 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -182,10 +182,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } - /* RFC 6824, Section 3.3: - * If a checksum is present, but its use had - * not been negotiated in the MP_CAPABLE handshake, - * the checksum field MUST be ignored. + /* Always parse any csum presence combination, we will enforce + * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) @@ -220,9 +218,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { + mp_opt->csum_reqd = 1; + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + ptr += 2; + } + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len); + mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum); } break; -- cgit v1.2.3-58-ga151 From dd8bcd1768ff76bf2da1154897871adcc4ec078a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 17 Jun 2021 16:46:16 -0700 Subject: mptcp: validate the data checksum This patch added three new members named data_csum, csum_len and map_csum in struct mptcp_subflow_context, implemented a new function named mptcp_validate_data_checksum(). If the current mapping is valid and csum is enabled traverse the later pending skbs and compute csum incrementally till the whole mapping has been covered. If not enough data is available in the rx queue, return MAPPING_EMPTY - that is, no data. Next subflow_data_ready invocation will trigger again csum computation. When the full DSS is available, validate the csum and return to the caller an appropriate error code, to trigger subflow reset of fallback as required by the RFC. Additionally: - if the csum prevence in the DSS don't match the negotiated value e.g. csum present, but not requested, return invalid mapping to trigger subflow reset. - keep some csum state, to avoid re-compute the csum on the same data when multiple rx queue traversal are required. - clean-up the uncompleted mapping from the receive queue on close, to allow proper subflow disposal Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 4 ++ net/mptcp/subflow.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 103 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 76194babc754..12637299b42e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -400,6 +400,8 @@ struct mptcp_subflow_context { u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; + __wsum map_data_csum; + u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, @@ -409,6 +411,8 @@ struct mptcp_subflow_context { pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, + map_csum_reqd : 1, + map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9b82ce635c6e..9ccc4686d0d4 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -827,10 +827,90 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } +static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, + bool csum_reqd) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct csum_pseudo_header header; + u32 offset, seq, delta; + __wsum csum; + int len; + + if (!csum_reqd) + return MAPPING_OK; + + /* mapping already validated on previous traversal */ + if (subflow->map_csum_len == subflow->map_data_len) + return MAPPING_OK; + + /* traverse the receive queue, ensuring it contains a full + * DSS mapping and accumulating the related csum. + * Preserve the accoumlate csum across multiple calls, to compute + * the csum only once + */ + delta = subflow->map_data_len - subflow->map_csum_len; + for (;;) { + seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; + offset = seq - TCP_SKB_CB(skb)->seq; + + /* if the current skb has not been accounted yet, csum its contents + * up to the amount covered by the current DSS + */ + if (offset < skb->len) { + __wsum csum; + + len = min(skb->len - offset, delta); + csum = skb_checksum(skb, offset, len, 0); + subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, + subflow->map_csum_len); + + delta -= len; + subflow->map_csum_len += len; + } + if (delta == 0) + break; + + if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { + /* if this subflow is closed, the partial mapping + * will be never completed; flush the pending skbs, so + * that subflow_sched_work_if_closed() can kick in + */ + if (unlikely(ssk->sk_state == TCP_CLOSE)) + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + + /* not enough data to validate the csum */ + return MAPPING_EMPTY; + } + + /* the DSS mapping for next skbs will be validated later, + * when a get_mapping_status call will process such skb + */ + skb = skb->next; + } + + /* note that 'map_data_len' accounts only for the carried data, does + * not include the eventual seq increment due to the data fin, + * while the pseudo header requires the original DSS data len, + * including that + */ + header.data_seq = cpu_to_be64(subflow->map_seq); + header.subflow_seq = htonl(subflow->map_subflow_seq); + header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); + if (unlikely(csum_fold(csum))) + return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + + return MAPPING_OK; +} + static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -923,9 +1003,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk, /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && - subflow->map_data_len == data_len) { + subflow->map_data_len == data_len && + subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + goto validate_csum; } /* If this skb data are fully covered by the current mapping, @@ -937,17 +1018,27 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } /* will validate the next map after consuming the current one */ - return MAPPING_OK; + goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_csum_reqd = mpext->csum_reqd; + subflow->map_csum_len = 0; + subflow->map_data_csum = csum_unfold(mpext->csum); + + /* Cfr RFC 8684 Section 3.3.0 */ + if (unlikely(subflow->map_csum_reqd != csum_reqd)) + return MAPPING_INVALID; + + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, - subflow->map_data_len); + subflow->map_data_len, subflow->map_csum_reqd, + subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure @@ -957,7 +1048,9 @@ validate_seq: return MAPPING_INVALID; skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + +validate_csum: + return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, -- cgit v1.2.3-58-ga151 From 4e14867d5e9185e38f730d65c89b728640d68dd1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 17 Jun 2021 16:46:17 -0700 Subject: mptcp: tune re-injections for csum enabled mode If the MPTCP-level checksum is enabled, on re-injections we must spool a complete DSS, or the receive side will not be able to compute the csum and process any data. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b6e5c0930533..42fc7187beee 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2375,8 +2375,8 @@ static void __mptcp_retrans(struct sock *sk) /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; + while (info.sent < info.limit) { if (!mptcp_alloc_tx_skb(sk, ssk)) break; @@ -2388,9 +2388,11 @@ static void __mptcp_retrans(struct sock *sk) copied += ret; info.sent += ret; } - if (copied) + if (copied) { + dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + } mptcp_set_timeout(sk, ssk); release_sock(ssk); -- cgit v1.2.3-58-ga151 From fe3ab1cbd357d9d0903f2d00038c2cb7141e7fe5 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:18 -0700 Subject: mptcp: add the mib for data checksum This patch added the mib for the data checksum, MPTCP_MIB_DATACSUMERR. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/subflow.c | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index eb2dc6dbe212..e7e60bc1fb96 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index f0da4f060fe1..92e56c0cfbdd 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ccc4686d0d4..6b1cd4257edf 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -900,8 +900,10 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * header.csum = 0; csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); - if (unlikely(csum_fold(csum))) + if (unlikely(csum_fold(csum))) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + } return MAPPING_OK; } -- cgit v1.2.3-58-ga151 From fc3c82eebf8e2e193412612f509530b4ff5611bf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 17 Jun 2021 16:46:19 -0700 Subject: mptcp: add a new sysctl checksum_enabled This patch added a new sysctl, named checksum_enabled, to control whether DSS checksum can be enabled. Acked-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- Documentation/networking/mptcp-sysctl.rst | 8 ++++++++ net/mptcp/ctrl.c | 16 ++++++++++++++++ net/mptcp/protocol.h | 2 +- 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 3b352e5f6300..ee06fd782465 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -24,3 +24,11 @@ add_addr_timeout - INTEGER (seconds) sysctl. Default: 120 + +checksum_enabled - BOOLEAN + Control whether DSS checksum can be enabled. + + DSS checksum can be enabled if the value is nonzero. This is a + per-namespace sysctl. + + Default: 0 diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 1ec4d36a39f0..6c2639bb9c19 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -23,6 +23,7 @@ struct mptcp_pernet { u8 mptcp_enabled; unsigned int add_addr_timeout; + u8 checksum_enabled; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -40,10 +41,16 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net) return mptcp_get_pernet(net)->add_addr_timeout; } +int mptcp_is_checksum_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->checksum_enabled; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; + pernet->checksum_enabled = 0; } #ifdef CONFIG_SYSCTL @@ -65,6 +72,14 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "checksum_enabled", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, {} }; @@ -82,6 +97,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; + table[2].data = &pernet->checksum_enabled; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 12637299b42e..16e50caf200e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -542,7 +542,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); -static inline int mptcp_is_checksum_enabled(struct net *net) { return false; } +int mptcp_is_checksum_enabled(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -- cgit v1.2.3-58-ga151 From 9d72b8da9f13349be11914823d7bd8186c6a91ce Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Jun 2021 21:55:56 -0700 Subject: net: vlan: pass thru all GSO_SOFTWARE in hw_enc_features Currently UDP tunnel devices on top of VLANs lose the ability to offload UDP GSO. Widen the pass thru features from TSO to all GSO_SOFTWARE. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/8021q/vlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index e3f6ff05a528..1a705a4ef7fa 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -108,7 +108,8 @@ static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) netdev_features_t ret; ret = real_dev->hw_enc_features & - (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL); + (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; -- cgit v1.2.3-58-ga151 From 9fd2bc3206b31c8ff6d54d643730d4c3470471d6 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 18 Jun 2021 15:32:04 +0800 Subject: net: caif: modify the label out_err to out Modify the label out_err to out to avoid the meanless kfree. Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller --- net/caif/cfcnfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index cac30e676ac9..23267c8db7c4 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -480,7 +480,7 @@ got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; - goto out_err; + goto out; } phy_layer->id = phyid; -- cgit v1.2.3-58-ga151 From 040c12570e6865b1a219c9d7f7f4a924a6570d1e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Jun 2021 11:01:55 +0100 Subject: net: bridge: remove redundant continue statement The continue statement at the end of a for-loop has no effect, invert the if expression and remove the continue. Addresses-Coverity: ("Continue has no effect") Signed-off-by: Colin Ian King Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index da3256a3eed0..8789a57af543 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -113,9 +113,7 @@ static void __vlan_add_list(struct net_bridge_vlan *v) headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); - if (v->vid < vent->vid) - continue; - else + if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); -- cgit v1.2.3-58-ga151 From cc97141afd768d36eaef1b3e1afea2a74da7df27 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 18 Jun 2021 15:35:24 +0200 Subject: vsock: rename vsock_has_data() vsock_has_data() is used only by STREAM and SEQPACKET sockets, so let's rename it to vsock_connectible_has_data(), using the same nomenclature (connectible) used in other functions after the introduction of SEQPACKET. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 67954afef4e1..de8249483081 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -860,7 +860,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_stream_has_data); -static s64 vsock_has_data(struct vsock_sock *vsk) +static s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); @@ -1880,7 +1880,7 @@ static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait, err = 0; transport = vsk->transport; - while ((data = vsock_has_data(vsk)) == 0) { + while ((data = vsock_connectible_has_data(vsk)) == 0) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); if (sk->sk_err != 0 || -- cgit v1.2.3-58-ga151 From 0de5b2e67275695d6ad7369c594feb1578f891fd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 18 Jun 2021 15:35:25 +0200 Subject: vsock: rename vsock_wait_data() vsock_wait_data() is used only by STREAM and SEQPACKET sockets, so let's rename it to vsock_connectible_wait_data(), using the same nomenclature (connectible) used in other functions after the introduction of SEQPACKET. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index de8249483081..21ccf450e249 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1866,10 +1866,11 @@ out: return err; } -static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait, - long timeout, - struct vsock_transport_recv_notify_data *recv_data, - size_t target) +static int vsock_connectible_wait_data(struct sock *sk, + struct wait_queue_entry *wait, + long timeout, + struct vsock_transport_recv_notify_data *recv_data, + size_t target) { const struct vsock_transport *transport; struct vsock_sock *vsk; @@ -1967,7 +1968,8 @@ static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, while (1) { ssize_t read; - err = vsock_wait_data(sk, &wait, timeout, &recv_data, target); + err = vsock_connectible_wait_data(sk, &wait, timeout, + &recv_data, target); if (err <= 0) break; @@ -2022,7 +2024,7 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - err = vsock_wait_data(sk, &wait, timeout, NULL, 0); + err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); if (err <= 0) goto out; -- cgit v1.2.3-58-ga151 From 91aa49a8fa0ffa66966be275b2575009cc12fd3b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 18 Jun 2021 15:35:26 +0200 Subject: vsock/virtio: remove redundant `copy_failed` variable When memcpy_to_msg() fails in virtio_transport_seqpacket_do_dequeue(), we already set `dequeued_len` with the negative error value returned by memcpy_to_msg(). So we can directly check `dequeued_len` value instead of using a dedicated flag variable to skip the copy path for the rest of fragments. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 23704a6bc437..f014ccfdd9c2 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -413,7 +413,6 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; int dequeued_len = 0; size_t user_buf_len = msg_data_left(msg); - bool copy_failed = false; bool msg_ready = false; spin_lock_bh(&vvs->rx_lock); @@ -426,7 +425,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, while (!msg_ready) { pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); - if (!copy_failed) { + if (dequeued_len >= 0) { size_t pkt_len; size_t bytes_to_copy; @@ -443,11 +442,9 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); if (err) { - /* Copy of message failed, set flag to skip - * copy path for rest of fragments. Rest of + /* Copy of message failed. Rest of * fragments will be freed without copy. */ - copy_failed = true; dequeued_len = err; } else { user_buf_len -= bytes_to_copy; -- cgit v1.2.3-58-ga151 From 1f3c98eaddec857e16a7a1c6cd83317b3dc89438 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Fri, 18 Jun 2021 22:32:47 +0800 Subject: net: add pf_family_names[] for protocol family Modify the pr_info content from int to char *, this looks more readable. Signed-off-by: Yejune Deng Signed-off-by: David S. Miller --- include/uapi/linux/net.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/socket.c | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/net.h b/include/uapi/linux/net.h index 4dabec6bd957..a28caaf620c7 100644 --- a/include/uapi/linux/net.h +++ b/include/uapi/linux/net.h @@ -55,4 +55,52 @@ typedef enum { #define __SO_ACCEPTCON (1 << 16) /* performed a listen */ +static const char * const pf_family_names[] = { + [PF_UNSPEC] = "PF_UNSPEC", + [PF_UNIX] = "PF_UNIX/PF_LOCAL", + [PF_INET] = "PF_INET", + [PF_AX25] = "PF_AX25", + [PF_IPX] = "PF_IPX", + [PF_APPLETALK] = "PF_APPLETALK", + [PF_NETROM] = "PF_NETROM", + [PF_BRIDGE] = "PF_BRIDGE", + [PF_ATMPVC] = "PF_ATMPVC", + [PF_X25] = "PF_X25", + [PF_INET6] = "PF_INET6", + [PF_ROSE] = "PF_ROSE", + [PF_DECnet] = "PF_DECnet", + [PF_NETBEUI] = "PF_NETBEUI", + [PF_SECURITY] = "PF_SECURITY", + [PF_KEY] = "PF_KEY", + [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", + [PF_PACKET] = "PF_PACKET", + [PF_ASH] = "PF_ASH", + [PF_ECONET] = "PF_ECONET", + [PF_ATMSVC] = "PF_ATMSVC", + [PF_RDS] = "PF_RDS", + [PF_SNA] = "PF_SNA", + [PF_IRDA] = "PF_IRDA", + [PF_PPPOX] = "PF_PPPOX", + [PF_WANPIPE] = "PF_WANPIPE", + [PF_LLC] = "PF_LLC", + [PF_IB] = "PF_IB", + [PF_MPLS] = "PF_MPLS", + [PF_CAN] = "PF_CAN", + [PF_TIPC] = "PF_TIPC", + [PF_BLUETOOTH] = "PF_BLUETOOTH", + [PF_IUCV] = "PF_IUCV", + [PF_RXRPC] = "PF_RXRPC", + [PF_ISDN] = "PF_ISDN", + [PF_PHONET] = "PF_PHONET", + [PF_IEEE802154] = "PF_IEEE802154", + [PF_CAIF] = "PF_CAIF", + [PF_ALG] = "PF_ALG", + [PF_NFC] = "PF_NFC", + [PF_VSOCK] = "PF_VSOCK", + [PF_KCM] = "PF_KCM", + [PF_QIPCRTR] = "PF_QIPCRTR", + [PF_SMC] = "PF_SMC", + [PF_XDP] = "PF_XDP", +}; + #endif /* _UAPI_LINUX_NET_H */ diff --git a/net/socket.c b/net/socket.c index 27e3e7d53f8e..ff544cf50321 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2988,7 +2988,7 @@ int sock_register(const struct net_proto_family *ops) } spin_unlock(&net_family_lock); - pr_info("NET: Registered protocol family %d\n", ops->family); + pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); -- cgit v1.2.3-58-ga151 From 103ebe658a262ef5b5db7f01d83857cf82a087d0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 18 Jun 2021 13:02:45 -0700 Subject: Revert "net: add pf_family_names[] for protocol family" This reverts commit 1f3c98eaddec857e16a7a1c6cd83317b3dc89438. Does not build... Signed-off-by: David S. Miller --- include/uapi/linux/net.h | 48 ------------------------------------------------ net/socket.c | 2 +- 2 files changed, 1 insertion(+), 49 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/net.h b/include/uapi/linux/net.h index a28caaf620c7..4dabec6bd957 100644 --- a/include/uapi/linux/net.h +++ b/include/uapi/linux/net.h @@ -55,52 +55,4 @@ typedef enum { #define __SO_ACCEPTCON (1 << 16) /* performed a listen */ -static const char * const pf_family_names[] = { - [PF_UNSPEC] = "PF_UNSPEC", - [PF_UNIX] = "PF_UNIX/PF_LOCAL", - [PF_INET] = "PF_INET", - [PF_AX25] = "PF_AX25", - [PF_IPX] = "PF_IPX", - [PF_APPLETALK] = "PF_APPLETALK", - [PF_NETROM] = "PF_NETROM", - [PF_BRIDGE] = "PF_BRIDGE", - [PF_ATMPVC] = "PF_ATMPVC", - [PF_X25] = "PF_X25", - [PF_INET6] = "PF_INET6", - [PF_ROSE] = "PF_ROSE", - [PF_DECnet] = "PF_DECnet", - [PF_NETBEUI] = "PF_NETBEUI", - [PF_SECURITY] = "PF_SECURITY", - [PF_KEY] = "PF_KEY", - [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", - [PF_PACKET] = "PF_PACKET", - [PF_ASH] = "PF_ASH", - [PF_ECONET] = "PF_ECONET", - [PF_ATMSVC] = "PF_ATMSVC", - [PF_RDS] = "PF_RDS", - [PF_SNA] = "PF_SNA", - [PF_IRDA] = "PF_IRDA", - [PF_PPPOX] = "PF_PPPOX", - [PF_WANPIPE] = "PF_WANPIPE", - [PF_LLC] = "PF_LLC", - [PF_IB] = "PF_IB", - [PF_MPLS] = "PF_MPLS", - [PF_CAN] = "PF_CAN", - [PF_TIPC] = "PF_TIPC", - [PF_BLUETOOTH] = "PF_BLUETOOTH", - [PF_IUCV] = "PF_IUCV", - [PF_RXRPC] = "PF_RXRPC", - [PF_ISDN] = "PF_ISDN", - [PF_PHONET] = "PF_PHONET", - [PF_IEEE802154] = "PF_IEEE802154", - [PF_CAIF] = "PF_CAIF", - [PF_ALG] = "PF_ALG", - [PF_NFC] = "PF_NFC", - [PF_VSOCK] = "PF_VSOCK", - [PF_KCM] = "PF_KCM", - [PF_QIPCRTR] = "PF_QIPCRTR", - [PF_SMC] = "PF_SMC", - [PF_XDP] = "PF_XDP", -}; - #endif /* _UAPI_LINUX_NET_H */ diff --git a/net/socket.c b/net/socket.c index ff544cf50321..27e3e7d53f8e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2988,7 +2988,7 @@ int sock_register(const struct net_proto_family *ops) } spin_unlock(&net_family_lock); - pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); + pr_info("NET: Registered protocol family %d\n", ops->family); return err; } EXPORT_SYMBOL(sock_register); -- cgit v1.2.3-58-ga151 From d5f9023fa61ee8b94f37a93f08e94b136cf1e463 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sat, 19 Jun 2021 13:18:13 -0300 Subject: can: bcm: delay release of struct bcm_op after synchronize_rcu() can_rx_register() callbacks may be called concurrently to the call to can_rx_unregister(). The callbacks and callback data, though, are protected by RCU and the struct sock reference count. So the callback data is really attached to the life of sk, meaning that it should be released on sk_destruct. However, bcm_remove_op() calls tasklet_kill(), and RCU callbacks may be called under RCU softirq, so that cannot be used on kernels before the introduction of HRTIMER_MODE_SOFT. However, bcm_rx_handler() is called under RCU protection, so after calling can_rx_unregister(), we may call synchronize_rcu() in order to wait for any RCU read-side critical sections to finish. That is, bcm_rx_handler() won't be called anymore for those ops. So, we only free them, after we do that synchronize_rcu(). Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Link: https://lore.kernel.org/r/20210619161813.2098382-1-cascardo@canonical.com Cc: linux-stable Reported-by: syzbot+0f7e7e5e2f4f40fa89c0@syzkaller.appspotmail.com Reported-by: Norbert Slusarek Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index f3e4d9528fa3..0928a39c4423 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -785,6 +785,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, bcm_rx_handler, op); list_del(&op->list); + synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } @@ -1533,9 +1534,13 @@ static int bcm_release(struct socket *sock) REGMASK(op->can_id), bcm_rx_handler, op); - bcm_remove_op(op); } + synchronize_rcu(); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) + bcm_remove_op(op); + #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) -- cgit v1.2.3-58-ga151 From fb8696ab14adadb2e3f6c17c18ed26b3ecd96691 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 18 Jun 2021 19:36:45 +0200 Subject: can: gw: synchronize rcu operations before removing gw job entry can_can_gw_rcv() is called under RCU protection, so after calling can_rx_unregister(), we have to call synchronize_rcu in order to wait for any RCU read-side critical sections to finish before removing the kmem_cache entry with the referenced gw job entry. Link: https://lore.kernel.org/r/20210618173645.2238-1-socketcan@hartkopp.net Fixes: c1aabdf379bc ("can-gw: add netlink based CAN routing") Cc: linux-stable Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/gw.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index ba4124805602..d8861e862f15 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -596,6 +596,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1154,6 +1155,7 @@ static void cgw_remove_all_jobs(struct net *net) hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1222,6 +1224,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); err = 0; break; -- cgit v1.2.3-58-ga151 From 14a4696bc3118ba49da28f79280e1d55603aa737 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 18 Jun 2021 19:37:13 +0200 Subject: can: isotp: isotp_release(): omit unintended hrtimer restart on socket release When closing the isotp socket, the potentially running hrtimers are canceled before removing the subscription for CAN identifiers via can_rx_unregister(). This may lead to an unintended (re)start of a hrtimer in isotp_rcv_cf() and isotp_rcv_fc() in the case that a CAN frame is received by isotp_rcv() while the subscription removal is processed. However, isotp_rcv() is called under RCU protection, so after calling can_rx_unregister, we may call synchronize_rcu in order to wait for any RCU read-side critical sections to finish. This prevents the reception of CAN frames after hrtimer_cancel() and therefore the unintended (re)start of the hrtimers. Link: https://lore.kernel.org/r/20210618173713.2296-1-socketcan@hartkopp.net Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Cc: linux-stable Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index be6183f8ca11..234cc4ad179a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1028,9 +1028,6 @@ static int isotp_release(struct socket *sock) lock_sock(sk); - hrtimer_cancel(&so->txtimer); - hrtimer_cancel(&so->rxtimer); - /* remove current filters & unregister */ if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { if (so->ifindex) { @@ -1042,10 +1039,14 @@ static int isotp_release(struct socket *sock) SINGLE_MASK(so->rxid), isotp_rcv, sk); dev_put(dev); + synchronize_rcu(); } } } + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + so->ifindex = 0; so->bound = 0; -- cgit v1.2.3-58-ga151 From 22c696fed25c63c7f67508309820358b94a96b6d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 17 Jun 2021 15:06:23 +0200 Subject: can: j1939: j1939_sk_init(): set SOCK_RCU_FREE to call sk_destruct() after RCU is done Set SOCK_RCU_FREE to let RCU to call sk_destruct() on completion. Without this patch, we will run in to j1939_can_recv() after priv was freed by j1939_sk_release()->j1939_sk_sock_destruct() Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Link: https://lore.kernel.org/r/20210617130623.12705-1-o.rempel@pengutronix.de Cc: linux-stable Reported-by: Thadeu Lima de Souza Cascardo Reported-by: syzbot+bdf710cfc41c186fdff3@syzkaller.appspotmail.com Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 4 ++++ net/can/j1939/socket.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index da3a7a7bcff2..08c8606cfd9c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -193,6 +193,10 @@ static void j1939_can_rx_unregister(struct j1939_priv *priv) can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); + /* The last reference of priv is dropped by the RCU deferred + * j1939_sk_sock_destruct() of the last socket, so we can + * safely drop this reference here. + */ j1939_priv_put(priv); } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 56aa66147d5a..fce8bc8afeb7 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -398,6 +398,9 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + + /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ + sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; -- cgit v1.2.3-58-ga151 From cfc61c598e43772cc4f76b8fc40c5ec70675716b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Jun 2021 15:51:56 +0200 Subject: xfrm: replay: avoid xfrm replay notify indirection replay protection is implemented using a callback structure and then called via x->repl->notify(), x->repl->recheck(), and so on. all the differect functions are always built-in, so this could be direct calls instead. This first patch prepares for removal of the x->repl structure. Add an enum with the three available replay modes to the xfrm_state structure and then replace all x->repl->notify() calls by the new xfrm_replay_notify() helper. The helper checks the enum internally to adapt behaviour as needed. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 ++++++++++- net/xfrm/xfrm_replay.c | 45 ++++++++++++++++++++++++++++----------------- net/xfrm/xfrm_state.c | 2 +- 3 files changed, 39 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3a01570410ab..9a79e41defa7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -145,6 +145,12 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; +enum xfrm_replay_mode { + XFRM_REPLAY_MODE_LEGACY, + XFRM_REPLAY_MODE_BMP, + XFRM_REPLAY_MODE_ESN, +}; + /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; @@ -218,6 +224,8 @@ struct xfrm_state { /* The functions for replay detection. */ const struct xfrm_replay *repl; + /* replay detection mode */ + enum xfrm_replay_mode repl_mode; /* internal flag that only holds state for delayed aevent at the * moment */ @@ -305,7 +313,6 @@ struct xfrm_replay { int (*recheck)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); - void (*notify)(struct xfrm_state *x, int event); int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; @@ -1715,6 +1722,8 @@ static inline int xfrm_policy_id2dir(u32 index) } #ifdef CONFIG_XFRM +void xfrm_replay_notify(struct xfrm_state *x, int event); + static inline int xfrm_aevent_is_on(struct net *net) { struct sock *nlsk; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index c6a4338a0d08..5feeb65f00b3 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -34,8 +34,11 @@ u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq) return seq_hi; } EXPORT_SYMBOL(xfrm_replay_seqhi); -; -static void xfrm_replay_notify(struct xfrm_state *x, int event) + +static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event); +static void xfrm_replay_notify_esn(struct xfrm_state *x, int event); + +void xfrm_replay_notify(struct xfrm_state *x, int event) { struct km_event c; /* we send notify messages in case @@ -48,6 +51,17 @@ static void xfrm_replay_notify(struct xfrm_state *x, int event) * The state structure must be locked! */ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + xfrm_replay_notify_bmp(x, event); + return; + case XFRM_REPLAY_MODE_ESN: + xfrm_replay_notify_esn(x, event); + return; + } + switch (event) { case XFRM_REPLAY_UPDATE: if (!x->replay_maxdiff || @@ -98,7 +112,7 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) return err; } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -157,7 +171,7 @@ static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) @@ -178,7 +192,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) return err; } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -273,7 +287,7 @@ static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) @@ -416,7 +430,7 @@ static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb) } } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -548,7 +562,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } #ifdef CONFIG_XFRM_OFFLOAD @@ -585,7 +599,7 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk x->replay.oseq = oseq; if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -625,7 +639,7 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -674,7 +688,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff replay_esn->oseq = oseq; if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -684,7 +698,6 @@ static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, - .notify = xfrm_replay_notify, .overflow = xfrm_replay_overflow_offload, }; @@ -692,7 +705,6 @@ static const struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, - .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_offload_bmp, }; @@ -700,7 +712,6 @@ static const struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, - .notify = xfrm_replay_notify_esn, .overflow = xfrm_replay_overflow_offload_esn, }; #else @@ -708,7 +719,6 @@ static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, - .notify = xfrm_replay_notify, .overflow = xfrm_replay_overflow, }; @@ -716,7 +726,6 @@ static const struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, - .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_bmp, }; @@ -724,7 +733,6 @@ static const struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, - .notify = xfrm_replay_notify_esn, .overflow = xfrm_replay_overflow_esn, }; #endif @@ -742,11 +750,14 @@ int xfrm_init_replay(struct xfrm_state *x) if (replay_esn->replay_window == 0) return -EINVAL; x->repl = &xfrm_replay_esn; + x->repl_mode = XFRM_REPLAY_MODE_ESN; } else { x->repl = &xfrm_replay_bmp; + x->repl_mode = XFRM_REPLAY_MODE_BMP; } } else { x->repl = &xfrm_replay_legacy; + x->repl_mode = XFRM_REPLAY_MODE_LEGACY; } return 0; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8f6058e56f7f..c2ce1e6f4760 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2177,7 +2177,7 @@ static void xfrm_replay_timer_handler(struct timer_list *t) if (x->km.state == XFRM_STATE_VALID) { if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_TIMEOUT); + xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); else x->xflags |= XFRM_TIME_DEFER; } -- cgit v1.2.3-58-ga151 From c7f877833c9f361be8e88d6b140d8314e80892aa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Jun 2021 15:51:57 +0200 Subject: xfrm: replay: remove advance indirection Similar to other patches: add a new helper to avoid an indirection. v2: fix 'net/xfrm/xfrm_replay.c:519:13: warning: 'seq' may be used uninitialized in this function' warning. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_replay.c | 24 +++++++++++++++--------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9a79e41defa7..a7f997b13198 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -306,7 +306,6 @@ struct km_event { }; struct xfrm_replay { - void (*advance)(struct xfrm_state *x, __be32 net_seq); int (*check)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); @@ -1722,6 +1721,7 @@ static inline int xfrm_policy_id2dir(u32 index) } #ifdef CONFIG_XFRM +void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); static inline int xfrm_aevent_is_on(struct net *net) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1158cd0311d7..c8971e4b33ab 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -665,7 +665,7 @@ resume: goto drop_unlock; } - x->repl->advance(x, seq); + xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 5feeb65f00b3..9565b0f7d380 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -150,14 +150,26 @@ err: return -EINVAL; } -static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) +static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq); +static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq); + +void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) { - u32 diff; - u32 seq = ntohl(net_seq); + u32 diff, seq; + + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_advance_bmp(x, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_advance_esn(x, net_seq); + } if (!x->props.replay_window) return; + seq = ntohl(net_seq); if (seq > x->replay.seq) { diff = seq - x->replay.seq; if (diff < x->props.replay_window) @@ -695,42 +707,36 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } static const struct xfrm_replay xfrm_replay_legacy = { - .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, .overflow = xfrm_replay_overflow_offload, }; static const struct xfrm_replay xfrm_replay_bmp = { - .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_offload_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { - .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, .overflow = xfrm_replay_overflow_offload_esn, }; #else static const struct xfrm_replay xfrm_replay_legacy = { - .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, .overflow = xfrm_replay_overflow, }; static const struct xfrm_replay xfrm_replay_bmp = { - .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { - .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, .overflow = xfrm_replay_overflow_esn, -- cgit v1.2.3-58-ga151 From 25cfb8bc97c2b8447f86b1ad376ee672b6b173d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Jun 2021 15:51:58 +0200 Subject: xfrm: replay: remove recheck indirection Adds new xfrm_replay_recheck() helper and calls it from xfrm input path instead of the indirection. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +--- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_replay.c | 22 ++++++++++++++++------ 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a7f997b13198..3a219b34cb8c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -309,9 +309,6 @@ struct xfrm_replay { int (*check)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); - int (*recheck)(struct xfrm_state *x, - struct sk_buff *skb, - __be32 net_seq); int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; @@ -1723,6 +1720,7 @@ static inline int xfrm_policy_id2dir(u32 index) #ifdef CONFIG_XFRM void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); +int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); static inline int xfrm_aevent_is_on(struct net *net) { diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c8971e4b33ab..8046ef1a6680 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -660,7 +660,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (x->repl->recheck(x, skb, seq)) { + if (xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 9565b0f7d380..59391dc80fa3 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -519,6 +519,22 @@ static int xfrm_replay_recheck_esn(struct xfrm_state *x, return xfrm_replay_check_esn(x, skb, net_seq); } +int xfrm_replay_recheck(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + /* no special recheck treatment */ + return xfrm_replay_check_bmp(x, skb, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_recheck_esn(x, skb, net_seq); + } + + return xfrm_replay_check(x, skb, net_seq); +} + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) { unsigned int bitnr, nr, i; @@ -708,37 +724,31 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff static const struct xfrm_replay xfrm_replay_legacy = { .check = xfrm_replay_check, - .recheck = xfrm_replay_check, .overflow = xfrm_replay_overflow_offload, }; static const struct xfrm_replay xfrm_replay_bmp = { .check = xfrm_replay_check_bmp, - .recheck = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_offload_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { .check = xfrm_replay_check_esn, - .recheck = xfrm_replay_recheck_esn, .overflow = xfrm_replay_overflow_offload_esn, }; #else static const struct xfrm_replay xfrm_replay_legacy = { .check = xfrm_replay_check, - .recheck = xfrm_replay_check, .overflow = xfrm_replay_overflow, }; static const struct xfrm_replay xfrm_replay_bmp = { .check = xfrm_replay_check_bmp, - .recheck = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { .check = xfrm_replay_check_esn, - .recheck = xfrm_replay_recheck_esn, .overflow = xfrm_replay_overflow_esn, }; #endif -- cgit v1.2.3-58-ga151 From adfc2fdbae30d42edebad01d0ea1eed43036f1fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Jun 2021 15:51:59 +0200 Subject: xfrm: replay: avoid replay indirection Add and use xfrm_replay_check helper instead of indirection. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +--- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_replay.c | 27 ++++++++++++++++++--------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3a219b34cb8c..0206d80ec291 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -306,9 +306,6 @@ struct km_event { }; struct xfrm_replay { - int (*check)(struct xfrm_state *x, - struct sk_buff *skb, - __be32 net_seq); int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; @@ -1719,6 +1716,7 @@ static inline int xfrm_policy_id2dir(u32 index) #ifdef CONFIG_XFRM void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); +int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 8046ef1a6680..3df0861d4390 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -612,7 +612,7 @@ lock: goto drop_unlock; } - if (x->repl->check(x, skb, seq)) { + if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 59391dc80fa3..e8703aa8d06a 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -118,8 +118,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) return err; } -static int xfrm_replay_check(struct xfrm_state *x, - struct sk_buff *skb, __be32 net_seq) +static int xfrm_replay_check_legacy(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) { u32 diff; u32 seq = ntohl(net_seq); @@ -507,6 +507,21 @@ err: return -EINVAL; } +int xfrm_replay_check(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_check_bmp(x, skb, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_check_esn(x, skb, net_seq); + } + + return xfrm_replay_check_legacy(x, skb, net_seq); +} + static int xfrm_replay_recheck_esn(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { @@ -532,7 +547,7 @@ int xfrm_replay_recheck(struct xfrm_state *x, return xfrm_replay_recheck_esn(x, skb, net_seq); } - return xfrm_replay_check(x, skb, net_seq); + return xfrm_replay_check_legacy(x, skb, net_seq); } static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) @@ -723,32 +738,26 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } static const struct xfrm_replay xfrm_replay_legacy = { - .check = xfrm_replay_check, .overflow = xfrm_replay_overflow_offload, }; static const struct xfrm_replay xfrm_replay_bmp = { - .check = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_offload_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { - .check = xfrm_replay_check_esn, .overflow = xfrm_replay_overflow_offload_esn, }; #else static const struct xfrm_replay xfrm_replay_legacy = { - .check = xfrm_replay_check, .overflow = xfrm_replay_overflow, }; static const struct xfrm_replay xfrm_replay_bmp = { - .check = xfrm_replay_check_bmp, .overflow = xfrm_replay_overflow_bmp, }; static const struct xfrm_replay xfrm_replay_esn = { - .check = xfrm_replay_check_esn, .overflow = xfrm_replay_overflow_esn, }; #endif -- cgit v1.2.3-58-ga151 From b5a1d1fe0cbb9d20ba661134a09561af1dc9ebf5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Jun 2021 15:52:00 +0200 Subject: xfrm: replay: remove last replay indirection This replaces the overflow indirection with the new xfrm_replay_overflow helper. After this, the 'repl' pointer in xfrm_state is no longer needed and can be removed as well. xfrm_replay_overflow() is added in two incarnations, one is used when the kernel is compiled with xfrm hardware offload support enabled, the other when its disabled. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 8 +------- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_replay.c | 51 +++++++++++++++++++++++++------------------------- 3 files changed, 28 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0206d80ec291..d2a0559c255f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -221,9 +221,6 @@ struct xfrm_state { struct xfrm_replay_state preplay; struct xfrm_replay_state_esn *preplay_esn; - /* The functions for replay detection. */ - const struct xfrm_replay *repl; - /* replay detection mode */ enum xfrm_replay_mode repl_mode; /* internal flag that only holds state for delayed aevent at the @@ -305,10 +302,6 @@ struct km_event { struct net *net; }; -struct xfrm_replay { - int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); -}; - struct xfrm_if_cb { struct xfrm_if *(*decode_session)(struct sk_buff *skb, unsigned short family); @@ -1718,6 +1711,7 @@ static inline int xfrm_policy_id2dir(u32 index) void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_replay_notify(struct xfrm_state *x, int event); +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); static inline int xfrm_aevent_is_on(struct net *net) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 0b2975ef0668..527da58464f3 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -525,7 +525,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error; } - err = x->repl->overflow(x, skb); + err = xfrm_replay_overflow(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR); goto error; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e8703aa8d06a..9277d81b344c 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -95,7 +95,7 @@ void xfrm_replay_notify(struct xfrm_state *x, int event) x->xflags &= ~XFRM_TIME_DEFER; } -static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +static int __xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; struct net *net = xs_net(x); @@ -617,7 +617,7 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk __u32 oseq = x->replay.oseq; if (!xo) - return xfrm_replay_overflow(x, skb); + return __xfrm_replay_overflow(x, skb); if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { if (!skb_is_gso(skb)) { @@ -737,29 +737,33 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff return err; } -static const struct xfrm_replay xfrm_replay_legacy = { - .overflow = xfrm_replay_overflow_offload, -}; - -static const struct xfrm_replay xfrm_replay_bmp = { - .overflow = xfrm_replay_overflow_offload_bmp, -}; +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_overflow_offload_bmp(x, skb); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_overflow_offload_esn(x, skb); + } -static const struct xfrm_replay xfrm_replay_esn = { - .overflow = xfrm_replay_overflow_offload_esn, -}; + return xfrm_replay_overflow_offload(x, skb); +} #else -static const struct xfrm_replay xfrm_replay_legacy = { - .overflow = xfrm_replay_overflow, -}; - -static const struct xfrm_replay xfrm_replay_bmp = { - .overflow = xfrm_replay_overflow_bmp, -}; +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_overflow_bmp(x, skb); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_overflow_esn(x, skb); + } -static const struct xfrm_replay xfrm_replay_esn = { - .overflow = xfrm_replay_overflow_esn, -}; + return __xfrm_replay_overflow(x, skb); +} #endif int xfrm_init_replay(struct xfrm_state *x) @@ -774,14 +778,11 @@ int xfrm_init_replay(struct xfrm_state *x) if (x->props.flags & XFRM_STATE_ESN) { if (replay_esn->replay_window == 0) return -EINVAL; - x->repl = &xfrm_replay_esn; x->repl_mode = XFRM_REPLAY_MODE_ESN; } else { - x->repl = &xfrm_replay_bmp; x->repl_mode = XFRM_REPLAY_MODE_BMP; } } else { - x->repl = &xfrm_replay_legacy; x->repl_mode = XFRM_REPLAY_MODE_LEGACY; } -- cgit v1.2.3-58-ga151 From 9f2470fbc4cb4583c080bb729a998933ba61aca4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:35 -0700 Subject: skmsg: Improve udp_bpf_recvmsg() accuracy I tried to reuse sk_msg_wait_data() for different protocols, but it turns out it can not be simply reused. For example, UDP actually uses two queues to receive skb: udp_sk(sk)->reader_queue and sk->sk_receive_queue. So we have to check both of them to know whether we have received any packet. Also, UDP does not lock the sock during BH Rx path, it makes no sense for its ->recvmsg() to lock the sock. It is always possible for ->recvmsg() to be called before packets actually arrive in the receive queue, we just use best effort to make it accurate here. Fixes: 1f5be6b3b063 ("udp: Implement udp_bpf_recvmsg() for sockmap") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-2-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 23 ----------------------- net/ipv4/tcp_bpf.c | 24 +++++++++++++++++++++++- net/ipv4/udp_bpf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 65 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index aba0f0f429be..e3d080c299f6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -126,8 +126,6 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..f9a81b314e4c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,29 +399,6 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_wait_data); - /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..bb49b52d7be8 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } +static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -188,7 +210,7 @@ msg_bytes_ready: long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = tcp_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..565a70040c57 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); } +static bool udp_sk_has_data(struct sock *sk) +{ + return !skb_queue_empty(&udp_sk(sk)->reader_queue) || + !skb_queue_empty(&sk->sk_receive_queue); +} + +static bool psock_has_data(struct sk_psock *psock) +{ + return !skb_queue_empty(&psock->ingress_skb) || + !sk_psock_queue_empty(psock); +} + +#define udp_msg_has_data(__sk, __psock) \ + ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) + +static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = udp_msg_has_data(sk, psock); + if (!ret) { + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + ret = udp_msg_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); - if (sk_psock_queue_empty(psock)) { + if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } @@ -47,9 +85,9 @@ msg_bytes_ready: long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = udp_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { - if (!sk_psock_queue_empty(psock)) + if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; @@ -62,7 +100,6 @@ msg_bytes_ready: } ret = copied; out: - release_sock(sk); sk_psock_put(sk, psock); return ret; } -- cgit v1.2.3-58-ga151 From e00a5c331bf57f41fcfdc5da4f5caeafe5e54c1d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:37 -0700 Subject: udp: Fix a memory leak in udp_read_sock() sk_psock_verdict_recv() clones the skb and uses the clone afterward, so udp_read_sock() should free the skb after using it, regardless of error or not. This fixes a real kmemleak. Fixes: d7f571188ecf ("udp: Implement ->read_sock() for sockmap") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-4-xiyou.wangcong@gmail.com --- net/ipv4/udp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307ad0d3b9e..8091276cb85b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1798,11 +1798,13 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, if (used <= 0) { if (!copied) copied = used; + kfree_skb(skb); break; } else if (used <= skb->len) { copied += used; } + kfree_skb(skb); if (!desc->count) break; } -- cgit v1.2.3-58-ga151 From 30b9c54a707db4155735cf71f4600241c1b7b6ff Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:38 -0700 Subject: skmsg: Clear skb redirect pointer before dropping it When we drop skb inside sk_psock_skb_redirect(), we have to clear its skb->_sk_redir pointer too, otherwise kfree_skb() would misinterpret it as a valid skb->_skb_refdst and dst_release() would eventually complain. Fixes: e3526bb92a20 ("skmsg: Move sk_redir from TCP_SKB_CB to skb") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f9a81b314e4c..4334720e2a04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -843,12 +843,14 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } -- cgit v1.2.3-58-ga151 From 0cf6672b23c8aa9d9274798dd63cbf6ede77ef90 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:39 -0700 Subject: skmsg: Fix a memory leak in sk_psock_verdict_apply() If the dest psock does not set SK_PSOCK_TX_ENABLED, the skb can't be queued anywhere so must be dropped. This one is found during code review. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-6-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4334720e2a04..5464477e2d3d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -924,8 +924,13 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); + err = 0; } spin_unlock_bh(&psock->ingress_lock); + if (err < 0) { + skb_bpf_redirect_clear(skb); + goto out_free; + } } break; case __SK_REDIRECT: -- cgit v1.2.3-58-ga151 From 1581a6c1c3291a8320b080f4411345f60229976d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:40 -0700 Subject: skmsg: Teach sk_psock_verdict_apply() to return errors Currently sk_psock_verdict_apply() is void, but it handles some error conditions too. Its caller is impossible to learn whether it succeeds or fails, especially sk_psock_verdict_recv(). Make it return int to indicate error cases and propagate errors to callers properly. Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 5464477e2d3d..e3d210811db4 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static void sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -835,7 +835,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) */ if (unlikely(!sk_other)) { kfree_skb(skb); - return; + return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another @@ -845,19 +845,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); spin_unlock_bh(&psock_other->ingress_lock); + return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -894,14 +895,15 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) { struct sock *sk_other; - int err = -EIO; + int err = 0; switch (verdict) { case __SK_PASS: + err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { @@ -934,13 +936,15 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } break; case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(skb); break; case __SK_DROP: default: out_free: kfree_skb(skb); } + + return err; } static void sk_psock_write_space(struct sock *sk) @@ -1107,7 +1111,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_verdict_apply(psock, skb, ret); + if (sk_psock_verdict_apply(psock, skb, ret) < 0) + len = 0; out: rcu_read_unlock(); return len; -- cgit v1.2.3-58-ga151 From 42830571f1fd9751b3fbf38084bbb253320e185f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:41 -0700 Subject: skmsg: Pass source psock to sk_psock_skb_redirect() sk_psock_skb_redirect() only takes skb as a parameter, we will need to know where this skb is from, so just pass the source psock to this function as a new parameter. This patch prepares for the next one. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-8-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e3d210811db4..3aa9065811ad 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -861,11 +861,12 @@ static int sk_psock_skb_redirect(struct sk_buff *skb) return 0; } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: @@ -889,7 +890,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, psock->sk, ret); + sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } @@ -936,7 +937,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, } break; case __SK_REDIRECT: - err = sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: -- cgit v1.2.3-58-ga151 From 781dd0431eb549f9cb1fdddf91a50d985febe884 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:42 -0700 Subject: skmsg: Increase sk->sk_drops when dropping packets It is hard to observe packet drops without increasing relevant drop counters, here we should increase sk->sk_drops which is a protocol-independent counter. Fortunately psock is always associated with a struct sock, we can just use psock->sk. Suggested-by: John Fastabend Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3aa9065811ad..9b6160a191f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -578,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } +static void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + static void sk_psock_backlog(struct work_struct *work) { struct sk_psock *psock = container_of(work, struct sk_psock, work); @@ -617,7 +623,7 @@ start: /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - kfree_skb(skb); + sock_drop(psock->sk, skb); goto end; } off += ret; @@ -708,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } @@ -834,7 +840,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); @@ -844,14 +850,14 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } @@ -942,7 +948,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, case __SK_DROP: default: out_free: - kfree_skb(skb); + sock_drop(psock->sk, skb); } return err; @@ -977,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); @@ -1098,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); -- cgit v1.2.3-58-ga151 From 2b4cd14fd995e0a863b2ced4cba0bcd804d89ebc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Jun 2021 09:38:17 +0200 Subject: net/netif_receive_skb_core: Use migrate_disable() The preempt disable around do_xdp_generic() has been introduced in commit bbbe211c295ff ("net: rcu lock and preempt disable missing around generic xdp") For BPF it is enough to use migrate_disable() and the code was updated as it can be seen in commit 3c58482a382ba ("bpf: Provide bpf_prog_run_pin_on_cpu() helper") This is a leftover which was not converted. Use migrate_disable() before invoking do_xdp_generic(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ef8cf7619baf..439faadab0c2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5277,9 +5277,9 @@ another_round: if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; - preempt_disable(); + migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - preempt_enable(); + migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; -- cgit v1.2.3-58-ga151 From 1a1100d53f12451d50bc5ebbc941517760912ab8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 19 Jun 2021 16:50:21 +0300 Subject: net/smc: Fix ENODATA tests in smc_nl_get_fback_stats() These functions return negative ENODATA but the minus sign was left out in the tests. Fixes: f0dd7bf5e330 ("net/smc: Add netlink support for SMC fallback statistics") Signed-off-by: Dan Carpenter Acked-by: Guvenc Gulce Signed-off-by: David S. Miller --- net/smc/smc_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index 614013e3b574..e80e34f7ac15 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -393,17 +393,17 @@ int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) continue; if (!skip_serv) { rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); - if (rc_srv && rc_srv != ENODATA) + if (rc_srv && rc_srv != -ENODATA) break; } else { skip_serv = 0; } rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); - if (rc_clnt && rc_clnt != ENODATA) { + if (rc_clnt && rc_clnt != -ENODATA) { skip_serv = 1; break; } - if (rc_clnt == ENODATA && rc_srv == ENODATA) + if (rc_clnt == -ENODATA && rc_srv == -ENODATA) break; } mutex_unlock(&net->smc.mutex_fback_rsn); -- cgit v1.2.3-58-ga151 From 185ab886d3fb283e837283c343bf539c371e26cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:26 +0000 Subject: af_unix: take address assignment/hash insertion into a new helper Duplicated logics in all bind variants (autobind, bind-to-path, bind-to-abstract) gets taken into a common helper. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c9dfec7b71e7..91447624a364 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -262,6 +262,14 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) sk_add_node(sk, list); } +static void __unix_set_addr(struct sock *sk, struct unix_address *addr, + unsigned hash) +{ + __unix_remove_socket(sk); + smp_store_release(&unix_sk(sk)->addr, addr); + __unix_insert_socket(&unix_socket_table[hash], sk); +} + static inline void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); @@ -913,9 +921,7 @@ retry: } addr->hash ^= sk->sk_type; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(&unix_socket_table[addr->hash], sk); + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); err = 0; @@ -1018,7 +1024,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct hlist_head *list; struct path path = { }; err = -EINVAL; @@ -1070,25 +1075,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; - list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); unix_release_addr(addr); - goto out_unlock; + goto out_up; } - - list = &unix_socket_table[addr->hash]; + hash = addr->hash; } err = 0; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(list, sk); - -out_unlock: + __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->bindlock); -- cgit v1.2.3-58-ga151 From c34d4582518ff83a4848c2d33a46be82e2499a5b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:27 +0000 Subject: unix_bind(): allocate addr earlier makes it easier to massage; we do pay for that by extra work (kmalloc+memcpy+kfree) in some error cases, but those are not on the hot paths anyway. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 91447624a364..a984cf3d946d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1040,6 +1040,15 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err < 0) goto out; addr_len = err; + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash ^ sk->sk_type; + refcount_set(&addr->refcnt, 1); if (sun_path[0]) { umode_t mode = S_IFSOCK | @@ -1048,7 +1057,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) { if (err == -EEXIST) err = -EADDRINUSE; - goto out; + goto out_addr; } } @@ -1060,16 +1069,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (u->addr) goto out_up; - err = -ENOMEM; - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - goto out_up; - - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; - refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1081,20 +1080,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - unix_release_addr(addr); goto out_up; } hash = addr->hash; } - err = 0; __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); + addr = NULL; + err = 0; out_up: mutex_unlock(&u->bindlock); out_put: if (err) path_put(&path); +out_addr: + if (addr) + unix_release_addr(addr); out: return err; } -- cgit v1.2.3-58-ga151 From aee515170576609a0aa3413dc06a7f36f05a5fe2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:28 +0000 Subject: unix_bind(): separate BSD and abstract cases We do get some duplication that way, but it's minor compared to parts that are different. What we get is an ability to change locking in BSD case without making failure exits very hard to follow. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 55 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a984cf3d946d..84ddfb25bc64 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1024,7 +1024,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct path path = { }; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || @@ -1051,6 +1050,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) refcount_set(&addr->refcnt, 1); if (sun_path[0]) { + struct path path = { }; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); err = unix_mknod(sun_path, mode, &path); @@ -1059,41 +1059,54 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EADDRINUSE; goto out_addr; } - } - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_put; + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + goto out_addr; + } - err = -EINVAL; - if (u->addr) - goto out_up; + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + goto out_addr; + } - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } else { + err = mutex_lock_interruptible(&u->bindlock); + if (err) + goto out_addr; + + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + goto out_addr; + } + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - goto out_up; + mutex_unlock(&u->bindlock); + goto out_addr; } - hash = addr->hash; + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } - - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - addr = NULL; - err = 0; -out_up: - mutex_unlock(&u->bindlock); -out_put: - if (err) - path_put(&path); out_addr: if (addr) unix_release_addr(addr); -- cgit v1.2.3-58-ga151 From fa42d910a38ee310d5c6826563dd58a08735d5b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:29 +0000 Subject: unix_bind(): take BSD and abstract address cases into new helpers unix_bind_bsd() and unix_bind_abstract() respectively. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 147 +++++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 84ddfb25bc64..d48ebfb182c7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1014,104 +1014,105 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) return err; } +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + struct path path = { }; + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int hash; + int err; + + err = unix_mknod(addr->name->sun_path, mode, &path); + if (err) + return err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + return err; + } + + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + return -EINVAL; + } + + addr->hash = UNIX_HASH_SIZE; + hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + spin_lock(&unix_table_lock); + u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + +static int unix_bind_abstract(struct sock *sk, unsigned hash, + struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + int err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) + return err; + + if (u->addr) { + mutex_unlock(&u->bindlock); + return -EINVAL; + } + + spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return -EADDRINUSE; + } + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; int err; unsigned int hash; struct unix_address *addr; - err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || sunaddr->sun_family != AF_UNIX) - goto out; + return -EINVAL; - if (addr_len == sizeof(short)) { - err = unix_autobind(sock); - goto out; - } + if (addr_len == sizeof(short)) + return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) - goto out; + return err; addr_len = err; - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - goto out; + return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - goto out_addr; - } - - err = mutex_lock_interruptible(&u->bindlock); - if (err) { - path_put(&path); - goto out_addr; - } - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - path_put(&path); - goto out_addr; - } - - addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); - spin_lock(&unix_table_lock); - u->path = path; - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } else { - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_addr; - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - goto out_addr; - } - - spin_lock(&unix_table_lock); - err = -EADDRINUSE; - if (__unix_find_socket_byname(net, sunaddr, addr_len, - sk->sk_type, hash)) { - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - goto out_addr; - } - __unix_set_addr(sk, addr, addr->hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } -out_addr: - if (addr) + if (sun_path[0]) + err = unix_bind_bsd(sk, addr); + else + err = unix_bind_abstract(sk, hash, addr); + if (err) unix_release_addr(addr); -out: - return err; + return err == -EEXIST ? -EADDRINUSE : err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) -- cgit v1.2.3-58-ga151 From 71e6be6f7d2bada7099d79205779c4452d4fd35b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:30 +0000 Subject: fold unix_mknod() into unix_bind_bsd() Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d48ebfb182c7..fe337dc98400 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -984,46 +984,38 @@ fail: return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { + struct unix_sock *u = unix_sk(sk); + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + struct path parent, path; + struct user_namespace *ns; // barf... struct dentry *dentry; - struct path path; - int err = 0; + unsigned int hash; + int err; + /* * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); + dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) - return err; + return PTR_ERR(dentry); + ns = mnt_user_ns(parent.mnt); /* * All right, let's create it. */ - err = security_path_mknod(&path, dentry, mode, 0); + err = security_path_mknod(&parent, dentry, mode, 0); if (!err) { - err = vfs_mknod(mnt_user_ns(path.mnt), d_inode(path.dentry), - dentry, mode, 0); + err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path.mnt); - res->dentry = dget(dentry); + path.mnt = mntget(parent.mnt); + path.dentry = dget(dentry); } } - done_path_create(&path, dentry); - return err; -} - -static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) -{ - struct unix_sock *u = unix_sk(sk); - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int hash; - int err; - - err = unix_mknod(addr->name->sun_path, mode, &path); + done_path_create(&parent, dentry); if (err) return err; -- cgit v1.2.3-58-ga151 From 56c1731b280dc71febf5df80fcac1923ea973ab8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:31 +0000 Subject: unix_bind_bsd(): move done_path_create() call after dealing with ->bindlock Final preparations for doing unlink on failure past the successful mknod. We can't hold ->bindlock over ->mknod() or ->unlink(), since either might do sb_start_write() (e.g. on overlayfs). However, we can do it while holding filesystem and VFS locks - doing kern_path_create() vfs_mknod() grab ->bindlock if u->addr had been set drop ->bindlock done_path_create return -EINVAL else assign the address to socket drop ->bindlock done_path_create return 0 would be deadlock-free. Here we massage unix_bind_bsd() to that form. We are still doing equivalent transformations. Next commit will *not* be an equivalent transformation - it will add a call of vfs_unlink() before done_path_create() in "alread bound" case. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fe337dc98400..25dda9ca9d15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -989,8 +989,8 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - struct path parent, path; struct user_namespace *ns; // barf... + struct path parent; struct dentry *dentry; unsigned int hash; int err; @@ -1008,36 +1008,32 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) * All right, let's create it. */ err = security_path_mknod(&parent, dentry, mode, 0); - if (!err) { + if (!err) err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); - if (!err) { - path.mnt = mntget(parent.mnt); - path.dentry = dget(dentry); - } - } - done_path_create(&parent, dentry); - if (err) + if (err) { + done_path_create(&parent, dentry); return err; - + } err = mutex_lock_interruptible(&u->bindlock); if (err) { - path_put(&path); + done_path_create(&parent, dentry); return err; } - if (u->addr) { mutex_unlock(&u->bindlock); - path_put(&path); + done_path_create(&parent, dentry); return -EINVAL; } addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = path; + u->path.mnt = mntget(parent.mnt); + u->path.dentry = dget(dentry); __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); + done_path_create(&parent, dentry); return 0; } -- cgit v1.2.3-58-ga151 From c0c3b8d380a8f54c75786d41f6f9efbe761dac6c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:32 +0000 Subject: unix_bind_bsd(): unlink if we fail after successful mknod We can do that more or less safely, since the parent is held locked all along. Yes, somebody might observe the object via dcache, only to have it disappear afterwards, but there's really no good way to prevent that. It won't race with other bind(2) or attempts to move the sucker elsewhere, or put something else in its place - locked parent prevents that. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 25dda9ca9d15..42a9e0730344 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1010,20 +1010,13 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); - if (err) { - done_path_create(&parent, dentry); - return err; - } + if (err) + goto out; err = mutex_lock_interruptible(&u->bindlock); - if (err) { - done_path_create(&parent, dentry); - return err; - } - if (u->addr) { - mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); - return -EINVAL; - } + if (err) + goto out_unlink; + if (u->addr) + goto out_unlock; addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1035,6 +1028,16 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; + +out_unlock: + mutex_unlock(&u->bindlock); + err = -EINVAL; +out_unlink: + /* failed after successful mknod? unlink what we'd created... */ + vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); +out: + done_path_create(&parent, dentry); + return err; } static int unix_bind_abstract(struct sock *sk, unsigned hash, -- cgit v1.2.3-58-ga151 From be752283a2a2b4bfc2df512b5d9b03a34aece252 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:33 +0000 Subject: __unix_find_socket_byname(): don't pass hash and type separately We only care about exclusive or of those, so pass that directly. Makes life simpler for callers as well... Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 42a9e0730344..58c2f318b0a8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -286,11 +286,11 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, unsigned int hash) + int len, unsigned int hash) { struct sock *s; - sk_for_each(s, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -305,13 +305,12 @@ static struct sock *__unix_find_socket_byname(struct net *net, static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, - unsigned int hash) + int len, unsigned int hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(net, sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); @@ -899,12 +898,12 @@ static int unix_autobind(struct socket *sock) retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, - addr->hash)) { + if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -919,7 +918,6 @@ retry: } goto retry; } - addr->hash ^= sk->sk_type; __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); @@ -966,7 +964,7 @@ static struct sock *unix_find_other(struct net *net, } } else { err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type, hash); + u = unix_find_socket_byname(net, sunname, len, type ^ hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->path.dentry; @@ -1040,8 +1038,7 @@ out: return err; } -static int unix_bind_abstract(struct sock *sk, unsigned hash, - struct unix_address *addr) +static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); int err; @@ -1057,7 +1054,7 @@ static int unix_bind_abstract(struct sock *sk, unsigned hash, spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - sk->sk_type, hash)) { + addr->hash)) { spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return -EADDRINUSE; @@ -1100,7 +1097,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (sun_path[0]) err = unix_bind_bsd(sk, addr); else - err = unix_bind_abstract(sk, hash, addr); + err = unix_bind_abstract(sk, addr); if (err) unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; -- cgit v1.2.3-58-ga151 From d452d48b9f8b1a7f8152d33ef52cfd7fe1735b0a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Jun 2021 13:34:06 -0700 Subject: tls: prevent oversized sendfile() hangs by ignoring MSG_MORE We got multiple reports that multi_chunk_sendfile test case from tls selftest fails. This was sort of expected, as the original fix was never applied (see it in the first Link:). The test in question uses sendfile() with count larger than the size of the underlying file. This will make splice set MSG_MORE on all sendpage calls, meaning TLS will never close and flush the last partial record. Eric seem to have addressed a similar problem in commit 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push() once") by introducing MSG_SENDPAGE_NOTLAST. Unlike MSG_MORE MSG_SENDPAGE_NOTLAST is not set on the last call of a "pipefull" of data (PIPE_DEF_BUFFERS == 16, so every 16 pages or whenever we run out of data). Having a break every 16 pages should be fine, TLS can pack exactly 4 pages into a record, so for aligned reads there should be no difference, unaligned may see one extra record per sendpage(). Sticking to TCP semantics seems preferable to modifying splice, but we can revisit it if real life scenarios show a regression. Reported-by: Vadim Fedorenko Reported-by: Seth Forshee Link: https://lore.kernel.org/netdev/1591392508-14592-1-git-send-email-pooja.trivedi@stackpath.com/ Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Jakub Kicinski Tested-by: Seth Forshee Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 694de024d0ee..74e5701034aa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1153,7 +1153,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); + eor = !(flags & MSG_SENDPAGE_NOTLAST); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Call the sk_stream functions to manage the sndbuf mem. */ -- cgit v1.2.3-58-ga151 From 8674f8d310215d2bb7469b80e6cfccc044f717b3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:14 +0300 Subject: net: dsa: assert uniqueness of dsa,member properties The cross-chip notifiers work by comparing each ds->index against the info->sw_index value from the notifier. The ds->index is retrieved from the device tree dsa,member property. If a single tree cross-chip topology does not declare unique switch IDs, this will result in hard-to-debug issues/voodoo effects such as the cross-chip notifier for one switch port also matching the port with the same number from another switch. Check in dsa_switch_parse_member_of() whether the DSA switch tree contains a DSA switch with the index we're preparing to add, before actually adding it. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e87909f0e..ba244fbd9646 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1259,6 +1259,13 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, if (!ds->dst) return -ENOMEM; + if (dsa_switch_find(ds->dst->index, ds->index)) { + dev_err(ds->dev, + "A DSA switch with index %d already exists in tree %d\n", + ds->index, ds->dst->index); + return -EEXIST; + } + return 0; } -- cgit v1.2.3-58-ga151 From a8986681ccada614a30df7248390780e7708a763 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:15 +0300 Subject: net: dsa: export the dsa_port_is_{user,cpu,dsa} helpers The difference between dsa_is_user_port and dsa_port_is_user is that the former needs to look up the list of ports of the DSA switch tree in order to find the struct dsa_port, while the latter directly receives it as an argument. dsa_is_user_port is already in widespread use and has its place, so there isn't any chance of converting all callers to a single form. But being able to do: dsa_port_is_user(dp) instead of dsa_is_user_port(dp->ds, dp->index) is much more efficient too, especially when the "dp" comes from an iterator over the DSA switch tree - this reduces the complexity from quadratic to linear. Move these helpers from dsa2.c to include/net/dsa.h so that others can use them too. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 15 +++++++++++++++ net/dsa/dsa2.c | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 289d68e82da0..ea47783d5695 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -409,6 +409,21 @@ static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) return NULL; } +static inline bool dsa_port_is_dsa(struct dsa_port *port) +{ + return port->type == DSA_PORT_TYPE_DSA; +} + +static inline bool dsa_port_is_cpu(struct dsa_port *port) +{ + return port->type == DSA_PORT_TYPE_CPU; +} + +static inline bool dsa_port_is_user(struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_USER; +} + static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index ba244fbd9646..9000a8c84baf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -219,21 +219,6 @@ static void dsa_tree_put(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_tree_release); } -static bool dsa_port_is_dsa(struct dsa_port *port) -{ - return port->type == DSA_PORT_TYPE_DSA; -} - -static bool dsa_port_is_cpu(struct dsa_port *port) -{ - return port->type == DSA_PORT_TYPE_CPU; -} - -static bool dsa_port_is_user(struct dsa_port *dp) -{ - return dp->type == DSA_PORT_TYPE_USER; -} - static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { -- cgit v1.2.3-58-ga151 From abd49535c3801f33c3ca42d81271d7e535adce81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:16 +0300 Subject: net: dsa: execute dsa_switch_mdb_add only for routing port in cross-chip topologies Currently, the notifier for adding a multicast MAC address matches on the targeted port and on all DSA links in the system, be they upstream or downstream links. This leads to a considerable amount of useless traffic. Consider this daisy chain topology, and a MDB add notifier emitted on sw0p0. It matches on sw0p0, sw0p3, sw1p3 and sw2p4. sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] [ x ] [ ] [ ] [ x ] [ ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ x ] [ x ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] But switch 0 has no reason to send the multicast traffic for that MAC address on sw0p3, which is how it reaches switches 1 and 2. Those switches don't expect, according to the user configuration, to receive this multicast address from switch 1, and they will drop it anyway, because the only valid destination is the port they received it on. They only need to configure themselves to deliver that multicast address _towards_ switch 1, where the MDB entry is installed. Similarly, switch 1 should not send this multicast traffic towards sw1p3, because that is how it reaches switch 2. With this change, the heat map for this MDB notifier changes as follows: sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] [ x ] [ ] [ ] [ ] [ ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] Now the mdb notifier behaves the same as the fdb notifier. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/switch.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 9bf8e20ecdf3..8b601ced6b45 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -232,36 +232,15 @@ static int dsa_switch_lag_leave(struct dsa_switch *ds, return 0; } -static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mdb_info *info) -{ - if (ds->index == info->sw_index && port == info->port) - return true; - - if (dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { - int err = 0; - int port; + int port = dsa_towards_port(ds, info->sw_index, info->port); if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mdb_match(ds, port, info)) { - err = ds->ops->port_mdb_add(ds, port, info->mdb); - if (err) - break; - } - } - - return err; + return ds->ops->port_mdb_add(ds, port, info->mdb); } static int dsa_switch_mdb_del(struct dsa_switch *ds, -- cgit v1.2.3-58-ga151 From 4e4ab7950044d195f6e3d4dac328f506badb6efa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:17 +0300 Subject: net: dsa: calculate the largest_mtu across all ports in the tree If we have a cross-chip topology like this: sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ cpu ] [ user ] [ user ] [ dsa ] [ user ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] and we issue the following commands: 1. ip link set sw0p1 mtu 1700 2. ip link set sw1p1 mtu 1600 we notice the following happening: Command 1. emits a non-targeted MTU notifier for the CPU port (sw0p0) with the largest_mtu calculated across switch 0, of 1700. This matches sw0p0, sw0p3 and sw1p4 (all CPU ports and DSA links). Then, it emits a targeted MTU notifier for the user port (sw0p1), again with MTU 1700 (this doesn't matter). Command 2. emits a non-targeted MTU notifier for the CPU port (sw0p0) with the largest_mtu calculated across switch 1, of 1600. This matches the same group of ports as above, and decreases the MTU for the CPU port and the DSA links from 1700 to 1600. As a result, the sw0p1 user port can no longer communicate with its CPU port at MTU 1700. To address this, we should calculate the largest_mtu across all switches that may share a CPU port, and only emit MTU notifiers with that value. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 798944aa847a..ac2ca5f75af3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1528,6 +1528,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp_iter; struct dsa_port *cpu_dp; int port = p->dp->index; int largest_mtu = 0; @@ -1535,31 +1536,31 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) int old_master_mtu; int mtu_limit; int cpu_mtu; - int err, i; + int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; - for (i = 0; i < ds->num_ports; i++) { + list_for_each_entry(dp_iter, &ds->dst->ports, list) { int slave_mtu; - if (!dsa_is_user_port(ds, i)) + if (!dsa_port_is_user(dp_iter)) continue; /* During probe, this function will be called for each slave * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ - if (!dsa_to_port(ds, i)->slave) + if (!dp_iter->slave) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ - if (i == port) + if (dp_iter == dp) slave_mtu = new_mtu; else - slave_mtu = dsa_to_port(ds, i)->slave->mtu; + slave_mtu = dp_iter->slave->mtu; if (largest_mtu < slave_mtu) largest_mtu = slave_mtu; -- cgit v1.2.3-58-ga151 From 88faba20e2100c1f367133af56612742ad37df08 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:18 +0300 Subject: net: dsa: targeted MTU notifiers should only match on one port dsa_slave_change_mtu() calls dsa_port_mtu_change() twice: - it sends a cross-chip notifier with the MTU of the CPU port which is used to update the DSA links. - it sends one targeted MTU notifier which is supposed to only match the user port on which we are changing the MTU. The "propagate_upstream" variable is used here to bypass the cross-chip notifier system from switch.c But due to a mistake, the second, targeted notifier matches not only on the user port, but also on the DSA link which is a member of the same switch, if that exists. And because the DSA links of the entire dst were programmed in a previous round to the largest_mtu via a "propagate_upstream == true" notification, then the dsa_port_mtu_change(propagate_upstream == false) call that is immediately upcoming will break the MTU on the one DSA link which is chip-wise local to the dp whose MTU is changing right now. Example given this daisy chain topology: sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ cpu ] [ user ] [ user ] [ dsa ] [ user ] [ x ] [ ] [ ] [ x ] [ ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] ip link set sw0p1 mtu 9000 ip link set sw1p1 mtu 9000 # at this stage, sw0p1 and sw1p1 can talk # to one another using jumbo frames ip link set sw0p2 mtu 1500 # this programs the sw0p3 DSA link first to # the largest_mtu of 9000, then reprograms it to # 1500 with the "propagate_upstream == false" # notifier, breaking communication between # sw0p1 and sw1p1 To escape from this situation, make the targeted match really match on a single port - the user port, and rename the "propagate_upstream" variable to "targeted_match" to clarify the intention and avoid future issues. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++-- net/dsa/port.c | 4 ++-- net/dsa/slave.c | 9 +++++---- net/dsa/switch.c | 9 ++++++--- 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b8b17474b72b..b0811253d101 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -84,7 +84,7 @@ struct dsa_notifier_vlan_info { /* DSA_NOTIFIER_MTU */ struct dsa_notifier_mtu_info { - bool propagate_upstream; + bool targeted_match; int sw_index; int port; int mtu; @@ -200,7 +200,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, - bool propagate_upstream); + bool targeted_match); int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, diff --git a/net/dsa/port.c b/net/dsa/port.c index 6379d66a6bb3..5c93f1e1a03d 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -567,11 +567,11 @@ int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, } int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, - bool propagate_upstream) + bool targeted_match) { struct dsa_notifier_mtu_info info = { .sw_index = dp->ds->index, - .propagate_upstream = propagate_upstream, + .targeted_match = targeted_match, .port = dp->index, .mtu = new_mtu, }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ac2ca5f75af3..5e668e529575 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1586,14 +1586,15 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) goto out_master_failed; /* We only need to propagate the MTU of the CPU port to - * upstream switches. + * upstream switches, so create a non-targeted notifier which + * updates all switches. */ - err = dsa_port_mtu_change(cpu_dp, cpu_mtu, true); + err = dsa_port_mtu_change(cpu_dp, cpu_mtu, false); if (err) goto out_cpu_failed; } - err = dsa_port_mtu_change(dp, new_mtu, false); + err = dsa_port_mtu_change(dp, new_mtu, true); if (err) goto out_port_failed; @@ -1607,7 +1608,7 @@ out_port_failed: if (new_master_mtu != old_master_mtu) dsa_port_mtu_change(cpu_dp, old_master_mtu - dsa_tag_protocol_overhead(cpu_dp->tag_ops), - true); + false); out_cpu_failed: if (new_master_mtu != old_master_mtu) dev_set_mtu(master, old_master_mtu); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 8b601ced6b45..75f567390a6b 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -52,10 +52,13 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, struct dsa_notifier_mtu_info *info) { - if (ds->index == info->sw_index) - return (port == info->port) || dsa_is_dsa_port(ds, port); + if (ds->index == info->sw_index && port == info->port) + return true; - if (!info->propagate_upstream) + /* Do not propagate to other switches in the tree if the notifier was + * targeted for a single switch. + */ + if (info->targeted_match) return false; if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) -- cgit v1.2.3-58-ga151 From f9bcdc362c7776b875c0f390e982cbac597d660f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Jun 2021 19:42:19 +0300 Subject: net: dsa: remove cross-chip support from the MRP notifiers With MRP hardware assist being supported only by the ocelot switch family, which by design does not support cross-chip bridging, the current match functions are at best a guess and have not been confirmed in any way to do anything relevant in a multi-switch topology. Drop the code and make the notifiers match only on the targeted switch port. Cc: Horatiu Vultur Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/switch.c | 55 +++++++------------------------------------------------ 1 file changed, 7 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 75f567390a6b..c1e5afafe633 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -346,36 +346,16 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, return 0; } -static bool dsa_switch_mrp_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mrp_info *info) -{ - if (ds->index == info->sw_index && port == info->port) - return true; - - if (dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_mrp_add(struct dsa_switch *ds, struct dsa_notifier_mrp_info *info) { - int err = 0; - int port; - if (!ds->ops->port_mrp_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mrp_match(ds, port, info)) { - err = ds->ops->port_mrp_add(ds, port, info->mrp); - if (err) - break; - } - } + if (ds->index == info->sw_index) + return ds->ops->port_mrp_add(ds, info->port, info->mrp); - return err; + return 0; } static int dsa_switch_mrp_del(struct dsa_switch *ds, @@ -390,39 +370,18 @@ static int dsa_switch_mrp_del(struct dsa_switch *ds, return 0; } -static bool -dsa_switch_mrp_ring_role_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mrp_ring_role_info *info) -{ - if (ds->index == info->sw_index && port == info->port) - return true; - - if (dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_mrp_add_ring_role(struct dsa_switch *ds, struct dsa_notifier_mrp_ring_role_info *info) { - int err = 0; - int port; - if (!ds->ops->port_mrp_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mrp_ring_role_match(ds, port, info)) { - err = ds->ops->port_mrp_add_ring_role(ds, port, - info->mrp); - if (err) - break; - } - } + if (ds->index == info->sw_index) + return ds->ops->port_mrp_add_ring_role(ds, info->port, + info->mrp); - return err; + return 0; } static int -- cgit v1.2.3-58-ga151 From 24610ed80df65a564d6165d15505a950d05f9f5a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 19 Jun 2021 16:55:46 +0300 Subject: netfilter: nfnetlink_hook: fix check for snprintf() overflow The kernel version of snprintf() can't return negatives. The "ret > (int)sizeof(sym)" check is off by one because and it should be >=. Finally, we need to set a negative error code. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Dan Carpenter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_hook.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 58fda6ac663b..50b4e3c9347a 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -126,8 +126,10 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, #ifdef CONFIG_KALLSYMS ret = snprintf(sym, sizeof(sym), "%ps", ops->hook); - if (ret < 0 || ret > (int)sizeof(sym)) + if (ret >= sizeof(sym)) { + ret = -EINVAL; goto nla_put_failure; + } module_name = strstr(sym, " ["); if (module_name) { -- cgit v1.2.3-58-ga151 From 3c5e44622011b9ea21bd425875dcccfc9a158f5f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 19 Jun 2021 00:55:20 +0200 Subject: netfilter: nf_tables: memleak in hw offload abort path Release flow from the abort path, this is easy to reproduce since b72920f6e4a9 ("netfilter: nftables: counter hardware offload support"). If the preparation phase fails, then the abort path is exercised without releasing the flow rule object. unreferenced object 0xffff8881f0fa7700 (size 128): comm "nft", pid 1335, jiffies 4294931120 (age 4163.740s) hex dump (first 32 bytes): 08 e4 de 13 82 88 ff ff 98 e4 de 13 82 88 ff ff ................ 48 e4 de 13 82 88 ff ff 01 00 00 00 00 00 00 00 H............... backtrace: [<00000000634547e7>] flow_rule_alloc+0x26/0x80 [<00000000c8426156>] nft_flow_rule_create+0xc9/0x3f0 [nf_tables] [<0000000075ff8e46>] nf_tables_newrule+0xc79/0x10a0 [nf_tables] [<00000000ba65e40e>] nfnetlink_rcv_batch+0xaac/0xf90 [nfnetlink] [<00000000505c614a>] nfnetlink_rcv+0x1bb/0x1f0 [nfnetlink] [<00000000eb78e1fe>] netlink_unicast+0x34b/0x480 [<00000000a8f72c94>] netlink_sendmsg+0x3af/0x690 [<000000009cb1ddf4>] sock_sendmsg+0x96/0xa0 [<0000000039d06e44>] ____sys_sendmsg+0x3fe/0x440 [<00000000137e82ca>] ___sys_sendmsg+0xd8/0x140 [<000000000c6bf6a6>] __sys_sendmsg+0xb3/0x130 [<0000000043bd6268>] do_syscall_64+0x40/0xb0 [<00000000afdebc2d>] entry_SYSCALL_64_after_hwframe+0x44/0xae Remove flow rule release from the offload commit path, otherwise error from the offload commit phase might trigger a double-free due to the execution of the abort_offload -> abort. After this patch, the abort path takes care of releasing the flow rule. This fix also needs to move the nft_flow_rule_create() call before the transaction object is added otherwise the abort path might find a NULL pointer to the flow rule object for the NFT_CHAIN_HW_OFFLOAD case. While at it, rename BASIC-like goto tags to slightly more meaningful names rather than adding a new "err3" tag. Fixes: 63b48c73ff56 ("netfilter: nf_tables_offload: undo updates if transaction fails") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 51 ++++++++++++++++++++++++--------------- net/netfilter/nf_tables_offload.c | 17 ------------- 2 files changed, 31 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf4d6ec9fc55..ca9ec8721e6c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3243,9 +3243,9 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; + struct nft_flow_rule *flow = NULL; int family = nfmsg->nfgen_family; struct net *net = info->net; - struct nft_flow_rule *flow; struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; @@ -3340,13 +3340,13 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) - goto err1; + goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) - goto err1; + goto err_release_expr; err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) { NL_SET_BAD_ATTR(extack, tmp); - goto err1; + goto err_release_expr; } size += expr_info[n].ops->size; n++; @@ -3355,7 +3355,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) - goto err1; + goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); @@ -3366,7 +3366,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = -ENOMEM; rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) - goto err1; + goto err_release_expr; nft_activate_next(net, rule); @@ -3385,7 +3385,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { NL_SET_BAD_ATTR(extack, expr_info[i].attr); - goto err2; + goto err_release_rule; } if (expr_info[i].ops->validate) @@ -3395,16 +3395,24 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, expr = nft_expr_next(expr); } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(net, rule); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto err_release_rule; + } + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); @@ -3412,7 +3420,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } if (info->nlh->nlmsg_flags & NLM_F_APPEND) { @@ -3430,21 +3438,19 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, kvfree(expr_info); chain->use++; + if (flow) + nft_trans_flow_rule(trans) = flow; + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(net, rule); - if (IS_ERR(flow)) - return PTR_ERR(flow); - - nft_trans_flow_rule(trans) = flow; - } - return 0; -err2: + +err_destroy_flow_rule: + nft_flow_rule_destroy(flow); +err_release_rule: nf_tables_rule_release(&ctx, rule); -err1: +err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { module_put(expr_info[i].ops->type->owner); @@ -8839,11 +8845,16 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index a48c5fd53a80..ec701b84844f 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -594,23 +594,6 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) - continue; - - switch (trans->msg_type) { - case NFT_MSG_NEWRULE: - case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) - continue; - - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - break; - default: - break; - } - } - return err; } -- cgit v1.2.3-58-ga151 From ea45fdf82cc90430bb7c280e5e53821e833782c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 19 Jun 2021 01:25:14 +0200 Subject: netfilter: nf_tables_offload: check FLOW_DISSECTOR_KEY_BASIC in VLAN transfer logic The VLAN transfer logic should actually check for FLOW_DISSECTOR_KEY_BASIC, not FLOW_DISSECTOR_KEY_CONTROL. Moreover, do not fallback to case 2) .n_proto is set to 802.1q or 802.1ad, if FLOW_DISSECTOR_KEY_BASIC is unset. Fixes: 783003f3bb8a ("netfilter: nftables_offload: special ethertype handling for VLAN") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index ec701b84844f..b58d73a96523 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -54,15 +54,10 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; - struct nft_offload_ethertype ethertype; - - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL) && - match->key.basic.n_proto != htons(ETH_P_8021Q) && - match->key.basic.n_proto != htons(ETH_P_8021AD)) - return; - - ethertype.value = match->key.basic.n_proto; - ethertype.mask = match->mask.basic.n_proto; + struct nft_offload_ethertype ethertype = { + .value = match->key.basic.n_proto, + .mask = match->mask.basic.n_proto, + }; if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || @@ -76,7 +71,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else { + } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + (match->key.basic.n_proto == htons(ETH_P_8021Q) || + match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; -- cgit v1.2.3-58-ga151 From 1502328f17ab0684ca5ed6764433aa0a83bdaf95 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Jun 2021 15:02:20 -0700 Subject: mptcp: fix bad handling of 32 bit ack wrap-around When receiving 32 bits DSS ack from the peer, the MPTCP need to expand them to 64 bits value. The current code is buggy WRT detecting 32 bits ack wrap-around: when the wrap-around happens the current unsigned 32 bit ack value is lower than the previous one. Additionally check for possible reverse wrap and make the helper visible, so that we could re-use it for the next patch. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/204 Fixes: cc9d25669866 ("mptcp: update per unacked sequence on pkt reception") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 29 +++++++++++++++-------------- net/mptcp/protocol.h | 8 ++++++++ 2 files changed, 23 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9b263f27ce9b..b87e46f515fb 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -896,19 +896,20 @@ reset: return false; } -static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { - u32 old_ack32, cur_ack32; - - if (use_64bit) - return cur_ack; - - old_ack32 = (u32)old_ack; - cur_ack32 = (u32)cur_ack; - cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; - if (unlikely(before(cur_ack32, old_ack32))) - return cur_ack + (1LL << 32); - return cur_ack; + u32 old_seq32, cur_seq32; + + old_seq32 = (u32)old_seq; + cur_seq32 = (u32)cur_seq; + cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; + if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) + return cur_seq + (1LL << 32); + + /* reverse wrap could happen, too */ + if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) + return cur_seq - (1LL << 32); + return cur_seq; } static void ack_update_msk(struct mptcp_sock *msk, @@ -926,7 +927,7 @@ static void ack_update_msk(struct mptcp_sock *msk, * more dangerous than missing an ack */ old_snd_una = msk->snd_una; - new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ if (after64(new_snd_una, snd_nxt)) @@ -963,7 +964,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us return false; WRITE_ONCE(msk->rcv_data_fin_seq, - expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); + mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 385796f0ef19..5d7c44028e47 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -593,6 +593,14 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); +static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) +{ + if (use_64bit) + return cur_seq; + + return __mptcp_expand_seq(old_seq, cur_seq); +} void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -- cgit v1.2.3-58-ga151 From 5957a8901db44c03540505ccedd95031c21ef2f2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Jun 2021 15:02:21 -0700 Subject: mptcp: fix 32 bit DSN expansion The current implementation of 32 bit DSN expansion is buggy. After the previous patch, we can simply reuse the newly introduced helper to do the expansion safely. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/120 Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be1de4084196..037fba41e170 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -775,15 +775,6 @@ enum mapping_status { MAPPING_DUMMY }; -static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) -{ - if ((u32)seq == (u32)old_seq) - return old_seq; - - /* Assume map covers data not mapped yet. */ - return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); -} - static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", @@ -907,13 +898,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len--; } - if (!mpext->dsn64) { - map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, - mpext->data_seq); - pr_debug("expanded seq=%llu", subflow->map_seq); - } else { - map_seq = mpext->data_seq; - } + map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { -- cgit v1.2.3-58-ga151 From fe0bdbde0756e29784ec9770d3a418c9d1640eee Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 21 Jun 2021 13:12:25 +0800 Subject: net: add pf_family_names[] for protocol family Modify the pr_info content from int to char * in sock_register() and sock_unregister(), this looks more readable. Fixed build error in ARCH=sparc64. Signed-off-by: Yejune Deng Reported-by: kernel test robot Signed-off-by: David S. Miller --- net/socket.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 4f2c6d2795d0..bd9233da2497 100644 --- a/net/socket.c +++ b/net/socket.c @@ -165,6 +165,54 @@ static const struct file_operations socket_file_ops = { .show_fdinfo = sock_show_fdinfo, }; +static const char * const pf_family_names[] = { + [PF_UNSPEC] = "PF_UNSPEC", + [PF_UNIX] = "PF_UNIX/PF_LOCAL", + [PF_INET] = "PF_INET", + [PF_AX25] = "PF_AX25", + [PF_IPX] = "PF_IPX", + [PF_APPLETALK] = "PF_APPLETALK", + [PF_NETROM] = "PF_NETROM", + [PF_BRIDGE] = "PF_BRIDGE", + [PF_ATMPVC] = "PF_ATMPVC", + [PF_X25] = "PF_X25", + [PF_INET6] = "PF_INET6", + [PF_ROSE] = "PF_ROSE", + [PF_DECnet] = "PF_DECnet", + [PF_NETBEUI] = "PF_NETBEUI", + [PF_SECURITY] = "PF_SECURITY", + [PF_KEY] = "PF_KEY", + [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", + [PF_PACKET] = "PF_PACKET", + [PF_ASH] = "PF_ASH", + [PF_ECONET] = "PF_ECONET", + [PF_ATMSVC] = "PF_ATMSVC", + [PF_RDS] = "PF_RDS", + [PF_SNA] = "PF_SNA", + [PF_IRDA] = "PF_IRDA", + [PF_PPPOX] = "PF_PPPOX", + [PF_WANPIPE] = "PF_WANPIPE", + [PF_LLC] = "PF_LLC", + [PF_IB] = "PF_IB", + [PF_MPLS] = "PF_MPLS", + [PF_CAN] = "PF_CAN", + [PF_TIPC] = "PF_TIPC", + [PF_BLUETOOTH] = "PF_BLUETOOTH", + [PF_IUCV] = "PF_IUCV", + [PF_RXRPC] = "PF_RXRPC", + [PF_ISDN] = "PF_ISDN", + [PF_PHONET] = "PF_PHONET", + [PF_IEEE802154] = "PF_IEEE802154", + [PF_CAIF] = "PF_CAIF", + [PF_ALG] = "PF_ALG", + [PF_NFC] = "PF_NFC", + [PF_VSOCK] = "PF_VSOCK", + [PF_KCM] = "PF_KCM", + [PF_QIPCRTR] = "PF_QIPCRTR", + [PF_SMC] = "PF_SMC", + [PF_XDP] = "PF_XDP", +}; + /* * The protocol list. Each protocol is registered in here. */ @@ -2975,7 +3023,7 @@ int sock_register(const struct net_proto_family *ops) } spin_unlock(&net_family_lock); - pr_info("NET: Registered protocol family %d\n", ops->family); + pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); @@ -3003,7 +3051,7 @@ void sock_unregister(int family) synchronize_rcu(); - pr_info("NET: Unregistered protocol family %d\n", family); + pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); -- cgit v1.2.3-58-ga151 From c7ff9cff70601ea19245d997bb977344663434c7 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Mon, 21 Jun 2021 14:26:01 +0800 Subject: vsock: notify server to shutdown when client has pending signal The client's sk_state will be set to TCP_ESTABLISHED if the server replay the client's connect request. However, if the client has pending signal, its sk_state will be set to TCP_CLOSE without notify the server, so the server will hold the corrupt connection. client server 1. sk_state=TCP_SYN_SENT | 2. call ->connect() | 3. wait reply | | 4. sk_state=TCP_ESTABLISHED | 5. insert to connected list | 6. reply to the client 7. sk_state=TCP_ESTABLISHED | 8. insert to connected list | 9. *signal pending* <--------------------- the user kill client 10. sk_state=TCP_CLOSE | client is exiting... | 11. call ->release() | virtio_transport_close if (!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_CLOSING)) return true; *return at here, the server cannot notice the connection is corrupt* So the client should notify the peer in this case. Cc: David S. Miller Cc: Jakub Kicinski Cc: Jorgen Hansen Cc: Norbert Slusarek Cc: Andra Paraschiv Cc: Colin Ian King Cc: David Brazdil Cc: Alexander Popov Suggested-by: Stefano Garzarella Link: https://lkml.org/lkml/2021/5/17/418 Signed-off-by: lixianming Signed-off-by: Longpeng(Mike) Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 92a72f0e0d94..ae11311807fd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1369,7 +1369,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = TCP_CLOSE; + sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; -- cgit v1.2.3-58-ga151 From 6d5516177d3b723fe9701b89e69db18cf0ca0421 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Mon, 21 Jun 2021 12:24:29 +0300 Subject: Revert "net/sched: cls_flower: Remove match on n_proto" This reverts commit 0dca2c7404a938cb10c85d0515cee40ed5348788. The commit in question breaks hardware offload of flower filters. Quoting Vladimir Oltean : fl_hw_replace_filter() and fl_reoffload() create a struct flow_cls_offload with a rule->match.mask member derived from the mask of the software classifier: &f->mask->key - that same mask that is used for initializing the flow dissector keys, and the one from which Boris removed the basic.n_proto member because it was bothering him. Reported-by: Vadym Kochan Signed-off-by: Boris Sukholitko Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2e704c7a105a..d7869a984881 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1531,13 +1531,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto)); - mask->basic.n_proto = cpu_to_be16(0); } else { key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); } } } else { key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); } } -- cgit v1.2.3-58-ga151 From 0cd58e5c53babb9237b741dbef711f0a9eb6d3fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Jun 2021 10:54:49 -0700 Subject: pkt_sched: sch_qfq: fix qfq_change_class() error path If qfq_change_class() is unable to allocate memory for qfq_aggregate, it frees the class that has been inserted in the class hash table, but does not unhash it. Defer the insertion after the problematic allocation. BUG: KASAN: use-after-free in hlist_add_head include/linux/list.h:884 [inline] BUG: KASAN: use-after-free in qdisc_class_hash_insert+0x200/0x210 net/sched/sch_api.c:731 Write of size 8 at addr ffff88814a534f10 by task syz-executor.4/31478 CPU: 0 PID: 31478 Comm: syz-executor.4 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:233 __kasan_report mm/kasan/report.c:419 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:436 hlist_add_head include/linux/list.h:884 [inline] qdisc_class_hash_insert+0x200/0x210 net/sched/sch_api.c:731 qfq_change_class+0x96c/0x1990 net/sched/sch_qfq.c:489 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665d9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdc7b5f0188 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000000056bf80 RCX: 00000000004665d9 RDX: 0000000000000000 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 00007fdc7b5f01d0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 00007ffcf7310b3f R14: 00007fdc7b5f0300 R15: 0000000000022000 Allocated by task 31445: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:428 [inline] ____kasan_kmalloc mm/kasan/common.c:507 [inline] ____kasan_kmalloc mm/kasan/common.c:466 [inline] __kasan_kmalloc+0x9b/0xd0 mm/kasan/common.c:516 kmalloc include/linux/slab.h:556 [inline] kzalloc include/linux/slab.h:686 [inline] qfq_change_class+0x705/0x1990 net/sched/sch_qfq.c:464 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 31445: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track+0x1c/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:357 ____kasan_slab_free mm/kasan/common.c:360 [inline] ____kasan_slab_free mm/kasan/common.c:325 [inline] __kasan_slab_free+0xfb/0x130 mm/kasan/common.c:368 kasan_slab_free include/linux/kasan.h:212 [inline] slab_free_hook mm/slub.c:1583 [inline] slab_free_freelist_hook+0xdf/0x240 mm/slub.c:1608 slab_free mm/slub.c:3168 [inline] kfree+0xe5/0x7f0 mm/slub.c:4212 qfq_change_class+0x10fb/0x1990 net/sched/sch_qfq.c:518 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88814a534f00 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 16 bytes inside of 128-byte region [ffff88814a534f00, ffff88814a534f80) The buggy address belongs to the page: page:ffffea0005294d00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a534 flags: 0x57ff00000000200(slab|node=1|zone=2|lastcpupid=0x7ff) raw: 057ff00000000200 ffffea00004fee00 0000000600000006 ffff8880110418c0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 29797, ts 604817765317, free_ts 604810151744 prep_new_page mm/page_alloc.c:2358 [inline] get_page_from_freelist+0x1033/0x2b60 mm/page_alloc.c:3994 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5200 alloc_pages+0x18c/0x2a0 mm/mempolicy.c:2272 alloc_slab_page mm/slub.c:1646 [inline] allocate_slab+0x2c5/0x4c0 mm/slub.c:1786 new_slab mm/slub.c:1849 [inline] new_slab_objects mm/slub.c:2595 [inline] ___slab_alloc+0x4a1/0x810 mm/slub.c:2758 __slab_alloc.constprop.0+0xa7/0xf0 mm/slub.c:2798 slab_alloc_node mm/slub.c:2880 [inline] slab_alloc mm/slub.c:2922 [inline] __kmalloc+0x315/0x330 mm/slub.c:4050 kmalloc include/linux/slab.h:561 [inline] kzalloc include/linux/slab.h:686 [inline] __register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1318 mpls_dev_sysctl_register+0x1b7/0x2d0 net/mpls/af_mpls.c:1421 mpls_add_dev net/mpls/af_mpls.c:1472 [inline] mpls_dev_notify+0x214/0x8b0 net/mpls/af_mpls.c:1588 notifier_call_chain+0xb5/0x200 kernel/notifier.c:83 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:2121 call_netdevice_notifiers_extack net/core/dev.c:2133 [inline] call_netdevice_notifiers net/core/dev.c:2147 [inline] register_netdevice+0x106b/0x1500 net/core/dev.c:10312 veth_newlink+0x585/0xac0 drivers/net/veth.c:1547 __rtnl_newlink+0x1062/0x1710 net/core/rtnetlink.c:3452 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3500 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1298 [inline] free_pcp_prepare+0x223/0x300 mm/page_alloc.c:1342 free_unref_page_prepare mm/page_alloc.c:3250 [inline] free_unref_page+0x12/0x1d0 mm/page_alloc.c:3298 __vunmap+0x783/0xb60 mm/vmalloc.c:2566 free_work+0x58/0x70 mm/vmalloc.c:80 process_one_work+0x98d/0x1600 kernel/workqueue.c:2276 worker_thread+0x64c/0x1120 kernel/workqueue.c:2422 kthread+0x3b1/0x4a0 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 Memory state around the buggy address: ffff88814a534e00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88814a534e80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88814a534f00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88814a534f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88814a535000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fixes: 462dbc9101acd ("pkt_sched: QFQ Plus: fair-queueing service at DRR cost") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 1db9d4a2ef5e..b692a0de1ad5 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -485,11 +485,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl->qdisc != &noop_qdisc) qdisc_hash_add(cl->qdisc, true); - sch_tree_lock(sch); - qdisc_class_hash_insert(&q->clhash, &cl->common); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); set_change_agg: sch_tree_lock(sch); @@ -507,8 +502,11 @@ set_change_agg: } if (existing) qfq_deact_rm_from_agg(q, cl); + else + qdisc_class_hash_insert(&q->clhash, &cl->common); qfq_add_to_agg(q, new_agg, cl); sch_tree_unlock(sch); + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; -- cgit v1.2.3-58-ga151 From dd72fadf2186fc8a6018f97fe72f4d5ca05df440 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Tue, 22 Jun 2021 09:25:31 +0530 Subject: xfrm: Fix xfrm offload fallback fail case In case of xfrm offload, if xdo_dev_state_add() of driver returns -EOPNOTSUPP, xfrm offload fallback is failed. In xfrm state_add() both xso->dev and xso->real_dev are initialized to dev and when err(-EOPNOTSUPP) is returned only xso->dev is set to null. So in this scenario the condition in func validate_xmit_xfrm(), if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; returns true, due to which skb is returned without calling esp_xmit() below which has fallback code. Hence the CRYPTO_FALLBACK is failing. So fixing this with by keeping x->xso.real_dev as NULL when err is returned in func xfrm_dev_state_add(). Fixes: bdfd2d1fa79a ("bonding/xfrm: use real_dev instead of slave_dev") Signed-off-by: Ayush Sawal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6d6917b68856..e843b0d9e2a6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -268,6 +268,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->num_exthdrs = 0; xso->flags = 0; xso->dev = NULL; + xso->real_dev = NULL; dev_put(dev); if (err != -EOPNOTSUPP) -- cgit v1.2.3-58-ga151 From 534799097a777e82910f77a4f9d289c815a9a64e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Jun 2021 11:45:11 +0200 Subject: netfilter: nf_tables: skip netlink portID validation if zero nft_table_lookup() allows us to obtain the table object by the name and the family. The netlink portID validation needs to be skipped for the dump path, since the ownership only applies to commands to update the given table. Skip validation if the specified netlink PortID is zero when calling nft_table_lookup(). Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ca9ec8721e6c..1d62b1a83299 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -571,7 +571,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && - table->nlpid != nlpid) + nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; -- cgit v1.2.3-58-ga151 From e31f072ffab0397a328b31a9589dcf9733dc9c72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Jun 2021 12:10:49 +0200 Subject: netfilter: nf_tables: do not allow to delete table with owner by handle nft_table_lookup_byhandle() also needs to validate the netlink PortID owner when deleting a table by handle. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1d62b1a83299..fcb15b8904e8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -583,7 +583,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; @@ -591,8 +591,13 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + nlpid && table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -1279,7 +1284,8 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; - table = nft_table_lookup_byhandle(net, attr, genmask); + table = nft_table_lookup_byhandle(net, attr, genmask, + NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, -- cgit v1.2.3-58-ga151 From 64295f0d01ae0661a2cea42c598070b1c87ca6e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Jun 2021 07:53:48 -0700 Subject: virtio/vsock: avoid NULL deref in virtio_transport_seqpacket_allow() Make sure the_virtio_vsock is not NULL before dereferencing it. general protection fault, probably for non-canonical address 0xdffffc0000000071: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000388-0x000000000000038f] CPU: 0 PID: 8452 Comm: syz-executor406 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:virtio_transport_seqpacket_allow+0xbf/0x210 net/vmw_vsock/virtio_transport.c:503 Code: e8 c6 d9 ab f8 84 db 0f 84 0f 01 00 00 e8 09 d3 ab f8 48 8d bd 88 03 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 06 0f 8e 2a 01 00 00 44 0f b6 a5 88 03 00 00 RSP: 0018:ffffc90003757c18 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000071 RSI: ffffffff88c908e7 RDI: 0000000000000388 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff88c90a06 R11: 0000000000000000 R12: 0000000000000000 R13: ffffffff88c90840 R14: 0000000000000000 R15: 0000000000000001 FS: 0000000001bee300(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000082 CR3: 000000002847e000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: vsock_assign_transport+0x575/0x700 net/vmw_vsock/af_vsock.c:490 vsock_connect+0x200/0xc00 net/vmw_vsock/af_vsock.c:1337 __sys_connect_file+0x155/0x1a0 net/socket.c:1824 __sys_connect+0x161/0x190 net/socket.c:1841 __do_sys_connect net/socket.c:1851 [inline] __se_sys_connect net/socket.c:1848 [inline] __x64_sys_connect+0x6f/0xb0 net/socket.c:1848 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x43ee69 Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd49e7c788 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000400488 RCX: 000000000043ee69 RDX: 0000000000000010 RSI: 0000000020000080 RDI: 0000000000000003 RBP: 0000000000402e50 R08: 0000000000000000 R09: 0000000000400488 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000402ee0 R13: 0000000000000000 R14: 00000000004ac018 R15: 0000000000400488 Fixes: 53efbba12cc7 ("virtio/vsock: enable SEQPACKET for transport") Signed-off-by: Eric Dumazet Cc: Arseny Krasnov Reported-by: syzbot Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index e73ce652bf3c..ed1664e7bd88 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -498,9 +498,11 @@ static bool virtio_transport_seqpacket_allow(u32 remote_cid) struct virtio_vsock *vsock; bool seqpacket_allow; + seqpacket_allow = false; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); - seqpacket_allow = vsock->seqpacket_allow; + if (vsock) + seqpacket_allow = vsock->seqpacket_allow; rcu_read_unlock(); return seqpacket_allow; -- cgit v1.2.3-58-ga151 From 8ce568ed06ce4ca38c0b67d8de9b8d75b731f90a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 15:54:33 -0700 Subject: mptcp: drop tx skb cache The mentioned cache was introduced to reduce the number of skb allocation in atomic context, but the required complexity is excessive. This change remove the mentioned cache. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 89 +++------------------------------------------------- net/mptcp/protocol.h | 2 -- 2 files changed, 4 insertions(+), 87 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b5f2f504b85b..77c90d6f04df 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -902,22 +902,14 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(struct sock *sk, int size) +static int mptcp_wmem_with_overhead(int size) { - struct mptcp_sock *msk = mptcp_sk(sk); - int ret, skbs; - - ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); - skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; - if (skbs < msk->skb_tx_cache.qlen) - return ret; - - return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); + return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); } static void __mptcp_wmem_reserve(struct sock *sk, int size) { - int amount = mptcp_wmem_with_overhead(sk, size); + int amount = mptcp_wmem_with_overhead(size); struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); @@ -1212,49 +1204,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool mptcp_tx_cache_refill(struct sock *sk, int size, - struct sk_buff_head *skbs, int *total_ts) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sk_buff *skb; - int space_needed; - - if (unlikely(tcp_under_memory_pressure(sk))) { - mptcp_mem_reclaim_partial(sk); - - /* under pressure pre-allocate at most a single skb */ - if (msk->skb_tx_cache.qlen) - return true; - space_needed = msk->size_goal_cache; - } else { - space_needed = msk->tx_pending_data + size - - msk->skb_tx_cache.qlen * msk->size_goal_cache; - } - - while (space_needed > 0) { - skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); - if (unlikely(!skb)) { - /* under memory pressure, try to pass the caller a - * single skb to allow forward progress - */ - while (skbs->qlen > 1) { - skb = __skb_dequeue_tail(skbs); - *total_ts -= skb->truesize; - __kfree_skb(skb); - } - return skbs->qlen > 0; - } - - *total_ts += skb->truesize; - __skb_queue_tail(skbs, skb); - space_needed -= msk->size_goal_cache; - } - return true; -} - static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { - struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb; if (ssk->sk_tx_skb_cache) { @@ -1265,22 +1216,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) return true; } - skb = skb_peek(&msk->skb_tx_cache); - if (skb) { - if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - skb = __skb_dequeue(&msk->skb_tx_cache); - if (WARN_ON_ONCE(!skb)) - return false; - - mptcp_wmem_uncharge(sk, skb->truesize); - ssk->sk_tx_skb_cache = skb; - return true; - } - - /* over memory limit, no point to try to allocate a new skb */ - return false; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return false; @@ -1296,7 +1231,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) { return !ssk->sk_tx_skb_cache && - !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && tcp_under_memory_pressure(sk); } @@ -1339,7 +1273,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); avail_size = info->size_goal; - msk->size_goal_cache = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { /* Limit the write to the size available in the @@ -1688,7 +1621,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; - struct sk_buff_head skbs; bool dfrag_collapsed; size_t psize, offset; @@ -1721,16 +1653,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); total_ts = psize + frag_truesize; - __skb_queue_head_init(&skbs); - if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) - goto wait_for_memory; - if (!mptcp_wmem_alloc(sk, total_ts)) { - __skb_queue_purge(&skbs); + if (!mptcp_wmem_alloc(sk, total_ts)) goto wait_for_memory; - } - skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { mptcp_wmem_uncharge(sk, psize + frag_truesize); @@ -2462,13 +2388,11 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); - __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->wmem_reserved = 0; msk->rmem_released = 0; msk->tx_pending_data = 0; - msk->size_goal_cache = TCP_BASE_MSS; msk->ack_hint = NULL; msk->first = NULL; @@ -2525,15 +2449,10 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - struct sk_buff *skb; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); - while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { - sk->sk_forward_alloc += skb->truesize; - kfree_skb(skb); - } } static void mptcp_cancel_work(struct sock *sk) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 160d716ebc2b..160c2ab09f19 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -245,9 +245,7 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ int tx_pending_data; - int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; -- cgit v1.2.3-58-ga151 From 75e908c33615999abe1f3a8429d25dea30d28e4e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 15:54:34 -0700 Subject: mptcp: use fast lock for subflows when possible There are a bunch of callsite where the ssk socket lock is acquired using the full-blown version eligible for the fast variant. Let's move to the latter. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++---- net/mptcp/protocol.c | 15 +++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 09722598994d..d4732a4f223e 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -540,6 +540,7 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for %s%s%s", @@ -547,9 +548,9 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); - lock_sock(ssk); + slow = lock_sock_fast(ssk); tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); spin_lock_bh(&msk->pm.lock); } } @@ -566,6 +567,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *sk = (struct sock *)msk; struct mptcp_addr_info local; + bool slow; local_address((struct sock_common *)ssk, &local); if (!addresses_equal(&local, addr, addr->port)) @@ -578,9 +580,9 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - lock_sock(ssk); + slow = lock_sock_fast(ssk); tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); spin_lock_bh(&msk->pm.lock); return 0; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 77c90d6f04df..c47ce074737d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -433,23 +433,25 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; - lock_sock(ssk); + slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); } } static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) { + bool slow; int ret; - lock_sock(ssk); + slow = lock_sock_fast(ssk); ret = tcp_can_send_ack(ssk); if (ret) tcp_cleanup_rbuf(ssk, 1); - release_sock(ssk); + unlock_sock_fast(ssk, slow); return ret; } @@ -2252,13 +2254,14 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + bool slow; - lock_sock(tcp_sk); + slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { tcp_send_active_reset(tcp_sk, GFP_ATOMIC); tcp_set_state(tcp_sk, TCP_CLOSE); } - release_sock(tcp_sk); + unlock_sock_fast(tcp_sk, slow); } inet_sk_state_store(sk, TCP_CLOSE); -- cgit v1.2.3-58-ga151 From 3c90e377a1e87a35a7f868ed1c53ea4d62379a8d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 15:54:35 -0700 Subject: mptcp: don't clear MPTCP_DATA_READY in sk_wait_event() If we don't flush entirely the receive queue, we need set again such bit later. We can simply avoid clearing it. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c47ce074737d..3e088e9d20fd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1715,7 +1715,7 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, timeo, - test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + test_bit(MPTCP_DATA_READY, &msk->flags), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -2039,10 +2039,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, */ if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); - } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { - /* data to read but mptcp_wait_data() cleared DATA_READY */ - set_bit(MPTCP_DATA_READY, &msk->flags); } + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) -- cgit v1.2.3-58-ga151 From 8cfc47fc2eb0fd2d6eaa9e4b23b4bf6ef1bfaeef Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 15:54:36 -0700 Subject: mptcp: drop redundant test in move_skbs_to_msk() Currently we check the msk state to avoid enqueuing new skbs at msk shutdown time. Such test is racy - as we can't acquire the msk socket lock - and useless, as the caller already checked the subflow field 'disposable', covering the same scenario in a race free manner - read and updated under the ssk socket lock. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3e088e9d20fd..cf75be02eb00 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -686,9 +686,6 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (inet_sk_state_load(sk) == TCP_CLOSE) - return false; - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { -- cgit v1.2.3-58-ga151 From 06285da96a1cdbad265a212f6729e19a515127a2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 15:54:37 -0700 Subject: mptcp: add MIB counter for invalid mapping Account this exceptional events for better introspection. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/subflow.c | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index e7e60bc1fb96..52ea2517e856 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 92e56c0cfbdd..193466c9b549 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8976ff586b87..585951e7e52f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1046,8 +1046,10 @@ validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ - if (!validate_mapping(ssk, skb)) + if (!validate_mapping(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; + } skb_ext_del(skb, SKB_EXT_MPTCP); -- cgit v1.2.3-58-ga151 From 7dd5d437c258bbf4cc15b35229e5208b87b8b4e0 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 13 Jun 2021 21:34:39 +0700 Subject: bpf: Fix integer overflow in argument calculation for bpf_map_area_alloc In 32-bit architecture, the result of sizeof() is a 32-bit integer so the expression becomes the multiplication between 2 32-bit integer which can potentially leads to integer overflow. As a result, bpf_map_area_alloc() allocates less memory than needed. Fix this by casting 1 operand to u64. Fixes: 0d2c4f964050 ("bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps") Fixes: 99c51064fb06 ("devmap: Use bpf_map_area_alloc() for allocating hash buckets") Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Bui Quang Minh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210613143440.71975-1-minhquangbui99@gmail.com --- kernel/bpf/devmap.c | 4 ++-- net/core/sock_map.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index aa516472ce46..3b45c23286c0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -92,7 +92,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries, int i; struct hlist_head *hash; - hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); + hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -143,7 +143,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) spin_lock_init(&dtab->index_lock); } else { - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 6f1b82b8ad49..60decd6420ca 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - stab->sks = bpf_map_area_alloc(stab->map.max_entries * + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { -- cgit v1.2.3-58-ga151 From 490274b47468793e3e157c2df6b2da0e646cc4a9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 17:33:08 -0700 Subject: mptcp: avoid race on msk state changes The msk socket state is currently updated in a few spots without owning the msk socket lock itself. Some of such operations are safe, as they happens before exposing the msk socket to user-space and can't race with other changes. A couple of them, at connect time, can actually race with close() or shutdown(), leaving breaking the socket state machine. This change addresses the issue moving such update under the msk socket lock with the usual: scheme. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/56 Fixes: 8fd738049ac3 ("mptcp: fallback in case of simultaneous connect") Fixes: c3c123d16c0e ("net: mptcp: don't hang in mptcp_sendmsg() after TCP fallback") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 5 +++++ net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 30 ++++++++++++++++++++++-------- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 632350018fb6..8ead550df8b1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2946,6 +2946,11 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } + /* be sure to set the current sk state before tacking actions + * depending on sk_state + */ + if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) + __mptcp_set_connected(sk); if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) __mptcp_clean_una_wakeup(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5d7c44028e47..7b634568f49c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -109,6 +109,7 @@ #define MPTCP_ERROR_REPORT 8 #define MPTCP_RETRANSMIT 9 #define MPTCP_WORK_SYNC_SETSOCKOPT 10 +#define MPTCP_CONNECTED 11 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -579,6 +580,7 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void __mptcp_set_connected(struct sock *sk); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 037fba41e170..9f934603bfe8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -371,6 +371,24 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } +void __mptcp_set_connected(struct sock *sk) +{ + if (sk->sk_state == TCP_SYN_SENT) { + inet_sk_state_store(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + } +} + +static void mptcp_set_connected(struct sock *sk) +{ + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_set_connected(sk); + else + set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -379,10 +397,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) @@ -411,6 +425,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -451,6 +466,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_set_connected(parent); } return; @@ -558,6 +574,7 @@ static void mptcp_sock_destruct(struct sock *sk) static void mptcp_force_close(struct sock *sk) { + /* the msk is not yet exposed to user-space */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); } @@ -1474,10 +1491,7 @@ static void subflow_state_change(struct sock *sk) mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } + mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection -- cgit v1.2.3-58-ga151 From 597dbae77ee5a2347b1b800c25c89a9181dd8a57 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 17:33:09 -0700 Subject: mptcp: drop duplicate mptcp_setsockopt() declaration commit 7896248983ef ("mptcp: add skeleton to sync msk socket options to subflows") introduced a duplicate declaration of mptcp_setsockopt(), just drop it. Reported-by: Florian Westphal Fixes: 7896248983ef ("mptcp: add skeleton to sync msk socket options to subflows") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7b634568f49c..78ac28902f55 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -762,9 +762,6 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); - void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_all(struct mptcp_sock *msk); -- cgit v1.2.3-58-ga151 From a6e3f2985a80ef6a45a17d2d9d9151f17ea3ce07 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 21 Jun 2021 18:52:54 -0700 Subject: ip6_tunnel: fix GRE6 segmentation Commit 6c11fbf97e69 ("ip6_tunnel: add MPLS transmit support") moved assiging inner_ipproto down from ipxip6_tnl_xmit() to its callee ip6_tnl_xmit(). The latter is also used by GRE. Since commit 38720352412a ("gre: Use inner_proto to obtain inner header protocol") GRE had been depending on skb->inner_protocol during segmentation. It sets it in gre_build_header() and reads it in gre_gso_segment(). Changes to ip6_tnl_xmit() overwrite the protocol, resulting in GSO skbs getting dropped. Note that inner_protocol is a union with inner_ipproto, GRE uses the former while the change switched it to the latter (always setting it to just IPPROTO_GRE). Restore the original location of skb_set_inner_ipproto(), it is unclear why it was moved in the first place. Fixes: 6c11fbf97e69 ("ip6_tunnel: add MPLS transmit support") Signed-off-by: Jakub Kicinski Tested-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 288bafded998..28ca70af014a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1239,8 +1239,6 @@ route_lookup: if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; - skb_set_inner_ipproto(skb, proto); - err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; @@ -1377,6 +1375,8 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + skb_set_inner_ipproto(skb, protocol); + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { -- cgit v1.2.3-58-ga151 From c69f114d09891adfa3e301a35d9e872b8b7b5a50 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Tue, 22 Jun 2021 12:24:50 +0800 Subject: net/ipv4: swap flow ports when validating source When doing source address validation, the flowi4 struct used for fib_lookup should be in the reverse direction to the given skb. fl4_dport and fl4_sport returned by fib4_rules_early_flow_dissect should thus be swapped. Fixes: 5a847a6e1477 ("net/ipv4: Initialize proto and ports in flow struct") Signed-off-by: Miao Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..647bceab56c2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -371,6 +371,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; + } else { + swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) -- cgit v1.2.3-58-ga151 From 98534fce52efc76d961f5fe4188a97a5db93c7dd Mon Sep 17 00:00:00 2001 From: gushengxian Date: Mon, 21 Jun 2021 23:05:19 -0700 Subject: bridge: cfm: remove redundant return Return statements are not needed in Void function. Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/bridge/br_cfm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index 001064f7583d..a3c755d0a09d 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -142,7 +142,7 @@ static void br_cfm_notify(int event, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_CFM_STATUS; - return br_info_notify(event, port->br, NULL, filter); + br_info_notify(event, port->br, NULL, filter); } static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) -- cgit v1.2.3-58-ga151 From f5fe211d13af52077bb66e89a5410fa75f691fe8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 22 Jun 2021 09:50:48 +0300 Subject: ethtool: Decrease size of module EEPROM get policy array The 'ETHTOOL_A_MODULE_EEPROM_DATA' attribute is not part of the get request. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ethtool/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 90b10966b16b..3e25a47fd482 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -380,7 +380,7 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; -extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; +extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3-58-ga151 From 0dc7dd02ba7ab5f623f5e3a36443ec441364285a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 22 Jun 2021 09:50:51 +0300 Subject: ethtool: Validate module EEPROM length as part of policy Validate the number of bytes to read from the module EEPROM as part of the netlink policy and remove the corresponding check from the code. This also makes it possible to query the length range from user space: $ genl ctrl policy name ethtool ... ID: 0x14 policy[32]:attr[3]: type=U32 range:[1,128] ... Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ethtool/eeprom.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 5d38e90895ac..1e75d9c1b154 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -159,9 +159,6 @@ static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr * request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); - if (!request->length) - return -EINVAL; - /* The following set of conditions limit the API to only dump 1/2 * EEPROM page without crossing low page boundary located at offset 128. * This means user may only request dumps of length limited to 128 from @@ -237,7 +234,8 @@ const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, - [ETHTOOL_A_MODULE_EEPROM_LENGTH] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_LENGTH] = + NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = -- cgit v1.2.3-58-ga151 From 88f9a87afeeec5dfdda3651f3db96d0006172d91 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 22 Jun 2021 09:50:52 +0300 Subject: ethtool: Validate module EEPROM offset as part of policy Validate the offset to read from module EEPROM as part of the netlink policy and remove the corresponding check from the code. This also makes it possible to query the offset range from user space: $ genl ctrl policy name ethtool ... ID: 0x14 policy[32]:attr[2]: type=U32 range:[0,255] ... Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ethtool/eeprom.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 1e75d9c1b154..7e6b37a54add 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -177,10 +177,6 @@ static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr * NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross half page boundary is illegal"); return -EINVAL; - } else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) { - NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET], - "offset is out of bounds"); - return -EINVAL; } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross page boundary is illegal"); @@ -233,7 +229,8 @@ const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), - [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_OFFSET] = + NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), [ETHTOOL_A_MODULE_EEPROM_LENGTH] = NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, -- cgit v1.2.3-58-ga151 From c4ab7b56be0f6f18f025ddc8d469cce54f82415a Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Tue, 22 Jun 2021 10:02:33 -0400 Subject: openvswitch: add trace points This makes openvswitch module use the event tracing framework to log the upcall interface and action execution pipeline. When using openvswitch as the packet forwarding engine, some types of debugging are made possible simply by using the ovs-vswitchd's ofproto/trace command. However, such a command has some limitations: 1. When trying to trace packets that go through the CT action, the state of the packet can't be determined, and probably would be potentially wrong. 2. Deducing problem packets can sometimes be difficult as well even if many of the flows are known 3. It's possible to use the openvswitch module even without the ovs-vswitchd (although, not common use). Introduce the event tracing points here to make it possible for working through these problems in kernel space. The style is copied from the mac80211 driver-trace / trace code for consistency - this creates some checkpatch splats, but the official 'guide' for adding tracepoints, as well as the existing examples all add the same splats so it seems acceptable. Signed-off-by: Aaron Conole Signed-off-by: David S. Miller --- net/openvswitch/Makefile | 3 + net/openvswitch/actions.c | 4 + net/openvswitch/datapath.c | 4 + net/openvswitch/openvswitch_trace.c | 10 +++ net/openvswitch/openvswitch_trace.h | 158 ++++++++++++++++++++++++++++++++++++ 5 files changed, 179 insertions(+) create mode 100644 net/openvswitch/openvswitch_trace.c create mode 100644 net/openvswitch/openvswitch_trace.h (limited to 'net') diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 41109c326f3a..28982630bef3 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,6 +13,7 @@ openvswitch-y := \ flow_netlink.o \ flow_table.o \ meter.o \ + openvswitch_trace.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o @@ -24,3 +25,5 @@ endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o + +CFLAGS_openvswitch_trace.o = -I$(src) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 77d924ab8cdb..ef15d9eb4774 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -30,6 +30,7 @@ #include "conntrack.h" #include "vport.h" #include "flow_netlink.h" +#include "openvswitch_trace.h" struct deferred_action { struct sk_buff *skb; @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; + if (trace_ovs_do_execute_action_enabled()) + trace_ovs_do_execute_action(dp, skb, key, a, rem); + switch (nla_type(a)) { case OVS_ACTION_ATTR_OUTPUT: { int port = nla_get_u32(a); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9d6ef6cb9b26..bc164b35e67d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -43,6 +43,7 @@ #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" +#include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -275,6 +276,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, struct dp_stats_percpu *stats; int err; + if (trace_ovs_dp_upcall_enabled()) + trace_ovs_dp_upcall(dp, skb, key, upcall_info); + if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c new file mode 100644 index 000000000000..62c5f7d6f023 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* bug in tracepoint.h, it should include this */ +#include + +/* sparse isn't too happy with all macros... */ +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "openvswitch_trace.h" + +#endif diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h new file mode 100644 index 000000000000..3eb35d9eb700 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM openvswitch + +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OPENVSWITCH_H + +#include + +#include "datapath.h" + +TRACE_EVENT(ovs_do_execute_action, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *a, int rem), + + TP_ARGS(dp, skb, key, a, rem), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, action_type ) + __field( unsigned int, action_len ) + __field( void *, action_data ) + __field( u8, is_last ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->action_type = nla_type(a); + __entry->action_len = nla_len(a); + __entry->action_data = nla_data(a); + __entry->is_last = nla_is_last(a, rem); + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_Zone=%04x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->action_type, __entry->action_len, + __entry->action_data, __entry->is_last) +); + +TRACE_EVENT(ovs_dp_upcall, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info), + + TP_ARGS(dp, skb, key, upcall_info), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( const void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, upcall_cmd ) + __field( u32, upcall_port ) + __field( u16, upcall_mru ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->upcall_cmd = upcall_info->cmd; + __entry->upcall_port = upcall_info->portid; + __entry->upcall_mru = upcall_info->mru; + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_zone=%04x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->upcall_cmd, __entry->upcall_port, + __entry->upcall_mru) +); + +#endif /* _TRACE_OPENVSWITCH_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE openvswitch_trace +#include -- cgit v1.2.3-58-ga151 From 745a32117b5a0799ce1dd28d5a74dc2b7bf37692 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:47 -0400 Subject: sctp: add pad chunk and its make function and event table This chunk is defined in rfc4820#section-3, and used to pad an SCTP packet. The receiver must discard this chunk and continue processing the rest of the chunks in the packet. Add it now, as it will be bundled with a heartbeat chunk to probe pmtu in the following patches. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/sctp.h | 7 +++++++ include/net/sctp/sm.h | 1 + net/sctp/sm_make_chunk.c | 26 ++++++++++++++++++++++++++ net/sctp/sm_statetable.c | 23 +++++++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'net') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index bb1926589693..a86e852507b3 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -98,6 +98,7 @@ enum sctp_cid { SCTP_CID_I_FWD_TSN = 0xC2, SCTP_CID_ASCONF_ACK = 0x80, SCTP_CID_RECONF = 0x82, + SCTP_CID_PAD = 0x84, }; /* enum */ @@ -410,6 +411,12 @@ struct sctp_heartbeat_chunk { }; +/* PAD chunk could be bundled with heartbeat chunk to probe pmtu */ +struct sctp_pad_chunk { + struct sctp_chunkhdr uh; +}; + + /* For the abort and shutdown ACK we must carry the init tag in the * common header. Just the common header is all that is needed with a * chunk descriptor. diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index fd223c94589a..09c59154634d 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -230,6 +230,7 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen); +struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len); struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5b44d228b6ca..e5d470cd7c40 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1218,6 +1218,32 @@ nodata: return retval; } +/* RFC4820 3. Padding Chunk (PAD) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0x84 | Flags=0 | Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * \ Padding Data / + * / \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) +{ + struct sctp_chunk *retval; + + retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); + if (!retval) + return NULL; + + skb_put_zero(retval->skb, len); + retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); + retval->chunk_end = skb_tail_pointer(retval->skb); + + return retval; +} + /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 88ea87f4f0e7..c82c4233ec6b 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -526,6 +526,26 @@ auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ +static const struct sctp_sm_table_entry +pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { + /* SCTP_STATE_CLOSED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_COOKIE_WAIT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_COOKIE_ECHOED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_ESTABLISHED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_PENDING */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_SENT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_RECEIVED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), +}; /* chunk pad */ + static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ @@ -992,6 +1012,9 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; + + case SCTP_CID_PAD: + return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; -- cgit v1.2.3-58-ga151 From d1e462a7a5f359cbb9a0e8fbfafcfb6657034105 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:48 -0400 Subject: sctp: add probe_interval in sysctl and sock/asoc/transport PLPMTUD can be enabled by doing 'sysctl -w net.sctp.probe_interval=n'. 'n' is the interval for PLPMTUD probe timer in milliseconds, and it can't be less than 5000 if it's not 0. All asoc/transport's PLPMTUD in a new socket will be enabled by default. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 8 ++++++++ include/net/netns/sctp.h | 3 +++ include/net/sctp/constants.h | 2 ++ include/net/sctp/structs.h | 3 +++ net/sctp/associola.c | 2 ++ net/sctp/socket.c | 1 + net/sctp/sysctl.c | 35 ++++++++++++++++++++++++++++++++++ 7 files changed, 54 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b0436d3a4f11..8bff728b3a1e 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2834,6 +2834,14 @@ encap_port - INTEGER Default: 0 +plpmtud_probe_interval - INTEGER + The time interval (in milliseconds) for sending PLPMTUD probe chunks. + These chunks are sent at the specified interval with a variable size + to probe the mtu of a given path between 2 endpoints. PLPMTUD will + be disabled when 0 is set, and other values for it must be >= 5000. + + Default: 0 + ``/proc/sys/net/core/*`` ======================== diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index a0f315effa94..40240722cdca 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -84,6 +84,9 @@ struct netns_sctp { /* HB.interval - 30 seconds */ unsigned int hb_interval; + /* The interval for PLPMTUD probe timer */ + unsigned int probe_interval; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 14a0d22c9113..449cf9cb428b 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -424,4 +424,6 @@ enum { */ #define SCTP_AUTH_RANDOM_LENGTH 32 +#define SCTP_PROBE_TIMER_MIN 5000 + #endif /* __sctp_constants_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 1aa585216f34..bf5d22deaefb 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -177,6 +177,7 @@ struct sctp_sock { * will be inherited by all new associations. */ __u32 hbinterval; + __u32 probe_interval; __be16 udp_port; __be16 encap_port; @@ -858,6 +859,7 @@ struct sctp_transport { * the destination address every heartbeat interval. */ unsigned long hbinterval; + unsigned long probe_interval; /* SACK delay timeout */ unsigned long sackdelay; @@ -1795,6 +1797,7 @@ struct sctp_association { * will be inherited by all new transports. */ unsigned long hbinterval; + unsigned long probe_interval; __be16 encap_port; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 336df4b36655..e01895edd3a4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -98,6 +98,7 @@ static struct sctp_association *sctp_association_init( * sock configured value. */ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + asoc->probe_interval = msecs_to_jiffies(sp->probe_interval); asoc->encap_port = sp->encap_port; @@ -625,6 +626,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * association configured value. */ peer->hbinterval = asoc->hbinterval; + peer->probe_interval = asoc->probe_interval; peer->encap_port = asoc->encap_port; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a79d193ff872..d2960ab665a5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4989,6 +4989,7 @@ static int sctp_init_sock(struct sock *sk) atomic_set(&sp->pd_mode, 0); skb_queue_head_init(&sp->pd_lobby); sp->frag_interleave = 0; + sp->probe_interval = net->sctp.probe_interval; /* Create a per socket endpoint structure. Even if we * change the data structure relationships, this may still diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 55871b277f47..b46a416787ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -55,6 +55,8 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -293,6 +295,13 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "plpmtud_probe_interval", + .data = &init_net.sctp.probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_sctp_do_probe_interval, + }, { .procname = "udp_port", .data = &init_net.sctp.udp_port, @@ -539,6 +548,32 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.probe_interval; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + if (new_value && new_value < SCTP_PROBE_TIMER_MIN) + return -EINVAL; + + net->sctp.probe_interval = new_value; + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; -- cgit v1.2.3-58-ga151 From 3190b649b4d9391be7bde3edd8e924e451c5d2f6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:49 -0400 Subject: sctp: add SCTP_PLPMTUD_PROBE_INTERVAL sockopt for sock/asoc/transport With this socket option, users can change probe_interval for a transport, asoc or sock after it's created. Note that if the change is for an asoc, also apply the change to each transport in this asoc. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 8 ++++ net/sctp/socket.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index cb78e7a739da..c4ff1ebd8bcc 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -141,6 +141,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE 131 #define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE #define SCTP_REMOTE_UDP_ENCAPS_PORT 132 +#define SCTP_PLPMTUD_PROBE_INTERVAL 133 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -1213,4 +1214,11 @@ enum sctp_sched_type { SCTP_SS_MAX = SCTP_SS_RR }; +/* Probe Interval socket option */ +struct sctp_probeinterval { + sctp_assoc_t spi_assoc_id; + struct sockaddr_storage spi_address; + __u32 spi_interval; +}; + #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d2960ab665a5..aba576f53458 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4481,6 +4481,58 @@ static int sctp_setsockopt_encap_port(struct sock *sk, return 0; } +static int sctp_setsockopt_probe_interval(struct sock *sk, + struct sctp_probeinterval *params, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + __u32 probe_interval; + + if (optlen != sizeof(*params)) + return -EINVAL; + + probe_interval = params->spi_interval; + if (probe_interval && probe_interval < SCTP_PROBE_TIMER_MIN) + return -EINVAL; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)¶ms->spi_address)) { + t = sctp_addr_id2transport(sk, ¶ms->spi_address, + params->spi_assoc_id); + if (!t) + return -EINVAL; + + t->probe_interval = msecs_to_jiffies(probe_interval); + return 0; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, params->spi_assoc_id); + if (!asoc && params->spi_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + return -EINVAL; + + /* If changes are for association, also apply probe_interval to + * each transport. + */ + if (asoc) { + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) + t->probe_interval = msecs_to_jiffies(probe_interval); + + asoc->probe_interval = msecs_to_jiffies(probe_interval); + return 0; + } + + sctp_sk(sk)->probe_interval = probe_interval; + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4703,6 +4755,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_setsockopt_encap_port(sk, kopt, optlen); break; + case SCTP_PLPMTUD_PROBE_INTERVAL: + retval = sctp_setsockopt_probe_interval(sk, kopt, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7906,6 +7961,66 @@ out: return 0; } +static int sctp_getsockopt_probe_interval(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_probeinterval params; + struct sctp_association *asoc; + struct sctp_transport *t; + __u32 probe_interval; + + if (len < sizeof(params)) + return -EINVAL; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spi_address)) { + t = sctp_addr_id2transport(sk, ¶ms.spi_address, + params.spi_assoc_id); + if (!t) { + pr_debug("%s: failed no transport\n", __func__); + return -EINVAL; + } + + probe_interval = jiffies_to_msecs(t->probe_interval); + goto out; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spi_assoc_id); + if (!asoc && params.spi_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + pr_debug("%s: failed no association\n", __func__); + return -EINVAL; + } + + if (asoc) { + probe_interval = jiffies_to_msecs(asoc->probe_interval); + goto out; + } + + probe_interval = sctp_sk(sk)->probe_interval; + +out: + params.spi_interval = probe_interval; + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8129,6 +8244,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); break; + case SCTP_PLPMTUD_PROBE_INTERVAL: + retval = sctp_getsockopt_probe_interval(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3-58-ga151 From 92548ec2f1f92d0c0b60ce59592b645571672568 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:51 -0400 Subject: sctp: add the probe timer in transport for PLPMTUD There are 3 timers described in rfc8899#section-5.1.1: PROBE_TIMER, PMTU_RAISE_TIMER, CONFIRMATION_TIMER This patches adds a 'probe_timer' in transport, and it works as either PROBE_TIMER or PMTU_RAISE_TIMER. At most time, it works as PROBE_TIMER and expires every a 'probe_interval' time to send the HB probe packet. When transport pl enters COMPLETE state, it works as PMTU_RAISE_TIMER and expires in 'probe_interval * 30' time to go back to SEARCH state and do searching again. SCTP HB is an acknowledged packet, CONFIRMATION_TIMER is not needed. The timer will start when transport pl enters BASE state and stop when it enters DISABLED state. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 + include/net/sctp/constants.h | 1 + include/net/sctp/sctp.h | 9 ++++++++- include/net/sctp/sm.h | 2 ++ include/net/sctp/structs.h | 4 ++++ net/sctp/debug.c | 1 + net/sctp/sm_sideeffect.c | 37 +++++++++++++++++++++++++++++++++++++ net/sctp/sm_statefuns.c | 17 +++++++++++++++++ net/sctp/sm_statetable.c | 20 ++++++++++++++++++++ net/sctp/transport.c | 18 ++++++++++++++++++ 10 files changed, 109 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 5e848884ff61..2058fabffbf6 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -59,6 +59,7 @@ enum sctp_verb { SCTP_CMD_HB_TIMERS_START, /* Start the heartbeat timers. */ SCTP_CMD_HB_TIMER_UPDATE, /* Update a heartbeat timers. */ SCTP_CMD_HB_TIMERS_STOP, /* Stop the heartbeat timers. */ + SCTP_CMD_PROBE_TIMER_UPDATE, /* Update a probe timer. */ SCTP_CMD_TRANSPORT_HB_SENT, /* Reset the status of a transport. */ SCTP_CMD_TRANSPORT_IDLE, /* Do manipulations on idle transport */ SCTP_CMD_TRANSPORT_ON, /* Mark the transport as active. */ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 85f6a105c59d..265fffa33dad 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -77,6 +77,7 @@ enum sctp_event_timeout { SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, SCTP_EVENT_TIMEOUT_HEARTBEAT, SCTP_EVENT_TIMEOUT_RECONF, + SCTP_EVENT_TIMEOUT_PROBE, SCTP_EVENT_TIMEOUT_SACK, SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 08347d3f004f..f7e083602c10 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -635,10 +635,14 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; + sctp_transport_reset_probe_timer(t); } } else { - if (t->pl.state != SCTP_PL_DISABLED) + if (t->pl.state != SCTP_PL_DISABLED) { + if (del_timer(&t->probe_timer)) + sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; + } } } @@ -647,6 +651,9 @@ static inline void sctp_transport_pl_update(struct sctp_transport *t) if (t->pl.state == SCTP_PL_DISABLED) return; + if (del_timer(&t->probe_timer)) + sctp_transport_put(t); + t->pl.state = SCTP_PL_BASE; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pl.probe_size = SCTP_BASE_PLPMTU; diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 09c59154634d..45542e2bac93 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -151,6 +151,7 @@ sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort; /* Prototypes for timeout event state functions. */ sctp_state_fn_t sctp_sf_do_6_3_3_rtx; sctp_state_fn_t sctp_sf_send_reconf; +sctp_state_fn_t sctp_sf_send_probe; sctp_state_fn_t sctp_sf_do_6_2_sack; sctp_state_fn_t sctp_sf_autoclose_timer_expire; @@ -311,6 +312,7 @@ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, void sctp_generate_t3_rtx_event(struct timer_list *t); void sctp_generate_heartbeat_event(struct timer_list *t); void sctp_generate_reconf_event(struct timer_list *t); +void sctp_generate_probe_event(struct timer_list *t); void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 85d3566c2227..a3772f8ee7f6 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -936,6 +936,9 @@ struct sctp_transport { /* Timer to handler reconf chunk rtx */ struct timer_list reconf_timer; + /* Timer to send a probe HB packet for PLPMTUD */ + struct timer_list probe_timer; + /* Since we're using per-destination retransmission timers * (see above), we're also using per-destination "transmitted" * queues. This probably ought to be a private struct @@ -1003,6 +1006,7 @@ void sctp_transport_free(struct sctp_transport *); void sctp_transport_reset_t3_rtx(struct sctp_transport *); void sctp_transport_reset_hb_timer(struct sctp_transport *); void sctp_transport_reset_reconf_timer(struct sctp_transport *transport); +void sctp_transport_reset_probe_timer(struct sctp_transport *transport); int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); diff --git a/net/sctp/debug.c b/net/sctp/debug.c index c4d9c7feffb9..ccd773e4c371 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -154,6 +154,7 @@ static const char *const sctp_timer_tbl[] = { "TIMEOUT_T5_SHUTDOWN_GUARD", "TIMEOUT_HEARTBEAT", "TIMEOUT_RECONF", + "TIMEOUT_PROBE", "TIMEOUT_SACK", "TIMEOUT_AUTOCLOSE", }; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index ce15d590a615..b3815b568e8e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -471,6 +471,38 @@ out_unlock: sctp_transport_put(transport); } +/* Handle the timeout of the probe timer. */ +void sctp_generate_probe_event(struct timer_list *t) +{ + struct sctp_transport *transport = from_timer(transport, t, probe_timer); + struct sctp_association *asoc = transport->asoc; + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + int error = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + pr_debug("%s: sock is busy\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE), + asoc->state, asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + sk->sk_err = -error; + +out_unlock: + bh_unlock_sock(sk); + sctp_transport_put(transport); +} + /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(struct timer_list *t) { @@ -1641,6 +1673,11 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, sctp_cmd_hb_timers_stop(commands, asoc); break; + case SCTP_CMD_PROBE_TIMER_UPDATE: + t = cmd->obj.transport; + sctp_transport_reset_probe_timer(t); + break; + case SCTP_CMD_REPORT_ERROR: error = cmd->obj.error; break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4f30388a0dd0..3b99eda50618 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1095,6 +1095,23 @@ enum sctp_disposition sctp_sf_send_reconf(struct net *net, return SCTP_DISPOSITION_CONSUME; } +/* send hb chunk with padding for PLPMUTD. */ +enum sctp_disposition sctp_sf_send_probe(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *)arg; + + /* The actual handling will be performed here in a later patch. */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE, + SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + /* * Process an heartbeat request. * diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index c82c4233ec6b..1816a4410b2b 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -967,6 +967,25 @@ other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } +#define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_send_probe), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, @@ -978,6 +997,7 @@ timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, + TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bf0ac467e757..ca3343c2c80e 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -75,6 +75,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); + timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0); timer_setup(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, 0); @@ -131,6 +132,9 @@ void sctp_transport_free(struct sctp_transport *transport) if (del_timer(&transport->reconf_timer)) sctp_transport_put(transport); + if (del_timer(&transport->probe_timer)) + sctp_transport_put(transport); + /* Delete the ICMP proto unreachable timer if it's active. */ if (del_timer(&transport->proto_unreach_timer)) sctp_transport_put(transport); @@ -207,6 +211,20 @@ void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) sctp_transport_hold(transport); } +void sctp_transport_reset_probe_timer(struct sctp_transport *transport) +{ + int scale = 1; + + if (timer_pending(&transport->probe_timer)) + return; + if (transport->pl.state == SCTP_PL_COMPLETE && + transport->pl.probe_count == 1) + scale = 30; /* works as PMTU_RAISE_TIMER */ + if (!mod_timer(&transport->probe_timer, + jiffies + transport->probe_interval * scale)) + sctp_transport_hold(transport); +} + /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. -- cgit v1.2.3-58-ga151 From fe59379b9ab7ddad157f5379fa47dbf84c9b5e09 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:52 -0400 Subject: sctp: do the basic send and recv for PLPMTUD probe This patch does exactly what rfc8899#section-6.2.1.2 says: The SCTP sender needs to be able to determine the total size of a probe packet. The HEARTBEAT chunk could carry a Heartbeat Information parameter that includes, besides the information suggested in [RFC4960], the probe size to help an implementation associate a HEARTBEAT ACK with the size of probe that was sent. The sender could also use other methods, such as sending a nonce and verifying the information returned also contains the corresponding nonce. The length of the PAD chunk is computed by reducing the probing size by the size of the SCTP common header and the HEARTBEAT chunk. Note that HB ACK chunk will carry back whatever HB chunk carried, including the probe_size we put it in; We also check hbinfo->probe_size in the HB ACK against link->pl.probe_size to validate this HB ACK chunk. v1->v2: - Remove the unused 'sp' and add static for sctp_packet_bundle_pad(). Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 3 ++- include/net/sctp/structs.h | 2 ++ net/sctp/output.c | 30 +++++++++++++++++++++++++++++- net/sctp/outqueue.c | 13 +++++++++++-- net/sctp/sm_make_chunk.c | 5 ++++- net/sctp/sm_statefuns.c | 20 ++++++++++++++++++-- 6 files changed, 66 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 45542e2bac93..2eb6d7c2c931 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -226,7 +226,8 @@ struct sctp_chunk *sctp_make_new_encap_port( const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, - const struct sctp_transport *transport); + const struct sctp_transport *transport, + __u32 probe_size); struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a3772f8ee7f6..f7b056f5af37 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -386,6 +386,7 @@ struct sctp_sender_hb_info { union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; + __u32 probe_size; }; int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, @@ -657,6 +658,7 @@ struct sctp_chunk { data_accepted:1, /* At least 1 chunk accepted */ auth:1, /* IN: was auth'ed | OUT: needs auth */ has_asconf:1, /* IN: have seen an asconf before */ + pmtu_probe:1, /* Used by PLPMTUD, can be set in s HB chunk */ tsn_missing_report:2, /* Data chunk missing counter. */ fast_retransmit:2; /* Is this chunk fast retransmitted? */ }; diff --git a/net/sctp/output.c b/net/sctp/output.c index a6aa17df09ef..ceefb0616d9d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -211,6 +211,30 @@ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, return retval; } +/* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ +static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) +{ + struct sctp_transport *t = pkt->transport; + struct sctp_chunk *pad; + int overhead = 0; + + if (!chunk->pmtu_probe) + return SCTP_XMIT_OK; + + /* calculate the Padding Data size for the pad chunk */ + overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); + pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); + if (!pad) + return SCTP_XMIT_DELAY; + + list_add_tail(&pad->list, &pkt->chunk_list); + pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); + chunk->transport = t; + + return SCTP_XMIT_OK; +} + /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) @@ -382,6 +406,10 @@ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; retval = __sctp_packet_append_chunk(packet, chunk); + if (retval != SCTP_XMIT_OK) + goto finish; + + retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; @@ -553,7 +581,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sk = chunk->skb->sk; /* check gso */ - if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (!sk_can_gso(sk)) { pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 5cb1aa5f067b..ff47091c385e 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -769,7 +769,11 @@ static int sctp_packet_singleton(struct sctp_transport *transport, sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); - sctp_packet_append_chunk(&singleton, chunk); + if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { + list_del_init(&chunk->list); + sctp_chunk_free(chunk); + return -ENOMEM; + } return sctp_packet_transmit(&singleton, gfp); } @@ -929,8 +933,13 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) one_packet = 1; fallthrough; - case SCTP_CID_SACK: case SCTP_CID_HEARTBEAT: + if (chunk->pmtu_probe) { + sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); + break; + } + fallthrough; + case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e5d470cd7c40..b0eaa93a9cc6 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1160,7 +1160,8 @@ nodata: /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, - const struct sctp_transport *transport) + const struct sctp_transport *transport, + __u32 probe_size) { struct sctp_sender_hb_info hbinfo; struct sctp_chunk *retval; @@ -1176,6 +1177,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; + hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. @@ -1183,6 +1185,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); + retval->pmtu_probe = !!probe_size; nodata: return retval; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3b99eda50618..8edb9186112a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1004,7 +1004,7 @@ static enum sctp_disposition sctp_sf_heartbeat( struct sctp_chunk *reply; /* Send a heartbeat to our peer. */ - reply = sctp_make_heartbeat(asoc, transport); + reply = sctp_make_heartbeat(asoc, transport, 0); if (!reply) return SCTP_DISPOSITION_NOMEM; @@ -1104,8 +1104,15 @@ enum sctp_disposition sctp_sf_send_probe(struct net *net, struct sctp_cmd_seq *commands) { struct sctp_transport *transport = (struct sctp_transport *)arg; + struct sctp_chunk *reply; + + if (!sctp_transport_pl_enabled(transport)) + return SCTP_DISPOSITION_CONSUME; - /* The actual handling will be performed here in a later patch. */ + reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size); + if (!reply) + return SCTP_DISPOSITION_NOMEM; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE, SCTP_TRANSPORT(transport)); @@ -1260,6 +1267,15 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, if (hbinfo->hb_nonce != link->hb_nonce) return SCTP_DISPOSITION_DISCARD; + if (hbinfo->probe_size) { + if (hbinfo->probe_size != link->pl.probe_size || + !sctp_transport_pl_enabled(link)) + return SCTP_DISPOSITION_DISCARD; + + /* The actual handling will be performed here in a later patch. */ + return SCTP_DISPOSITION_CONSUME; + } + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ -- cgit v1.2.3-58-ga151 From 1dc68c194571acc4027de5f8378227d0c0ff7e13 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:53 -0400 Subject: sctp: do state transition when PROBE_COUNT == MAX_PROBES on HB send path The state transition is described in rfc8899#section-5.2, PROBE_COUNT == MAX_PROBES means the probe fails for MAX times, and the state transition includes: - Base -> Error, occurs when BASE_PLPMTU Confirmation Fails, pl.pmtu is set to SCTP_MIN_PLPMTU, probe_size is still SCTP_BASE_PLPMTU; - Search -> Base, occurs when Black Hole Detected, pl.pmtu is set to SCTP_BASE_PLPMTU, probe_size is set back to SCTP_BASE_PLPMTU; - Search Complete -> Base, occurs when Black Hole Detected pl.pmtu is set to SCTP_BASE_PLPMTU, probe_size is set back to SCTP_BASE_PLPMTU; Note a black hole is encountered when a sender is unaware that packets are not being delivered to the destination endpoint. So it includes the probe failures with equal probe_size to pl.pmtu, and definitely not include that with greater probe_size than pl.pmtu. The later one is the normal probe failure where probe_size should decrease back to pl.pmtu and pl.probe_high is set. pl.probe_high would be used on HB ACK recv path in the next patch. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/sm_statefuns.c | 2 ++ net/sctp/transport.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f7b056f5af37..31165720b28a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1023,6 +1023,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); +void sctp_transport_pl_send(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8edb9186112a..66c409e5b47c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1109,6 +1109,8 @@ enum sctp_disposition sctp_sf_send_probe(struct net *net, if (!sctp_transport_pl_enabled(transport)) return SCTP_DISPOSITION_CONSUME; + sctp_transport_pl_send(transport); + reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size); if (!reply) return SCTP_DISPOSITION_NOMEM; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ca3343c2c80e..99620d86e317 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -261,6 +261,50 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } +void sctp_transport_pl_send(struct sctp_transport *t) +{ + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + + if (t->pl.probe_count < SCTP_MAX_PROBES) { + t->pl.probe_count++; + return; + } + + if (t->pl.state == SCTP_PL_BASE) { + if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ + t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ + + t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ + t->pl.state = SCTP_PL_BASE; /* Search -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_high = 0; + + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } else { /* Normal probe failure. */ + t->pl.probe_high = t->pl.probe_size; + t->pl.probe_size = t->pl.pmtu; + } + } else if (t->pl.state == SCTP_PL_COMPLETE) { + if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ + t->pl.state = SCTP_PL_BASE; /* Search Complete -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } + t->pl.probe_count = 1; +} + bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); -- cgit v1.2.3-58-ga151 From b87641aff9e772fda15d3386d159646eada2ceef Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:54 -0400 Subject: sctp: do state transition when a probe succeeds on HB ACK recv path As described in rfc8899#section-5.2, when a probe succeeds, there might be the following state transitions: - Base -> Search, occurs when probe succeeds with BASE_PLPMTU, pl.pmtu is not changing, pl.probe_size increases by SCTP_PL_BIG_STEP, - Error -> Search, occurs when probe succeeds with BASE_PLPMTU, pl.pmtu is changed from SCTP_MIN_PLPMTU to SCTP_BASE_PLPMTU, pl.probe_size increases by SCTP_PL_BIG_STEP. - Search -> Search Complete, occurs when probe succeeds with the probe size SCTP_MAX_PLPMTU less than pl.probe_high, pl.pmtu is not changing, but update *pathmtu* with it, pl.probe_size is set back to pl.pmtu to double check it. - Search Complete -> Search, occurs when probe succeeds with the probe size equal to pl.pmtu, pl.pmtu is not changing, pl.probe_size increases by SCTP_PL_MIN_STEP. So search process can be described as: 1. When it just enters 'Search' state, *pathmtu* is not updated with pl.pmtu, and probe_size increases by a big step (SCTP_PL_BIG_STEP) each round. 2. Until pl.probe_high is set when a probe fails, and probe_size decreases back to pl.pmtu, as described in the last patch. 3. When the probe with the new size succeeds, probe_size changes to increase by a small step (SCTP_PL_MIN_STEP) due to pl.probe_high is set. 4. Until probe_size is next to pl.probe_high, the searching finishes and it goes to 'Complete' state and updates *pathmtu* with pl.pmtu, and then probe_size is set to pl.pmtu to confirm by once more probe. 5. This probe occurs after "30 * probe_inteval", a much longer time than that in Search state. Once it is done it goes to 'Search' state again with probe_size increased by SCTP_PL_MIN_STEP. As we can see above, during the searching, pl.pmtu changes while *pathmtu* doesn't. *pathmtu* is only updated when the search finishes by which it gets an optimal value for it. A big step is used at the beginning until it gets close to the optimal value, then it changes to a small step until it has this optimal value. The small step is also used in 'Complete' until it goes to 'Search' state again and the probe with 'pmtu + the small step' succeeds, which means a higher size could be used. Then probe_size changes to increase by a big step again until it gets close to the next optimal value. Note that anytime when black hole is detected, it goes directly to 'Base' state with pl.pmtu set to SCTP_BASE_PLPMTU, as described in the last patch. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/sm_statefuns.c | 2 +- net/sctp/transport.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 31165720b28a..9eaa701cda23 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1024,6 +1024,7 @@ void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); void sctp_transport_pl_send(struct sctp_transport *t); +void sctp_transport_pl_recv(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 66c409e5b47c..d29b579da904 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1274,7 +1274,7 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, !sctp_transport_pl_enabled(link)) return SCTP_DISPOSITION_DISCARD; - /* The actual handling will be performed here in a later patch. */ + sctp_transport_pl_recv(link); return SCTP_DISPOSITION_CONSUME; } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 99620d86e317..79ff5ca6b472 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -305,6 +305,44 @@ void sctp_transport_pl_send(struct sctp_transport *t) t->pl.probe_count = 1; } +void sctp_transport_pl_recv(struct sctp_transport *t) +{ + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + + t->pl.pmtu = t->pl.probe_size; + t->pl.probe_count = 0; + if (t->pl.state == SCTP_PL_BASE) { + t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */ + t->pl.probe_size += SCTP_PL_BIG_STEP; + } else if (t->pl.state == SCTP_PL_ERROR) { + t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */ + + t->pl.pmtu = t->pl.probe_size; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + t->pl.probe_size += SCTP_PL_BIG_STEP; + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (!t->pl.probe_high) { + t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, + SCTP_MAX_PLPMTU); + return; + } + t->pl.probe_size += SCTP_PL_MIN_STEP; + if (t->pl.probe_size >= t->pl.probe_high) { + t->pl.probe_high = 0; + t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ + + t->pl.probe_size = t->pl.pmtu; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } else if (t->pl.state == SCTP_PL_COMPLETE) { + t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ + t->pl.probe_size += SCTP_PL_MIN_STEP; + } +} + bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); -- cgit v1.2.3-58-ga151 From 83696408317735d105ad86a5470b39879ad2ec4d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:55 -0400 Subject: sctp: do state transition when receiving an icmp TOOBIG packet PLPMTUD will short-circuit the old process for icmp TOOBIG packets. This part is described in rfc8899#section-4.6.2 (PL_PTB_SIZE = PTB_SIZE - other_headers_len). Note that from rfc8899#section-5.2 State Machine, each case below is for some specific states only: a) PL_PTB_SIZE < MIN_PLPMTU || PL_PTB_SIZE >= PROBED_SIZE, discard it, for any state b) MIN_PLPMTU < PL_PTB_SIZE < BASE_PLPMTU, Base -> Error, for Base state c) BASE_PLPMTU <= PL_PTB_SIZE < PLPMTU, Search -> Base or Complete -> Base, for Search and Complete states. d) PLPMTU < PL_PTB_SIZE < PROBED_SIZE, set pl.probe_size to PL_PTB_SIZE then verify it, for Search state. The most important one is case d), which will help find the optimal fast during searching. Like when pathmtu = 1392 for SCTP over IPv4, the search will be (20 is iphdr_len): 1. probe with 1200 - 20 2. probe with 1232 - 20 3. probe with 1264 - 20 ... 7. probe with 1388 - 20 8. probe with 1420 - 20 When sending the probe with 1420 - 20, TOOBIG may come with PL_PTB_SIZE = 1392 - 20. Then it matches case d), and saves some rounds to try with the 1392 - 20 probe. But of course, PLPMTUD doesn't trust TOOBIG packets, and it will go back to the common searching once the probe with the new size can't be verified. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 4 +++- net/sctp/transport.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index d508f6f3dd08..9ffdbd6526e9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -385,7 +385,9 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (!t || (t->pathmtu <= pmtu)) + if (!t || + (t->pathmtu <= pmtu && + t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 79ff5ca6b472..5cefb4eab8a0 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -343,10 +343,55 @@ void sctp_transport_pl_recv(struct sctp_transport *t) } } +static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) +{ + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu); + + if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size) + return false; + + if (t->pl.state == SCTP_PL_BASE) { + if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { + t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ + + t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { + t->pl.state = SCTP_PL_BASE; /* Search -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_count = 0; + + t->pl.probe_high = 0; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { + t->pl.probe_size = pmtu; + t->pl.probe_count = 0; + + return false; + } + } else if (t->pl.state == SCTP_PL_COMPLETE) { + if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { + t->pl.state = SCTP_PL_BASE; /* Complete -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_count = 0; + + t->pl.probe_high = 0; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } + } + + return true; +} + bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { - struct dst_entry *dst = sctp_transport_dst_check(t); struct sock *sk = t->asoc->base.sk; + struct dst_entry *dst; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { @@ -357,6 +402,10 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) } pmtu = SCTP_TRUNC4(pmtu); + if (sctp_transport_pl_enabled(t)) + return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t)); + + dst = sctp_transport_dst_check(t); if (dst) { struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); union sctp_addr addr; -- cgit v1.2.3-58-ga151 From 7307e4fa4d295f6dc017fe4b19467c486a1275d7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:56 -0400 Subject: sctp: enable PLPMTUD when the transport is ready sctp_transport_pl_reset() is called whenever any of these 3 members in transport is changed: - probe_interval - param_flags & SPP_PMTUD_ENABLE - state == ACTIVE If all are true, start the PLPMTUD when it's not yet started. If any of these is false, stop the PLPMTUD when it's already running. sctp_transport_pl_update() is called when the transport dst has changed. It will restart the PLPMTUD probe. Again, the pathmtu won't change but use the dst's mtu until the Search phase is done. Note that after using PLPMTUD, the pathmtu is only initialized with the dst mtu when the transport dst changes. At other time it is updated by pl.pmtu. So sctp_transport_pmtu_check() will be called only when PLPMTUD is disabled in sctp_packet_config(). After this patch, the PLPMTUD feature from RFC8899 will be activated and can be used by users. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/associola.c | 4 ++++ net/sctp/output.c | 3 ++- net/sctp/socket.c | 6 +++++- net/sctp/transport.c | 2 ++ 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e01895edd3a4..be29da09cc7a 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -716,6 +716,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return NULL; } + sctp_transport_pl_reset(peer); + /* Attach the remote transport to our asoc. */ list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; @@ -814,6 +816,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, spc_state = SCTP_ADDR_CONFIRMED; transport->state = SCTP_ACTIVE; + sctp_transport_pl_reset(transport); break; case SCTP_TRANSPORT_DOWN: @@ -823,6 +826,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, */ if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; + sctp_transport_pl_reset(transport); spc_state = SCTP_ADDR_UNREACHABLE; } else { sctp_transport_dst_release(transport); diff --git a/net/sctp/output.c b/net/sctp/output.c index ceefb0616d9d..9032ce60d50e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -103,7 +103,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); - } else if (!sctp_transport_pmtu_check(tp)) { + } else if (!sctp_transport_pl_enabled(tp) && + !sctp_transport_pmtu_check(tp)) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aba576f53458..e64e01f61b11 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2496,6 +2496,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, sctp_transport_pmtu(trans, sctp_opt2sk(sp)); sctp_assoc_sync_pmtu(asoc); } + sctp_transport_pl_reset(trans); } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; @@ -4506,6 +4507,7 @@ static int sctp_setsockopt_probe_interval(struct sock *sk, return -EINVAL; t->probe_interval = msecs_to_jiffies(probe_interval); + sctp_transport_pl_reset(t); return 0; } @@ -4522,8 +4524,10 @@ static int sctp_setsockopt_probe_interval(struct sock *sk, * each transport. */ if (asoc) { - list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->probe_interval = msecs_to_jiffies(probe_interval); + sctp_transport_pl_reset(t); + } asoc->probe_interval = msecs_to_jiffies(probe_interval); return 0; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 5cefb4eab8a0..f27b856ea8ce 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -259,6 +259,8 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = sctp_dst_mtu(transport->dst); else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; + + sctp_transport_pl_update(transport); } void sctp_transport_pl_send(struct sctp_transport *t) -- cgit v1.2.3-58-ga151 From 237a6a2e318c1ed7429e72f2640054bdda91646f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:57 -0400 Subject: sctp: remove the unessessary hold for idev in sctp_v6_err Same as in tcp_v6_err() and __udp6_lib_err(), there's no need to hold idev in sctp_v6_err(), so just call __in6_dev_get() instead. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index bd08807c9e44..50ed4de18069 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -126,7 +126,6 @@ static struct notifier_block sctp_inet6addr_notifier = { static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - struct inet6_dev *idev; struct sock *sk; struct sctp_association *asoc; struct sctp_transport *transport; @@ -135,8 +134,6 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int err, ret = 0; struct net *net = dev_net(skb->dev); - idev = in6_dev_get(skb->dev); - /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; @@ -147,9 +144,8 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { - __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); - ret = -ENOENT; - goto out; + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return -ENOENT; } /* Warning: The sock lock is held. Remember to call @@ -185,10 +181,6 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out_unlock: sctp_err_finish(sk, transport); -out: - if (likely(idev != NULL)) - in6_dev_put(idev); - return ret; } -- cgit v1.2.3-58-ga151 From f6549bd37b927655c6fecad88428a731cd8a4a34 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:58 -0400 Subject: sctp: extract sctp_v6_err_handle function from sctp_v6_err This patch is to extract sctp_v6_err_handle() from sctp_v6_err() to only handle the icmp err after the sock lookup, and it also makes the code clearer. sctp_v6_err_handle() will be used in sctp over udp's err handling in the following patch. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 76 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 50ed4de18069..6ad422f2d0d0 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -122,50 +122,28 @@ static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; -/* ICMP error handler. */ -static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, + __u8 type, __u8 code, __u32 info) { - struct sock *sk; - struct sctp_association *asoc; - struct sctp_transport *transport; + struct sctp_association *asoc = t->asoc; + struct sock *sk = asoc->base.sk; struct ipv6_pinfo *np; - __u16 saveip, savesctp; - int err, ret = 0; - struct net *net = dev_net(skb->dev); - - /* Fix up skb to look at the embedded net header. */ - saveip = skb->network_header; - savesctp = skb->transport_header; - skb_reset_network_header(skb); - skb_set_transport_header(skb, offset); - sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); - /* Put back, the original pointers. */ - skb->network_header = saveip; - skb->transport_header = savesctp; - if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return -ENOENT; - } - - /* Warning: The sock lock is held. Remember to call - * sctp_err_finish! - */ + int err = 0; switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) - sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); - goto out_unlock; + sctp_icmp_frag_needed(sk, asoc, t, info); + return; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { - sctp_icmp_proto_unreachable(sk, asoc, transport); - goto out_unlock; + sctp_icmp_proto_unreachable(sk, asoc, t); + return; } break; case NDISC_REDIRECT: - sctp_icmp_redirect(sk, transport, skb); - goto out_unlock; + sctp_icmp_redirect(sk, t, skb); + return; default: break; } @@ -175,13 +153,39 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); - } else { /* Only an error on timeout */ + } else { sk->sk_err_soft = err; } +} + +/* ICMP error handler. */ +static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + struct sctp_transport *transport; + struct sctp_association *asoc; + __u16 saveip, savesctp; + struct sock *sk; + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, offset); + sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); + /* Put back, the original pointers. */ + skb->network_header = saveip; + skb->transport_header = savesctp; + if (!sk) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return -ENOENT; + } -out_unlock: + sctp_v6_err_handle(transport, skb, type, code, ntohl(info)); sctp_err_finish(sk, transport); - return ret; + + return 0; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) -- cgit v1.2.3-58-ga151 From d83060759a652ccb64d7486fe38c8347b4a64048 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:04:59 -0400 Subject: sctp: extract sctp_v4_err_handle function from sctp_v4_err This patch is to extract sctp_v4_err_handle() from sctp_v4_err() to only handle the icmp err after the sock lookup, and it also makes the code clearer. sctp_v4_err_handle() will be used in sctp over udp's err handling in the following patch. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 106 +++++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 9ffdbd6526e9..83d58d42ea45 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -556,6 +556,49 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) sctp_transport_put(t); } +static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, + __u8 type, __u8 code, __u32 info) +{ + struct sctp_association *asoc = t->asoc; + struct sock *sk = asoc->base.sk; + int err = 0; + + switch (type) { + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + return; + if (code == ICMP_FRAG_NEEDED) { + sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); + return; + } + if (code == ICMP_PROT_UNREACH) { + sctp_icmp_proto_unreachable(sk, asoc, t); + return; + } + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + if (code == ICMP_EXC_FRAGTIME) + return; + + err = EHOSTUNREACH; + break; + case ICMP_REDIRECT: + sctp_icmp_redirect(sk, t, skb); + default: + return; + } + if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -574,22 +617,19 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; - const int ihlen = iph->ihl * 4; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; - struct sctp_association *asoc = NULL; + struct net *net = dev_net(skb->dev); struct sctp_transport *transport; - struct inet_sock *inet; + struct sctp_association *asoc; __u16 saveip, savesctp; - int err; - struct net *net = dev_net(skb->dev); + struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); - skb_set_transport_header(skb, ihlen); + skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; @@ -598,58 +638,10 @@ int sctp_v4_err(struct sk_buff *skb, __u32 info) __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } - /* Warning: The sock lock is held. Remember to call - * sctp_err_finish! - */ - switch (type) { - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out_unlock; - - /* PMTU discovery (RFC1191) */ - if (ICMP_FRAG_NEEDED == code) { - sctp_icmp_frag_needed(sk, asoc, transport, - SCTP_TRUNC4(info)); - goto out_unlock; - } else { - if (ICMP_PROT_UNREACH == code) { - sctp_icmp_proto_unreachable(sk, asoc, - transport); - goto out_unlock; - } - } - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - /* Ignore any time exceeded errors due to fragment reassembly - * timeouts. - */ - if (ICMP_EXC_FRAGTIME == code) - goto out_unlock; - - err = EHOSTUNREACH; - break; - case ICMP_REDIRECT: - sctp_icmp_redirect(sk, transport, skb); - /* Fall through to out_unlock. */ - default: - goto out_unlock; - } - - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { - sk->sk_err = err; - sk->sk_error_report(sk); - } else { /* Only an error on timeout */ - sk->sk_err_soft = err; - } - -out_unlock: + sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); + return 0; } -- cgit v1.2.3-58-ga151 From 9e47df005cab63e545671dba8dfd6852fff1c2cf Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jun 2021 14:05:00 -0400 Subject: sctp: process sctp over udp icmp err on sctp side Previously, sctp over udp was using udp tunnel's icmp err process, which only does sk lookup on sctp side. However for sctp's icmp error process, there are more things to do, like syncing assoc pmtu/retransmit packets for toobig type err, and starting proto_unreach_timer for unreach type err etc. Now after adding PLPMTUD, which also requires to process toobig type err on sctp side. This patch is to process icmp err on sctp side by parsing the type/code/info in .encap_err_lookup and call sctp's icmp processing functions. Note as the 'redirect' err process needs to know the outer ip(v6) header's, we have to leave it to udp(v6)_err to handle it. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 ++ net/sctp/input.c | 30 ++++++++++++++++++++++++++++++ net/sctp/ipv6.c | 30 ++++++++++++++++++++++++++++++ net/sctp/protocol.c | 21 ++------------------- 4 files changed, 64 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index f7e083602c10..69bab88ad66b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -145,6 +145,8 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); void sctp_err_finish(struct sock *, struct sctp_transport *); +int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb); +int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, diff --git a/net/sctp/input.c b/net/sctp/input.c index 83d58d42ea45..fe6429cc012f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -645,6 +645,36 @@ int sctp_v4_err(struct sk_buff *skb, __u32 info) return 0; } +int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sctp_association *asoc; + struct sctp_transport *t; + struct icmphdr *hdr; + __u32 info = 0; + + skb->transport_header += sizeof(struct udphdr); + sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); + if (!sk) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return -ENOENT; + } + + skb->transport_header -= sizeof(struct udphdr); + hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); + if (hdr->type == ICMP_REDIRECT) { + /* can't be handled without outer iphdr known, leave it to udp_err */ + sctp_err_finish(sk, t); + return 0; + } + if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) + info = ntohs(hdr->un.frag.mtu); + sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); + + sctp_err_finish(sk, t); + return 1; +} + /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 6ad422f2d0d0..05f81a4d0ee7 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -188,6 +188,36 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sctp_association *asoc; + struct sctp_transport *t; + struct icmp6hdr *hdr; + __u32 info = 0; + + skb->transport_header += sizeof(struct udphdr); + sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &t); + if (!sk) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return -ENOENT; + } + + skb->transport_header -= sizeof(struct udphdr); + hdr = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr)); + if (hdr->icmp6_type == NDISC_REDIRECT) { + /* can't be handled without outer ip6hdr known, leave it to udpv6_err */ + sctp_err_finish(sk, t); + return 0; + } + if (hdr->icmp6_type == ICMPV6_PKT_TOOBIG) + info = ntohl(hdr->icmp6_mtu); + sctp_v6_err_handle(t, skb, hdr->icmp6_type, hdr->icmp6_code, info); + + sctp_err_finish(sk, t); + return 1; +} + static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index baa4e770e4ba..bc5db0b404ce 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -850,23 +850,6 @@ static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) return 0; } -static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb) -{ - struct sctp_association *asoc; - struct sctp_transport *t; - int family; - - skb->transport_header += sizeof(struct udphdr); - family = (ip_hdr(skb)->version == 4) ? AF_INET : AF_INET6; - sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb), - &asoc, &t); - if (!sk) - return -ENOENT; - - sctp_err_finish(sk, t); - return 0; -} - int sctp_udp_sock_start(struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; @@ -885,7 +868,7 @@ int sctp_udp_sock_start(struct net *net) tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; - tuncfg.encap_err_lookup = sctp_udp_err_lookup; + tuncfg.encap_err_lookup = sctp_udp_v4_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp4_sock = sock->sk; @@ -907,7 +890,7 @@ int sctp_udp_sock_start(struct net *net) tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; - tuncfg.encap_err_lookup = sctp_udp_err_lookup; + tuncfg.encap_err_lookup = sctp_udp_v6_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp6_sock = sock->sk; #endif -- cgit v1.2.3-58-ga151 From d2f77960e5b03b2d373252b2ee150a4a14010f99 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 22 Jun 2021 12:25:18 -0700 Subject: mptcp: add sysctl allow_join_initial_addr_port This patch added a new sysctl, named allow_join_initial_addr_port, to control whether allow peers to send join requests to the IP address and port number used by the initial subflow. Suggested-by: Florian Westphal Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- Documentation/networking/mptcp-sysctl.rst | 13 +++++++++++++ net/mptcp/ctrl.c | 16 ++++++++++++++++ net/mptcp/protocol.h | 1 + 3 files changed, 30 insertions(+) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index ee06fd782465..76d939e688b8 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -32,3 +32,16 @@ checksum_enabled - BOOLEAN per-namespace sysctl. Default: 0 + +allow_join_initial_addr_port - BOOLEAN + Allow peers to send join requests to the IP address and port number used + by the initial subflow if the value is 1. This controls a flag that is + sent to the peer at connection time, and whether such join requests are + accepted or denied. + + Joins to addresses advertised with ADD_ADDR are not affected by this + value. + + This is a per-namespace sysctl. + + Default: 1 diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 6c2639bb9c19..7d738bd06f2c 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -24,6 +24,7 @@ struct mptcp_pernet { u8 mptcp_enabled; unsigned int add_addr_timeout; u8 checksum_enabled; + u8 allow_join_initial_addr_port; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -46,11 +47,17 @@ int mptcp_is_checksum_enabled(struct net *net) return mptcp_get_pernet(net)->checksum_enabled; } +int mptcp_allow_join_id0(struct net *net) +{ + return mptcp_get_pernet(net)->allow_join_initial_addr_port; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->checksum_enabled = 0; + pernet->allow_join_initial_addr_port = 1; } #ifdef CONFIG_SYSCTL @@ -80,6 +87,14 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "allow_join_initial_addr_port", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, {} }; @@ -98,6 +113,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; + table[3].data = &pernet->allow_join_initial_addr_port; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 160c2ab09f19..9aab5fb54716 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -540,6 +540,7 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); int mptcp_is_checksum_enabled(struct net *net); +int mptcp_allow_join_id0(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -- cgit v1.2.3-58-ga151 From bab6b88e056038f618b2fb977d95b05ad3da8d0c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 22 Jun 2021 12:25:19 -0700 Subject: mptcp: add allow_join_id0 in mptcp_out_options This patch defined a new flag MPTCP_CAP_DENY_JOIN_ID0 for the third bit, labeled "C" of the MP_CAPABLE option. Add a new flag allow_join_id0 in struct mptcp_out_options. If this flag is set, send out the MP_CAPABLE option with the flag MPTCP_CAP_DENY_JOIN_ID0. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 3 ++- net/mptcp/options.c | 6 ++++++ net/mptcp/protocol.h | 6 ++++-- net/mptcp/subflow.c | 1 + 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index d61bbbf11979..cb580b06152f 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -67,7 +67,8 @@ struct mptcp_out_options { u8 backup; u8 reset_reason:4, reset_transient:1, - csum_reqd:1; + csum_reqd:1, + allow_join_id0:1; u32 nonce; u64 thmac; u32 token; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 25189595ed1d..7a4b6d0bf3f6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -402,6 +402,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { @@ -490,6 +491,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK @@ -827,6 +829,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; + opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -1201,6 +1204,9 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; + if (!opts->allow_join_id0) + flag |= MPTCP_CAP_DENY_JOIN_ID0; + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9aab5fb54716..f2326f6074b9 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -79,8 +79,9 @@ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) -#define MPTCP_CAP_FLAG_MASK (0x3F) +#define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) @@ -350,7 +351,8 @@ struct mptcp_subflow_request_sock { u16 mp_capable : 1, mp_join : 1, backup : 1, - csum_reqd : 1; + csum_reqd : 1, + allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 585951e7e52f..e9e8ce862218 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -109,6 +109,7 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); + subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } -- cgit v1.2.3-58-ga151 From df377be38725ced628251c1a3b954ef932a5586e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 22 Jun 2021 12:25:20 -0700 Subject: mptcp: add deny_join_id0 in mptcp_options_received This patch added a new flag named deny_join_id0 in struct mptcp_options_received. Set it when MP_CAPABLE with the flag MPTCP_CAP_DENYJOIN_ID0 is received. Also add a new flag remote_deny_join_id0 in struct mptcp_pm_data. When the flag deny_join_id0 is set, set this remote_deny_join_id0 flag. In mptcp_pm_create_subflow_or_signal_addr, if the remote_deny_join_id0 flag is set, and the remote address id is zero, stop this connection. Suggested-by: Florian Westphal Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 7 +++++++ net/mptcp/pm.c | 1 + net/mptcp/pm_netlink.c | 3 ++- net/mptcp/protocol.h | 4 +++- net/mptcp/subflow.c | 2 ++ 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 7a4b6d0bf3f6..a05270996613 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -83,6 +83,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->csum_reqd = 1; + if (flags & MPTCP_CAP_DENY_JOIN_ID0) + mp_opt->deny_join_id0 = 1; + mp_opt->mp_capable = 1; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); @@ -360,6 +363,7 @@ void mptcp_get_options(const struct sock *sk, mp_opt->mp_prio = 0; mp_opt->reset = 0; mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); + mp_opt->deny_join_id0 = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -908,6 +912,9 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 9d00fa6d22e9..639271e09604 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -320,6 +320,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); + WRITE_ONCE(msk->pm.remote_deny_join_id0, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index d4732a4f223e..d2591ebf01d9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -451,7 +451,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check if should create a new subflow */ if (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.subflows < subflows_max && + !READ_ONCE(msk->pm.remote_deny_join_id0)) { local = select_local_address(pernet, msk); if (local) { struct mptcp_addr_info remote = { 0 }; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f2326f6074b9..f4eaa5f57e3f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -138,7 +138,8 @@ struct mptcp_options_received { mp_prio : 1, echo : 1, csum_reqd : 1, - backup : 1; + backup : 1, + deny_join_id0 : 1; u32 token; u32 nonce; u64 thmac; @@ -193,6 +194,7 @@ struct mptcp_pm_data { bool work_pending; bool accept_addr; bool accept_subflow; + bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e9e8ce862218..d55f4ef736a5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -408,6 +408,8 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (mp_opt.csum_reqd) WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); + if (mp_opt.deny_join_id0) + WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; -- cgit v1.2.3-58-ga151 From fde56eea01f96b664eb63033990be0fd2a945da5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 22 Jun 2021 12:25:23 -0700 Subject: mptcp: refine mptcp_cleanup_rbuf The current cleanup rbuf tries a bit too hard to avoid acquiring the subflow socket lock. We may end-up delaying the needed ack, or skip acking a blocked subflow. Address the above extending the conditions used to trigger the cleanup to reflect more closely what TCP does and invoking tcp_cleanup_rbuf() on all the active subflows. Note that we can't replicate the exact tests implemented in tcp_cleanup_rbuf(), as MPTCP lacks some of the required info - e.g. ping-pong mode. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 56 +++++++++++++++++++++------------------------------- net/mptcp/protocol.h | 1 - 2 files changed, 23 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cf75be02eb00..ce0c45dfb79e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -442,49 +442,46 @@ static void mptcp_send_ack(struct mptcp_sock *msk) } } -static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { bool slow; - int ret; slow = lock_sock_fast(ssk); - ret = tcp_can_send_ack(ssk); - if (ret) + if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); - return ret; +} + +static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) +{ + const struct inet_connection_sock *icsk = inet_csk(ssk); + bool ack_pending = READ_ONCE(icsk->icsk_ack.pending); + const struct tcp_sock *tp = tcp_sk(ssk); + + return (ack_pending & ICSK_ACK_SCHED) && + ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > + READ_ONCE(icsk->icsk_ack.rcv_mss)) || + (rx_empty && ack_pending & + (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { - struct sock *ack_hint = READ_ONCE(msk->ack_hint); int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - bool cleanup; + int space = __mptcp_space(sk); + bool cleanup, rx_empty; - /* this is a simple superset of what tcp_cleanup_rbuf() implements - * so that we don't have to acquire the ssk socket lock most of the time - * to do actually nothing - */ - cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); - if (!cleanup) - return; + cleanup = (space > 0) && (space >= (old_space << 1)); + rx_empty = !atomic_read(&sk->sk_rmem_alloc); - /* if the hinted ssk is still active, try to use it */ - if (likely(ack_hint)) { - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) - return; - } + if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) + mptcp_subflow_cleanup_rbuf(ssk); } - - /* otherwise pick the first active subflow */ - mptcp_for_each_subflow(msk, subflow) - if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) - return; } static bool mptcp_check_data_fin(struct sock *sk) @@ -629,7 +626,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); - WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; return done; @@ -1910,7 +1906,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - tcp_cleanup_rbuf(ssk, moved); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); @@ -1926,7 +1921,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); - mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@ -2175,9 +2169,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; - if (ssk == msk->ack_hint) - msk->ack_hint = NULL; - if (ssk == msk->first) msk->first = NULL; @@ -2392,7 +2383,6 @@ static int __mptcp_init_sock(struct sock *sk) msk->rmem_released = 0; msk->tx_pending_data = 0; - msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f4eaa5f57e3f..d8ad3270dfab 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -243,7 +243,6 @@ struct mptcp_sock { bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; spinlock_t join_list_lock; - struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; -- cgit v1.2.3-58-ga151 From fa4535238fb5f306f95de89371a993057b32b2a4 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 14 Jun 2021 17:33:48 +0300 Subject: net/xfrm: Add inner_ipproto into sec_path The inner_ipproto saves the inner IP protocol of the plain text packet. This allows vendor's IPsec feature making offload decision at skb's features_check and configuring hardware at ndo_start_xmit. For example, ConnectX6-DX IPsec device needs the plaintext's IP protocol to support partial checksum offload on VXLAN/GENEVE packet over IPsec transport mode tunnel. Signed-off-by: Raed Salem Signed-off-by: Huy Nguyen Cc: Steffen Klassert Acked-by: Steffen Klassert Signed-off-by: Saeed Mahameed --- include/net/xfrm.h | 1 + net/xfrm/xfrm_output.c | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c58a6d4eb610..1d803e890c76 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1024,6 +1024,7 @@ struct xfrm_offload { #define CRYPTO_INVALID_PROTOCOL 128 __u8 proto; + __u8 inner_ipproto; }; struct sec_path { diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4cb0ff4dcf4..e321fc63a2e9 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -565,6 +565,42 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb return 0; } +/* For partial checksum offload, the outer header checksum is calculated + * by software and the inner header checksum is calculated by hardware. + * This requires hardware to know the inner packet type to calculate + * the inner header checksum. Save inner ip protocol here to avoid + * traversing the packet in the vendor's xmit code. + * If the encap type is IPIP, just save skb->inner_ipproto. Otherwise, + * get the ip protocol from the IP header. + */ +static void xfrm_get_inner_ipproto(struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + const struct ethhdr *eth; + + if (!xo) + return; + + if (skb->inner_protocol_type == ENCAP_TYPE_IPPROTO) { + xo->inner_ipproto = skb->inner_ipproto; + return; + } + + if (skb->inner_protocol_type != ENCAP_TYPE_ETHER) + return; + + eth = (struct ethhdr *)skb_inner_mac_header(skb); + + switch (ntohs(eth->h_proto)) { + case ETH_P_IPV6: + xo->inner_ipproto = inner_ipv6_hdr(skb)->nexthdr; + break; + case ETH_P_IP: + xo->inner_ipproto = inner_ip_hdr(skb)->protocol; + break; + } +} + int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -594,12 +630,15 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return -ENOMEM; } - skb->encapsulation = 1; sp->olen++; sp->xvec[sp->len++] = x; xfrm_state_hold(x); + if (skb->encapsulation) + xfrm_get_inner_ipproto(skb); + skb->encapsulation = 1; + if (skb_is_gso(skb)) { if (skb->inner_protocol) return xfrm_output_gso(net, sk, skb); -- cgit v1.2.3-58-ga151 From e93bdd78406da9ed01554c51e38b2a02c8ef8025 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 22 Apr 2021 15:00:32 -0500 Subject: wireless: wext-spy: Fix out-of-bounds warning Fix the following out-of-bounds warning: net/wireless/wext-spy.c:178:2: warning: 'memcpy' offset [25, 28] from the object at 'threshold' is out of the bounds of referenced subobject 'low' with type 'struct iw_quality' at offset 20 [-Warray-bounds] The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). This causes a legitimate compiler warning because memcpy() overruns the length of &threshold.low and &spydata->spy_thr_low. As these are just a couple of struct members, fix this by using direct assignments, instead of memcpy(). This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Reported-by: kernel test robot Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210422200032.GA168995@embeddedor Signed-off-by: Johannes Berg --- net/wireless/wext-spy.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c index 33bef22e44e9..b379a0371653 100644 --- a/net/wireless/wext-spy.c +++ b/net/wireless/wext-spy.c @@ -120,8 +120,8 @@ int iw_handler_set_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(spydata->spy_thr_low), &(threshold->low), - 2 * sizeof(struct iw_quality)); + spydata->spy_thr_low = threshold->low; + spydata->spy_thr_high = threshold->high; /* Clear flag */ memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); @@ -147,8 +147,8 @@ int iw_handler_get_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(threshold->low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold->low = spydata->spy_thr_low; + threshold->high = spydata->spy_thr_high; return 0; } @@ -173,10 +173,10 @@ static void iw_send_thrspy_event(struct net_device * dev, memcpy(threshold.addr.sa_data, address, ETH_ALEN); threshold.addr.sa_family = ARPHRD_ETHER; /* Copy stats */ - memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + threshold.qual = *wstats; /* Copy also thresholds */ - memcpy(&(threshold.low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold.low = spydata->spy_thr_low; + threshold.high = spydata->spy_thr_high; /* Send event to user space */ wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); -- cgit v1.2.3-58-ga151 From c2a8637c055e4ea86cd414bbf02034d1449685cc Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 28 Apr 2021 17:59:05 +0800 Subject: net: wireless: wext_compat.c: Remove redundant assignment to ps Variable 'ps' is set to wdev->ps but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Cleans up the following clang-analyzer warning: net/wireless/wext-compat.c:1170:7: warning: Value stored to 'ps' during its initialization is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1619603945-116891-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Johannes Berg --- net/wireless/wext-compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a8320dc59af7..50a2330de236 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1167,7 +1167,7 @@ static int cfg80211_wext_siwpower(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - bool ps = wdev->ps; + bool ps; int timeout = wdev->ps_timeout; int err; -- cgit v1.2.3-58-ga151 From 5eae2705004895a9aa917f5df6c1a2da8eeb4fd5 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 30 Apr 2021 17:21:23 +0800 Subject: mac80211: Remove redundant assignment to ret Variable 'ret' is set to -ENODEV but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: net/mac80211/debugfs_netdev.c:60:2: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1619774483-116805-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 0ad3860852ff..f7aac8955681 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -57,7 +57,6 @@ static ssize_t ieee80211_if_write( return -EFAULT; buf[count] = '\0'; - ret = -ENODEV; rtnl_lock(); ret = (*write)(sdata, buf, count); rtnl_unlock(); -- cgit v1.2.3-58-ga151 From 21b7805434f6598eaf70329f78cf3da0bd4aa3e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 May 2021 22:12:00 +0200 Subject: cfg80211: remove CFG80211_MAX_NUM_DIFFERENT_CHANNELS We no longer need to put any limits here, hardware will and mac80211-hwsim can do whatever it likes. The reason we had this was some accounting code (still mentioned in the comment) but that code was deleted in commit c781944b71f8 ("cfg80211: Remove unused cfg80211_can_use_iftype_chan()"). Link: https://lore.kernel.org/r/20210506221159.d1d61db1d31c.Iac4da68d54b9f1fdc18a03586bbe06aeb9515425@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 5 ----- include/net/cfg80211.h | 2 -- net/wireless/core.c | 8 -------- 3 files changed, 15 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 7a6fd46d0c6e..9574afc0cdbf 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3796,11 +3796,6 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) return -EINVAL; } - if (param.channels > CFG80211_MAX_NUM_DIFFERENT_CHANNELS) { - GENL_SET_ERR_MSG(info, "too many channels specified"); - return -EINVAL; - } - if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 58c2cd417e89..60325b62daae 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1245,8 +1245,6 @@ struct cfg80211_csa_settings { u8 count; }; -#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 - /** * struct iface_combination_params - input parameters for interface combinations * diff --git a/net/wireless/core.c b/net/wireless/core.c index 8d0883e81093..47f551301592 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -589,14 +589,6 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) if (WARN_ON(!c->num_different_channels)) return -EINVAL; - /* - * Put a sane limit on maximum number of different - * channels to simplify channel accounting code. - */ - if (WARN_ON(c->num_different_channels > - CFG80211_MAX_NUM_DIFFERENT_CHANNELS)) - return -EINVAL; - /* DFS only works on one channel. */ if (WARN_ON(c->radar_detect_widths && (c->num_different_channels > 1))) -- cgit v1.2.3-58-ga151 From 5b5c9f3bd5f3d726d07ab8e4776d241863963a6e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 8 May 2021 16:03:51 +0300 Subject: cfg80211: clean up variable use in cfg80211_parse_colocated_ap() The "ap_info->tbtt_info_len" and "length" variables are the same value but it is confusing how the names are mixed up. Let's use "length" everywhere for consistency. Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YJaMNzZENkYFAYQX@mwanda Signed-off-by: Johannes Berg --- net/wireless/scan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4f06c1825029..a3941b19b516 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -618,7 +618,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, freq = ieee80211_channel_to_frequency(ap_info->channel, band); - if (end - pos < count * ap_info->tbtt_info_len) + if (end - pos < count * length) break; /* @@ -630,7 +630,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, if (band != NL80211_BAND_6GHZ || (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM && length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) { - pos += count * ap_info->tbtt_info_len; + pos += count * length; continue; } @@ -653,7 +653,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, kfree(entry); } - pos += ap_info->tbtt_info_len; + pos += length; } } -- cgit v1.2.3-58-ga151 From 0edab4ff84b67fc585bb47aba37833da18f5a9dc Mon Sep 17 00:00:00 2001 From: Philipp Borgers Date: Mon, 17 May 2021 14:01:45 +0200 Subject: mac80211: minstrel_ht: ignore frame that was sent with noAck flag QoS Data Frames that were sent with a No Ack policy should be ignored by the minstrel statistics. There will never be an Ack for these frames so there is no way to draw conclusions about the success of the transmission. Signed-off-by: Philipp Borgers Link: https://lore.kernel.org/r/20210517120145.132814-1-borgers@mi.fu-berlin.de Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index a6f3fb4a9197..bc261d086410 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1211,6 +1211,10 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, bool last, update = false; int i; + /* Ignore packet that was sent with noAck flag */ + if (info->flags & IEEE80211_TX_CTL_NO_ACK) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) -- cgit v1.2.3-58-ga151 From 0044cc177f23aff1f66589f87c5f1172e9f09fdc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 23:07:54 +0200 Subject: mac80211: unify queueing SKB to iface We have a bunch of places that open-code the same to queue an SKB to an interface, unify that. Link: https://lore.kernel.org/r/20210517230754.113b65febd5a.Ie0e1d58a2885e75f242cb6e06f3b9660117fef93@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index af0ef456eb0f..ab9207e32f07 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -214,6 +214,16 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, return len; } +static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + if (sta) + sta->rx_stats.packets++; +} + static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int rtap_space) @@ -254,8 +264,7 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, if (!skb) return; - skb_queue_tail(&sdata->skb_queue, skb); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + ieee80211_queue_skb_to_iface(sdata, NULL, skb); } /* @@ -1339,7 +1348,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { struct sk_buff *skb = rx->skb; - struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct sta_info *sta = rx->sta; struct tid_ampdu_rx *tid_agg_rx; @@ -1391,8 +1399,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, /* if this mpdu is fragmented - terminate rx aggregation session */ sc = le16_to_cpu(hdr->seq_ctrl); if (sc & IEEE80211_SCTL_FRAG) { - skb_queue_tail(&rx->sdata->skb_queue, skb); - ieee80211_queue_work(&local->hw, &rx->sdata->work); + ieee80211_queue_skb_to_iface(rx->sdata, NULL, skb); return; } @@ -3493,10 +3500,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) return RX_QUEUED; queue: - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } @@ -3644,10 +3648,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; /* for now only beacons are ext, so queue them */ - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } @@ -3704,11 +3705,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* queue up frame and kick off work to process it */ - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } -- cgit v1.2.3-58-ga151 From 07bd1c79c9fbf038483c50031b0f302613a54eb6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 23:07:55 +0200 Subject: mac80211: refactor SKB queue processing a bit This is a very long loop body, move it into its own function instead, keeping only the kcov and free outside in the loop body. Link: https://lore.kernel.org/r/20210517230754.6bc6cdd68570.I28a86ebdb19601ca1965c4dc654cc49fc1064efa@changeid Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 228 +++++++++++++++++++++++++++------------------------ 1 file changed, 119 insertions(+), 109 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 137fa4c50e07..30dd3b3778f6 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1318,13 +1318,130 @@ static void ieee80211_if_setup_no_queue(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; } +static void ieee80211_iface_process_skb(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_BACK) { + struct sta_info *sta; + int len = skb->len; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + switch (mgmt->u.action.u.addba_req.action_code) { + case WLAN_ACTION_ADDBA_REQ: + ieee80211_process_addba_request(local, sta, + mgmt, len); + break; + case WLAN_ACTION_ADDBA_RESP: + ieee80211_process_addba_resp(local, sta, + mgmt, len); + break; + case WLAN_ACTION_DELBA: + ieee80211_process_delba(sdata, sta, + mgmt, len); + break; + default: + WARN_ON(1); + break; + } + } + mutex_unlock(&local->sta_mtx); + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT) { + switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_OPMODE_NOTIF: { + struct ieee80211_rx_status *status; + enum nl80211_band band; + struct sta_info *sta; + u8 opmode; + + status = IEEE80211_SKB_RXCB(skb); + band = status->band; + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + + if (sta) + ieee80211_vht_handle_opmode(sdata, sta, opmode, + band); + + mutex_unlock(&local->sta_mtx); + break; + } + case WLAN_VHT_ACTION_GROUPID_MGMT: + ieee80211_process_mu_groups(sdata, mgmt); + break; + default: + WARN_ON(1); + break; + } + } else if (ieee80211_is_ext(mgmt->frame_control)) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) + ieee80211_sta_rx_queued_ext(sdata, skb); + else + WARN_ON(1); + } else if (ieee80211_is_data_qos(mgmt->frame_control)) { + struct ieee80211_hdr *hdr = (void *)mgmt; + struct sta_info *sta; + + /* + * So the frame isn't mgmt, but frame_control + * is at the right place anyway, of course, so + * the if statement is correct. + * + * Warn if we have other data frame types here, + * they must not get here. + */ + WARN_ON(hdr->frame_control & + cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); + WARN_ON(!(hdr->seq_ctrl & + cpu_to_le16(IEEE80211_SCTL_FRAG))); + /* + * This was a fragment of a frame, received while + * a block-ack session was active. That cannot be + * right, so terminate the session. + */ + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + u16 tid = ieee80211_get_tid(hdr); + + __ieee80211_stop_rx_ba_session( + sta, tid, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP, + true); + } + mutex_unlock(&local->sta_mtx); + } else switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_MESH_POINT: + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + ieee80211_mesh_rx_queued_mgmt(sdata, skb); + break; + default: + WARN(1, "frame for unexpected interface type"); + break; + } +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, work); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; - struct sta_info *sta; if (!ieee80211_sdata_running(sdata)) return; @@ -1337,116 +1454,9 @@ static void ieee80211_iface_work(struct work_struct *work) /* first process frames */ while ((skb = skb_dequeue(&sdata->skb_queue))) { - struct ieee80211_mgmt *mgmt = (void *)skb->data; - kcov_remote_start_common(skb_get_kcov_handle(skb)); - if (ieee80211_is_action(mgmt->frame_control) && - mgmt->u.action.category == WLAN_CATEGORY_BACK) { - int len = skb->len; - - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - if (sta) { - switch (mgmt->u.action.u.addba_req.action_code) { - case WLAN_ACTION_ADDBA_REQ: - ieee80211_process_addba_request( - local, sta, mgmt, len); - break; - case WLAN_ACTION_ADDBA_RESP: - ieee80211_process_addba_resp(local, sta, - mgmt, len); - break; - case WLAN_ACTION_DELBA: - ieee80211_process_delba(sdata, sta, - mgmt, len); - break; - default: - WARN_ON(1); - break; - } - } - mutex_unlock(&local->sta_mtx); - } else if (ieee80211_is_action(mgmt->frame_control) && - mgmt->u.action.category == WLAN_CATEGORY_VHT) { - switch (mgmt->u.action.u.vht_group_notif.action_code) { - case WLAN_VHT_ACTION_OPMODE_NOTIF: { - struct ieee80211_rx_status *status; - enum nl80211_band band; - u8 opmode; - - status = IEEE80211_SKB_RXCB(skb); - band = status->band; - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; - - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - - if (sta) - ieee80211_vht_handle_opmode(sdata, sta, - opmode, - band); - - mutex_unlock(&local->sta_mtx); - break; - } - case WLAN_VHT_ACTION_GROUPID_MGMT: - ieee80211_process_mu_groups(sdata, mgmt); - break; - default: - WARN_ON(1); - break; - } - } else if (ieee80211_is_ext(mgmt->frame_control)) { - if (sdata->vif.type == NL80211_IFTYPE_STATION) - ieee80211_sta_rx_queued_ext(sdata, skb); - else - WARN_ON(1); - } else if (ieee80211_is_data_qos(mgmt->frame_control)) { - struct ieee80211_hdr *hdr = (void *)mgmt; - /* - * So the frame isn't mgmt, but frame_control - * is at the right place anyway, of course, so - * the if statement is correct. - * - * Warn if we have other data frame types here, - * they must not get here. - */ - WARN_ON(hdr->frame_control & - cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); - WARN_ON(!(hdr->seq_ctrl & - cpu_to_le16(IEEE80211_SCTL_FRAG))); - /* - * This was a fragment of a frame, received while - * a block-ack session was active. That cannot be - * right, so terminate the session. - */ - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - if (sta) { - u16 tid = ieee80211_get_tid(hdr); - __ieee80211_stop_rx_ba_session( - sta, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP, - true); - } - mutex_unlock(&local->sta_mtx); - } else switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - ieee80211_sta_rx_queued_mgmt(sdata, skb); - break; - case NL80211_IFTYPE_ADHOC: - ieee80211_ibss_rx_queued_mgmt(sdata, skb); - break; - case NL80211_IFTYPE_MESH_POINT: - if (!ieee80211_vif_is_mesh(&sdata->vif)) - break; - ieee80211_mesh_rx_queued_mgmt(sdata, skb); - break; - default: - WARN(1, "frame for unexpected interface type"); - break; - } + ieee80211_iface_process_skb(local, sdata, skb); kfree_skb(skb); kcov_remote_stop(); -- cgit v1.2.3-58-ga151 From f057d1403689309c6277961d5c348d4841959a9c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 23:07:56 +0200 Subject: mac80211: use sdata->skb_queue for TDLS We need to differentiate these frames since the ones we currently put on the skb_queue_tdls_chsw have already been converted to ethernet format, but now that we've got a single place to enqueue to the sdata->skb_queue this isn't hard. Just differentiate based on protocol and adjust the code to queue the SKBs appropriately. Link: https://lore.kernel.org/r/20210517230754.17034990abef.I5342f2183c0d246b18d36c511eb3b6be298a6572@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 10 +++++----- net/mac80211/iface.c | 5 ++++- net/mac80211/main.c | 5 ----- net/mac80211/rx.c | 21 +++++++++++++-------- net/mac80211/tdls.c | 28 +--------------------------- 5 files changed, 23 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 648696b49f89..b995777566e1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1414,10 +1414,6 @@ struct ieee80211_local { /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; - - /* TDLS channel switch */ - struct work_struct tdls_chsw_work; - struct sk_buff_head skb_queue_tdls_chsw; }; static inline struct ieee80211_sub_if_data * @@ -2287,9 +2283,13 @@ void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); -void ieee80211_tdls_chsw_work(struct work_struct *wk); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); +void +ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); + + const char *ieee80211_get_reason_code_string(u16 reason_code); u16 ieee80211_encode_usf(int val); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 30dd3b3778f6..68375ef56b4a 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1456,7 +1456,10 @@ static void ieee80211_iface_work(struct work_struct *work) while ((skb = skb_dequeue(&sdata->skb_queue))) { kcov_remote_start_common(skb_get_kcov_handle(skb)); - ieee80211_iface_process_skb(local, sdata, skb); + if (skb->protocol == cpu_to_be16(ETH_P_TDLS)) + ieee80211_process_tdls_channel_switch(sdata, skb); + else + ieee80211_iface_process_skb(local, sdata, skb); kfree_skb(skb); kcov_remote_stop(); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index f33a3acd7f96..822ff388410e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -739,8 +739,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); - INIT_WORK(&local->tdls_chsw_work, ieee80211_tdls_chsw_work); - spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); @@ -757,7 +755,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); - skb_queue_head_init(&local->skb_queue_tdls_chsw); ieee80211_alloc_led_names(local); @@ -1389,7 +1386,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) cancel_delayed_work_sync(&local->roc_work); cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); - cancel_work_sync(&local->tdls_chsw_work); flush_work(&local->sched_scan_stopped_work); flush_work(&local->radar_detected_work); @@ -1401,7 +1397,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); - skb_queue_purge(&local->skb_queue_tdls_chsw); wiphy_unregister(local->hw.wiphy); destroy_workqueue(local->workqueue); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ab9207e32f07..a6400adf08bf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -214,9 +214,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, return len; } -static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct sk_buff *skb) +static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) { skb_queue_tail(&sdata->skb_queue, skb); ieee80211_queue_work(&sdata->local->hw, &sdata->work); @@ -224,6 +224,14 @@ static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, sta->rx_stats.packets++; } +static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + skb->protocol = 0; + __ieee80211_queue_skb_to_iface(sdata, sta, skb); +} + static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int rtap_space) @@ -3016,11 +3024,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) tf->category == WLAN_CATEGORY_TDLS && (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST || tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) { - skb_queue_tail(&local->skb_queue_tdls_chsw, rx->skb); - schedule_work(&local->tdls_chsw_work); - if (rx->sta) - rx->sta->rx_stats.packets++; - + rx->skb->protocol = cpu_to_be16(ETH_P_TDLS); + __ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f91d02b81b92..45e532ad1215 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1920,7 +1920,7 @@ out: return ret; } -static void +void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { @@ -1971,32 +1971,6 @@ void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) rcu_read_unlock(); } -void ieee80211_tdls_chsw_work(struct work_struct *wk) -{ - struct ieee80211_local *local = - container_of(wk, struct ieee80211_local, tdls_chsw_work); - struct ieee80211_sub_if_data *sdata; - struct sk_buff *skb; - struct ieee80211_tdls_data *tf; - - wiphy_lock(local->hw.wiphy); - while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { - tf = (struct ieee80211_tdls_data *)skb->data; - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata) || - sdata->vif.type != NL80211_IFTYPE_STATION || - !ether_addr_equal(tf->da, sdata->vif.addr)) - continue; - - ieee80211_process_tdls_channel_switch(sdata, skb); - break; - } - - kfree_skb(skb); - } - wiphy_unlock(local->hw.wiphy); -} - void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason) { -- cgit v1.2.3-58-ga151 From 4ebdce1dcbd44099b0e68db859b21d97b051492c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 23:07:57 +0200 Subject: mac80211: simplify ieee80211_add_station() There's no need to do some kind of weird err and RCU dance just use sta_info_insert() directly. Link: https://lore.kernel.org/r/20210517230754.55abd10056c0.I6f5a3b7b23347b2cdaf64e6d5ce1d9e904059654@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7a99892e5aba..0d29a9d1f910 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1693,15 +1693,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); - err = sta_info_insert_rcu(sta); - if (err) { - rcu_read_unlock(); - return err; - } - - rcu_read_unlock(); - - return 0; + return sta_info_insert(sta); } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, -- cgit v1.2.3-58-ga151 From d333322361e7a099dc74df2498d3fa5fde5c4fa7 Mon Sep 17 00:00:00 2001 From: Philipp Borgers Date: Wed, 19 May 2021 14:20:18 +0200 Subject: mac80211: do not use low data rates for data frames with no ack flag Data Frames with no ack flag set should be handled by the rate controler. Make sure we reach the rate controler by returning early from rate_control_send_low if the frame is a data frame with no ack flag. Signed-off-by: Philipp Borgers Link: https://lore.kernel.org/r/20210519122019.92359-3-borgers@mi.fu-berlin.de Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 63652c39c8e0..01d6407b0279 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -396,6 +396,10 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (ieee80211_is_tx_data(txrc->skb) && + info->flags & IEEE80211_TX_CTL_NO_ACK) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); -- cgit v1.2.3-58-ga151 From 4e6c78bdcfbc3aad01a527e46b89e7ab70e0c332 Mon Sep 17 00:00:00 2001 From: Philipp Borgers Date: Wed, 19 May 2021 14:20:19 +0200 Subject: mac80211: refactor rc_no_data_or_no_ack_use_min function Use newly introduced helper function ieee80211_is_tx_data to check if frame is a data frame. Takes into account that hardware encapsulation can be enabled for a frame and therefore no ieee80211 header is present. Signed-off-by: Philipp Borgers Link: https://lore.kernel.org/r/20210519122019.92359-4-borgers@mi.fu-berlin.de Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 01d6407b0279..9418daa52f68 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -297,15 +297,11 @@ void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata) static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - __le16 fc; - - fc = hdr->frame_control; return (info->flags & (IEEE80211_TX_CTL_NO_ACK | IEEE80211_TX_CTL_USE_MINRATE)) || - !ieee80211_is_data(fc); + !ieee80211_is_tx_data(skb); } static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate, -- cgit v1.2.3-58-ga151 From 057e377af24a4f48f9d8340029e765df0345f048 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 28 May 2021 18:07:04 +0800 Subject: mac80211: remove the repeated declaration Function 'ieee80211_sta_set_rx_nss' is declared twice, so remove the repeated declaration. Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1622196424-62403-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b995777566e1..a7ce59150322 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1875,7 +1875,6 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta); -void ieee80211_sta_set_rx_nss(struct sta_info *sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3-58-ga151 From 9df66d5b9f45c39b3925d16e8947cc10009b186d Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 9 Jun 2021 15:59:44 +0800 Subject: cfg80211: fix default HE tx bitrate mask in 2G band In 2G band, a HE sta can only supports HT and HE, but not supports VHT. In this case, default HE tx bitrate mask isn't filled, when we use iw to set bitrates without any parameter. Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210609075944.51130-1-pkshih@realtek.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fc9286afe3c9..912977bf3ec8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4781,11 +4781,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, sband->ht_cap.mcs.rx_mask, sizeof(mask->control[i].ht_mcs)); - if (!sband->vht_cap.vht_supported) - continue; - - vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + if (sband->vht_cap.vht_supported) { + vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + } he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); if (!he_cap) -- cgit v1.2.3-58-ga151 From d656a4c6ead6c3f252b2f2532bc9735598f7e317 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Jun 2021 22:08:16 +0200 Subject: mac80211: consider per-CPU statistics if present If we have been keeping per-CPU statistics, consider them regardless of USES_RSS, because we may not actually fill those, for example in non-fast-RX cases when the connection is not compatible with fast-RX. If we didn't fill them, the additional data will be zero and not affect anything, and if we did fill them then it's more correct to consider them. This fixes an issue in mesh mode where some statistics are not updated due to USES_RSS being set, but fast-RX isn't used. Reported-by: Thiraviyam Mariyappan Link: https://lore.kernel.org/r/20210610220814.13b35f5797c5.I511e9b33c5694e0d6cef4b6ae755c873d7c22124@changeid Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f2fb69da9b6e..641a6657d0c9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2093,10 +2093,9 @@ static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; - struct ieee80211_local *local = sta->local; int cpu; - if (!ieee80211_hw_check(&local->hw, USES_RSS)) + if (!sta->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { @@ -2196,9 +2195,7 @@ static void sta_set_tidstats(struct sta_info *sta, int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - if (!ieee80211_hw_check(&local->hw, USES_RSS)) - tidstats->rx_msdu += - sta_get_tidstats_msdu(&sta->rx_stats, tid); + tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->rx_stats, tid); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2277,7 +2274,6 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->rx_beacon = sdata->u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); - sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | @@ -2312,8 +2308,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { - if (!ieee80211_hw_check(&local->hw, USES_RSS)) - sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); + sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { -- cgit v1.2.3-58-ga151 From 3f9d9725cb7daf7e9a834aa4f24d88b049c3c1f5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Jun 2021 21:28:05 +0200 Subject: mac80211: don't open-code LED manipulations We shouldn't open-code led_trigger_blink() or led_trigger_event(), use them instead of badly open-coding them. This also fixes the locking, led_trigger_blink() and led_trigger_event() now use read_lock_irqsave(). Link: https://lore.kernel.org/r/20210616212804.b19ba1c60353.I8ea1b4defd5e12fc20ef281291e602feeec336a6@changeid Signed-off-by: Johannes Berg --- net/mac80211/led.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/led.c b/net/mac80211/led.c index b275c8853074..6de8d0ad5497 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -259,7 +259,6 @@ static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; - struct led_classdev *led_cdev; unsigned long on, off, tpt; int i; @@ -283,10 +282,7 @@ static void tpt_trig_timer(struct timer_list *t) } } - read_lock(&local->tpt_led.leddev_list_lock); - list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) - led_blink_set(led_cdev, &on, &off); - read_unlock(&local->tpt_led.leddev_list_lock); + led_trigger_blink(&local->tpt_led, &on, &off); } const char * @@ -341,7 +337,6 @@ static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; - struct led_classdev *led_cdev; if (!tpt_trig->running) return; @@ -349,10 +344,7 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); - read_lock(&local->tpt_led.leddev_list_lock); - list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) - led_set_brightness(led_cdev, LED_OFF); - read_unlock(&local->tpt_led.leddev_list_lock); + led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, -- cgit v1.2.3-58-ga151 From 358ae88881adc3ac1544104272eb7e9408f80b39 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 16 Jun 2021 23:28:26 +0300 Subject: cfg80211: expose the rfkill device to the low level driver This will allow the low level driver to query the rfkill state. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20210616202826.9833-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 ++++++++- net/wireless/core.c | 34 +++++++++++++--------------------- net/wireless/core.h | 3 +-- net/wireless/nl80211.c | 4 ++-- net/wireless/wext-compat.c | 6 +++--- 5 files changed, 27 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 60325b62daae..b3bc58ec9098 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /** @@ -4943,6 +4944,7 @@ struct wiphy_iftype_akm_suites { * configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and * %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes * @sar_capa: SAR control capabilities + * @rfkill: a pointer to the rfkill structure */ struct wiphy { struct mutex mtx; @@ -5085,6 +5087,8 @@ struct wiphy { const struct cfg80211_sar_capa *sar_capa; + struct rfkill *rfkill; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -6659,7 +6663,10 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy); * wiphy_rfkill_stop_polling - stop polling rfkill * @wiphy: the wiphy */ -void wiphy_rfkill_stop_polling(struct wiphy *wiphy); +static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) +{ + rfkill_pause_polling(wiphy->rfkill); +} /** * DOC: Vendor commands diff --git a/net/wireless/core.c b/net/wireless/core.c index 47f551301592..41c15cc7791f 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -532,11 +532,11 @@ use_default_name: wiphy_net_set(&rdev->wiphy, &init_net); rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; - rdev->rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), - &rdev->wiphy.dev, RFKILL_TYPE_WLAN, - &rdev->rfkill_ops, rdev); + rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), + &rdev->wiphy.dev, RFKILL_TYPE_WLAN, + &rdev->rfkill_ops, rdev); - if (!rdev->rfkill) { + if (!rdev->wiphy.rfkill) { wiphy_free(&rdev->wiphy); return NULL; } @@ -985,10 +985,10 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.registered = true; rtnl_unlock(); - res = rfkill_register(rdev->rfkill); + res = rfkill_register(rdev->wiphy.rfkill); if (res) { - rfkill_destroy(rdev->rfkill); - rdev->rfkill = NULL; + rfkill_destroy(rdev->wiphy.rfkill); + rdev->wiphy.rfkill = NULL; wiphy_unregister(&rdev->wiphy); return res; } @@ -1004,18 +1004,10 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy) if (!rdev->ops->rfkill_poll) return; rdev->rfkill_ops.poll = cfg80211_rfkill_poll; - rfkill_resume_polling(rdev->rfkill); + rfkill_resume_polling(wiphy->rfkill); } EXPORT_SYMBOL(wiphy_rfkill_start_polling); -void wiphy_rfkill_stop_polling(struct wiphy *wiphy) -{ - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - - rfkill_pause_polling(rdev->rfkill); -} -EXPORT_SYMBOL(wiphy_rfkill_stop_polling); - void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -1027,8 +1019,8 @@ void wiphy_unregister(struct wiphy *wiphy) wiphy_unlock(&rdev->wiphy); __count == 0; })); - if (rdev->rfkill) - rfkill_unregister(rdev->rfkill); + if (rdev->wiphy.rfkill) + rfkill_unregister(rdev->wiphy.rfkill); rtnl_lock(); wiphy_lock(&rdev->wiphy); @@ -1080,7 +1072,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; - rfkill_destroy(rdev->rfkill); + rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(®->list); kfree(reg); @@ -1102,7 +1094,7 @@ void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - if (rfkill_set_hw_state_reason(rdev->rfkill, blocked, reason)) + if (rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); @@ -1495,7 +1487,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return notifier_from_errno(-ERFKILL); break; default: diff --git a/net/wireless/core.h b/net/wireless/core.h index a7d19b4b40ac..b35d0db12f1d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -27,7 +27,6 @@ struct cfg80211_registered_device { /* rfkill support */ struct rfkill_ops rfkill_ops; - struct rfkill *rfkill; struct work_struct rfkill_block; /* ISO / IEC 3166 alpha2 for which this device is receiving diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 912977bf3ec8..c62d61d8aa02 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13041,7 +13041,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info) if (wdev_running(wdev)) return 0; - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; err = rdev_start_p2p_device(rdev, wdev); @@ -13083,7 +13083,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (wdev_running(wdev)) return -EEXIST; - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 50a2330de236..a32065d600a1 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -902,7 +902,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, /* only change when not disabling */ if (!data->txpower.disabled) { - rfkill_set_sw_state(rdev->rfkill, false); + rfkill_set_sw_state(rdev->wiphy.rfkill, false); if (data->txpower.fixed) { /* @@ -927,7 +927,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, } } } else { - if (rfkill_set_sw_state(rdev->rfkill, true)) + if (rfkill_set_sw_state(rdev->wiphy.rfkill, true)) schedule_work(&rdev->rfkill_block); return 0; } @@ -963,7 +963,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, /* well... oh well */ data->txpower.fixed = 1; - data->txpower.disabled = rfkill_blocked(rdev->rfkill); + data->txpower.disabled = rfkill_blocked(rdev->wiphy.rfkill); data->txpower.value = val; data->txpower.flags = IW_TXPOW_DBM; -- cgit v1.2.3-58-ga151 From 08a46c6420013c4ecb61262b4869fdd7e82f918a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 17 Jun 2021 18:31:11 +0200 Subject: mac80211: move A-MPDU session check from minstrel_ht to mac80211 This avoids calling back into tx handlers from within the rate control module. Preparation for deferring rate control until tx dequeue Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210617163113.75815-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +++++ net/mac80211/rc80211_minstrel_ht.c | 28 +--------------------------- net/mac80211/tx.c | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 4e876e4598e3..c09cd0e4a6b3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6184,6 +6184,11 @@ enum rate_control_capabilities { * otherwise the NSS difference doesn't bother us. */ RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0), + /** + * @RATE_CTRL_CAPA_AMPDU_TRIGGER: + * mac80211 should start A-MPDU sessions on tx + */ + RATE_CTRL_CAPA_AMPDU_TRIGGER = BIT(1), }; struct rate_control_ops { diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index bc261d086410..20f2e0bef96b 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1175,29 +1175,6 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) } } -static void -minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - u16 tid; - - if (skb_get_queue_mapping(skb) == IEEE80211_AC_VO) - return; - - if (unlikely(!ieee80211_is_data_qos(hdr->frame_control))) - return; - - if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) - return; - - tid = ieee80211_get_tid(hdr); - if (likely(sta->ampdu_mlme.tid_tx[tid])) - return; - - ieee80211_start_tx_ba_session(pubsta, tid, 0); -} - static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) @@ -1502,10 +1479,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct minstrel_priv *mp = priv; u16 sample_idx; - if (!(info->flags & IEEE80211_TX_CTL_AMPDU) && - !minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_prob_rate))) - minstrel_aggr_check(sta, txrc->skb); - info->flags |= mi->tx_flags; #ifdef CONFIG_MAC80211_DEBUGFS @@ -1911,6 +1884,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", + .capa = RATE_CTRL_CAPA_AMPDU_TRIGGER, .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2651498d05e8..b5d05eccfae8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3949,6 +3949,29 @@ void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) } EXPORT_SYMBOL(ieee80211_txq_schedule_start); +static void +ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct rate_control_ref *ref = sdata->local->rate_ctrl; + u16 tid; + + if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) + return; + + if (!sta || !sta->sta.ht_cap.ht_supported || + !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || + skb->protocol == sdata->control_port_protocol) + return; + + tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + if (likely(sta->ampdu_mlme.tid_tx[tid])) + return; + + ieee80211_start_tx_ba_session(&sta->sta, tid, 0); +} + void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, @@ -3979,6 +4002,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_get_hash(skb); } + ieee80211_aggr_check(sdata, sta, skb); + if (sta) { struct ieee80211_fast_tx *fast_tx; @@ -4242,6 +4267,8 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memset(info, 0, sizeof(*info)); + ieee80211_aggr_check(sdata, sta, skb); + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { -- cgit v1.2.3-58-ga151 From 03c3911d2d67a43ad4ffd15b534a5905d6ce5c59 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Thu, 17 Jun 2021 18:31:12 +0200 Subject: mac80211: call ieee80211_tx_h_rate_ctrl() when dequeue Make ieee80211_tx_h_rate_ctrl() get called on dequeue to improve performance since it reduces the turnaround time for rate control. Signed-off-by: Ryder Lee Link: https://lore.kernel.org/r/20210617163113.75815-2-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 52 +++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index b5d05eccfae8..b9ff455ee01c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1768,8 +1768,6 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); - if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) - CALL_TXH(ieee80211_tx_h_rate_ctrl); txh_done: if (unlikely(res == TX_DROP)) { @@ -1802,6 +1800,9 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) goto txh_done; } + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) + CALL_TXH(ieee80211_tx_h_rate_ctrl); + CALL_TXH(ieee80211_tx_h_michael_mic_add); CALL_TXH(ieee80211_tx_h_sequence); CALL_TXH(ieee80211_tx_h_fragment); @@ -3389,15 +3390,21 @@ out: * Can be called while the sta lock is held. Anything that can cause packets to * be generated will cause deadlock! */ -static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, u8 pn_offs, - struct ieee80211_key *key, - struct sk_buff *skb) +static ieee80211_tx_result +ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 pn_offs, + struct ieee80211_key *key, + struct ieee80211_tx_data *tx) { + struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; u8 tid = IEEE80211_NUM_TIDS; + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL) && + ieee80211_tx_h_rate_ctrl(tx) != TX_CONTINUE) + return TX_DROP; + if (key) info->control.hw_key = &key->conf; @@ -3446,6 +3453,8 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, break; } } + + return TX_CONTINUE; } static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, @@ -3549,24 +3558,17 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.sta = sta; tx.key = fast_tx->key; - if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { - tx.skb = skb; - r = ieee80211_tx_h_rate_ctrl(&tx); - skb = tx.skb; - tx.skb = NULL; - - if (r != TX_CONTINUE) { - if (r != TX_QUEUED) - kfree_skb(skb); - return true; - } - } - if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; - ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, - fast_tx->key, skb); + tx.skb = skb; + r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, + fast_tx->key, &tx); + tx.skb = NULL; + if (r == TX_DROP) { + kfree_skb(skb); + return true; + } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3683,8 +3685,12 @@ begin: (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) pn_offs = ieee80211_hdrlen(hdr->frame_control); - ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, - tx.key, skb); + r = ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, + tx.key, &tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } } else { if (invoke_tx_handlers_late(&tx)) goto begin; -- cgit v1.2.3-58-ga151 From 3187ba0cea77c8a4cdaed44fbff02c6e63e509aa Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Thu, 17 Jun 2021 18:31:13 +0200 Subject: mac80211: add rate control support for encap offload The software rate control cannot deal with encap offload, so fix it. Signed-off-by: Ryder Lee Link: https://lore.kernel.org/r/20210617163113.75815-3-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 3 +-- net/mac80211/tx.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9418daa52f68..e5935e3d7a07 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -870,7 +870,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates) { struct ieee80211_sub_if_data *sdata; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; @@ -882,7 +881,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, sdata = vif_to_sdata(vif); sband = sdata->local->hw.wiphy->bands[info->band]; - if (ieee80211_is_data(hdr->frame_control)) + if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index b9ff455ee01c..ac506245bb94 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -666,6 +666,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) u32 len; struct ieee80211_tx_rate_control txrc; struct ieee80211_sta_rates *ratetbl = NULL; + bool encap = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP; bool assoc = false; memset(&txrc, 0, sizeof(txrc)); @@ -707,7 +708,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) * just wants a probe response. */ if (tx->sdata->vif.bss_conf.use_short_preamble && - (ieee80211_is_data(hdr->frame_control) || + (ieee80211_is_tx_data(tx->skb) || (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) txrc.short_preamble = true; @@ -729,7 +730,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) "%s: Dropped data frame as no usable bitrate found while " "scanning and associated. Target station: " "%pM on %d GHz band\n", - tx->sdata->name, hdr->addr1, + tx->sdata->name, + encap ? ((struct ethhdr *)hdr)->h_dest : hdr->addr1, info->band ? 5 : 2)) return TX_DROP; @@ -763,7 +765,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; - if (tx->sta && ieee80211_is_data(hdr->frame_control)) + if (tx->sta && ieee80211_is_tx_data(tx->skb)) tx->sta->tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) tx->sta->tx_stats.last_rate = txrc.reported_rate; @@ -3673,8 +3675,16 @@ begin: else info->flags &= ~IEEE80211_TX_CTL_AMPDU; - if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) + if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { + if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { + r = ieee80211_tx_h_rate_ctrl(&tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + } goto encap_out; + } if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, -- cgit v1.2.3-58-ga151 From 4f2e3eb6c985cc8c2b88d5089b1920b831bc5ed9 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Fri, 18 Jun 2021 04:38:59 +0800 Subject: mac80211: check per vif offload_flags in Tx path offload_flags has been introduced to indicate encap status of each interface. An interface can encap offload at runtime, or if it has some extra limitations it can simply override the flags, so it's more flexible to check offload_flags in Tx path. Signed-off-by: Ryder Lee Link: https://lore.kernel.org/r/177785418cf407808bf3a44760302d0647076990.1623961575.git.ryder.lee@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ac506245bb94..caa7caa89ab9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3287,6 +3287,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; + if (sdata->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED) + return false; + if (skb_is_gso(skb)) return false; -- cgit v1.2.3-58-ga151 From e6ed929b4140d293bf8523f0376ed9bbdce29c47 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Sun, 23 May 2021 23:36:24 -0400 Subject: wireless: add check of field VHT Extended NSS BW Capable for 160/80+80 MHz setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Table 9-251—Supported VHT-MCS and NSS Set subfields, it has subfield VHT Extended NSS BW Capable, its definition is: Indicates whether the STA is capable of interpreting the Extended NSS BW Support subfield of the VHT Capabilities Information field. This patch is to add check for the subfield. Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20210524033624.16993-1-wgong@codeaurora.org Signed-off-by: Johannes Berg --- net/wireless/chan.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 285b8076054b..472c895823a4 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -942,7 +942,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, struct ieee80211_sta_vht_cap *vht_cap; struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; - bool support_80_80 = false; + bool ext_nss_cap, support_80_80 = false; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return false; @@ -950,6 +950,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap; vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap; edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap; + ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & + IEEE80211_VHT_EXT_NSS_BW_CAPABLE; if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, @@ -1015,7 +1017,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || - u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1; + (ext_nss_cap && + u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1); if (chandef->chan->band != NL80211_BAND_6GHZ && !support_80_80) return false; fallthrough; @@ -1037,7 +1040,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ && - !(vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)) + !(ext_nss_cap && + (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK))) return false; break; default: -- cgit v1.2.3-58-ga151 From e41eb3e408de27982a5f8f50b2dd8002bed96908 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 19 Jun 2021 12:15:17 +0200 Subject: mac80211: remove iwlwifi specific workaround that broke sta NDP tx Sending nulldata packets is important for sw AP link probing and detecting 4-address mode links. The checks that dropped these packets were apparently added to work around an iwlwifi firmware bug with multi-TID aggregation. Fixes: 41cbb0f5a295 ("mac80211: add support for HE") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210619101517.90806-1-nbd@nbd.name Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 3 +++ net/mac80211/mlme.c | 9 --------- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 1ad621d13ad3..0a13c2bda2ee 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -1032,6 +1032,9 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_INVALID_STA)) return -1; + if (unlikely(ieee80211_is_any_nullfunc(fc)) && sta->he_cap.has_he) + return -1; + if (unlikely(ieee80211_is_probe_resp(fc))) iwl_mvm_probe_resp_set_noa(mvm, skb); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3f2aad2e7436..b1c44fa63a06 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1094,11 +1094,6 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - /* Don't send NDPs when STA is connected HE */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) - return; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -1130,10 +1125,6 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - /* Don't send NDPs when connected HE */ - if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) - return; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; -- cgit v1.2.3-58-ga151 From 10a35c222bc6fdd71421e800003b4c4c02d41bba Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:32 +0300 Subject: mac80211: allow SMPS requests only in client mode The code currently allows this for AP mode, but then ignores it. Clarify that since the spec doesn't allow it in AP mode. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.0465f8bcbe32.Iba39fc559ecfa887be00a5f3beabd881e5c86e54@changeid Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 3d62a80b5790..2eb7641f5556 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020 Intel Corporation + * Copyright(c) 2020-2021 Intel Corporation */ #include @@ -555,17 +555,15 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION && - vif->type != NL80211_IFTYPE_AP)) + if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION)) return; - if (vif->type == NL80211_IFTYPE_STATION) { - if (sdata->u.mgd.driver_smps_mode == smps_mode) - return; - sdata->u.mgd.driver_smps_mode = smps_mode; - ieee80211_queue_work(&sdata->local->hw, - &sdata->u.mgd.request_smps_work); - } + if (sdata->u.mgd.driver_smps_mode == smps_mode) + return; + + sdata->u.mgd.driver_smps_mode = smps_mode; + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); } /* this might change ... don't want non-open drivers using it */ EXPORT_SYMBOL_GPL(ieee80211_request_smps); -- cgit v1.2.3-58-ga151 From 79ea0a5fad749dabfd7b8a1b73dd6662383762d1 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Fri, 18 Jun 2021 13:41:34 +0300 Subject: mac80211: move SMPS mode setting after ieee80211_prep_connection ieee80211_mgd_assoc calls ieee80211_prep_connection which might call ieee80211_prep_channel and set smps_mode to OFF. That will override the previous setting of smps_mode in ieee80211_mgd_assoc and HT SMPS will be set to "disabled" in the association request frame. Move the setting of smps_mode in ieee80211_mgd_assoc to after the call to ieee80211_prep_connection. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.d8e5cc4b527f.Icf3a67fffbdd8c408c0cadfe43f8f4cffdc90acb@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b1c44fa63a06..b33b155f3573 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5646,15 +5646,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, 2 * FILS_NONCE_LEN); assoc_data->bss = req->bss; - - if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { - if (ifmgd->powersave) - sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; - else - sdata->smps_mode = IEEE80211_SMPS_OFF; - } else - sdata->smps_mode = ifmgd->req_smps; - assoc_data->capability = req->bss->capability; assoc_data->supp_rates = bss->supp_rates; assoc_data->supp_rates_len = bss->supp_rates_len; @@ -5761,6 +5752,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (err) goto err_clear; + if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; + else + sdata->smps_mode = IEEE80211_SMPS_OFF; + } else { + sdata->smps_mode = ifmgd->req_smps; + } + rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); -- cgit v1.2.3-58-ga151 From 7d7b00759e56bd2c0ff8b1155cb00f452dfc1c5d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:33 +0300 Subject: mac80211: free skb in WEP error case If, for some strange reason, ieee80211_wep_encrypt() fails in ieee80211_send_auth() free the SKB instead of sending out the useless frame, in addition to the warning. This can't really happen since the SKB was freshly allocated. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.98f058d7a8b2.Ie605e6a10e72eae02f5734032826af48b85b6d11@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 060059ef9668..ee5410bfe9ec 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1693,7 +1693,10 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); - WARN_ON(err); + if (WARN_ON(err)) { + kfree_skb(skb); + return; + } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | -- cgit v1.2.3-58-ga151 From d6c375095ade4ea4d20ada1f020c821bf0bfe7fa Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 18 Jun 2021 13:41:30 +0300 Subject: mac80211: handle rate control (RC) racing with chanctx definition chanctx represents the current phy configuration and rate scale uses it for achieving max throughput, so if phy changes bandwidth to narrow bandwidth, RC should be _first_ updated to avoid using the wider bandwidth before updating the phy, and vice versa. We assume in the patch that station interface is always updated before updating phy context by calling ieee80211_vif_update_chandef. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.fc4e24496aa2.Ic40ea947c2f65739ea4b5fe3babd0a544240ced6@changeid Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 108 ++++++++++++++++++++++++++++++++++------------------ net/mac80211/mlme.c | 89 +------------------------------------------ 2 files changed, 71 insertions(+), 126 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 907bb1f748a1..76fc36a68750 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management + * Copyright 2020 - 2021 Intel Corporation */ #include @@ -308,8 +309,8 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, * the max of min required widths of all the interfaces bound to this * channel context. */ -void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; @@ -326,7 +327,7 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; - return; + return 0; } max_bw = ieee80211_get_chanctx_max_required_bw(local, &ctx->conf); @@ -337,17 +338,21 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ieee80211_chandef_downgrade(&min_def); if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def)) - return; + return 0; ctx->conf.min_def = min_def; if (!ctx->driver_present) - return; + return 0; - drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH); + return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; } +/* calling this function is assuming that station vif is updated to + * lates changes by calling ieee80211_vif_update_chandef + */ static void ieee80211_chan_bw_change(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool narrowed) { struct sta_info *sta; struct ieee80211_supported_band *sband = @@ -366,9 +371,16 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, continue; new_sta_bw = ieee80211_sta_cur_vht_bw(sta); + + /* nothing change */ if (new_sta_bw == sta->sta.bandwidth) continue; + /* vif changed to narrow BW and narrow BW for station wasn't + * requested or vise versa */ + if ((new_sta_bw < sta->sta.bandwidth) == !narrowed) + continue; + sta->sta.bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, IEEE80211_RC_BW_CHANGED); @@ -376,21 +388,34 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, rcu_read_unlock(); } -static void ieee80211_change_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *chandef) +/* + * recalc the min required chan width of the channel context, which is + * the max of min required widths of all the interfaces bound to this + * channel context. + */ +void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { - enum nl80211_chan_width width; + u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx); - if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { - ieee80211_recalc_chanctx_min_def(local, ctx); + if (!changed) return; - } - WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + /* check is BW narrowed */ + ieee80211_chan_bw_change(local, ctx, true); - width = ctx->conf.def.width; - ctx->conf.def = *chandef; + drv_change_chanctx(local, ctx, changed); + + /* check is BW wider */ + ieee80211_chan_bw_change(local, ctx, false); +} + +static void ieee80211_change_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx *old_ctx, + const struct cfg80211_chan_def *chandef) +{ + u32 changed; /* expected to handle only 20/40/80/160 channel widths */ switch (chandef->width) { @@ -405,19 +430,33 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, WARN_ON(1); } - if (chandef->width < width) - ieee80211_chan_bw_change(local, ctx); + /* Check maybe BW narrowed - we do this _before_ calling recalc_chanctx_min_def + * due to maybe not returning from it, e.g in case new context was added + * first time with all parameters up to date. + */ + ieee80211_chan_bw_change(local, old_ctx, true); + + if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { + ieee80211_recalc_chanctx_min_def(local, ctx); + return; + } - drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH); - ieee80211_recalc_chanctx_min_def(local, ctx); + WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + + ctx->conf.def = *chandef; + + /* check if min chanctx also changed */ + changed = IEEE80211_CHANCTX_CHANGE_WIDTH | + _ieee80211_recalc_chanctx_min_def(local, ctx); + drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } - if (chandef->width > width) - ieee80211_chan_bw_change(local, ctx); + /* check is BW wider */ + ieee80211_chan_bw_change(local, old_ctx, false); } static struct ieee80211_chanctx * @@ -450,7 +489,7 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (!compat) continue; - ieee80211_change_chanctx(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; } @@ -679,7 +718,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, if (!compat) return; - ieee80211_change_chanctx(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, ctx, compat); } static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, @@ -1107,13 +1146,12 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; - if (old_ctx->conf.def.width > new_ctx->conf.def.width) - ieee80211_chan_bw_change(local, new_ctx); + if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) + changed = BSS_CHANGED_BANDWIDTH; - ieee80211_change_chanctx(local, new_ctx, chandef); + ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef); - if (old_ctx->conf.def.width < new_ctx->conf.def.width) - ieee80211_chan_bw_change(local, new_ctx); + ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; @@ -1142,14 +1180,9 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); - if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) - changed = BSS_CHANGED_BANDWIDTH; - - ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef); - + ieee80211_recalc_chanctx_min_def(local, new_ctx); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); - ieee80211_recalc_chanctx_min_def(local, new_ctx); if (changed) ieee80211_bss_info_change_notify(sdata, changed); @@ -1188,7 +1221,7 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; - ieee80211_change_chanctx(local, new_ctx, chandef); + ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef); list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; @@ -1505,7 +1538,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx); - ieee80211_chan_bw_change(local, ctx); list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, reserved_chanctx_list) { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b33b155f3573..97efc20b9825 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -371,7 +371,6 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; - enum ieee80211_sta_rx_bandwidth new_sta_bw; u32 vht_cap_info = 0; int ret; @@ -450,35 +449,8 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return -EINVAL; } - switch (chandef.width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - new_sta_bw = IEEE80211_STA_RX_BW_20; - break; - case NL80211_CHAN_WIDTH_40: - new_sta_bw = IEEE80211_STA_RX_BW_40; - break; - case NL80211_CHAN_WIDTH_80: - new_sta_bw = IEEE80211_STA_RX_BW_80; - break; - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: - new_sta_bw = IEEE80211_STA_RX_BW_160; - break; - default: - return -EINVAL; - } - - if (new_sta_bw > sta->cur_max_bandwidth) - new_sta_bw = sta->cur_max_bandwidth; - - if (new_sta_bw < sta->sta.bandwidth) { - sta->sta.bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, - IEEE80211_RC_BW_CHANGED); - } - ret = ieee80211_vif_change_bandwidth(sdata, &chandef, changed); + if (ret) { sdata_info(sdata, "AP %pM changed bandwidth to incompatible one - disconnect\n", @@ -486,12 +458,6 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return ret; } - if (new_sta_bw > sta->sta.bandwidth) { - sta->sta.bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, - IEEE80211_RC_BW_CHANGED); - } - return 0; } @@ -1174,10 +1140,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) */ if (sdata->reserved_chanctx) { - struct ieee80211_supported_band *sband = NULL; - struct sta_info *mgd_sta = NULL; - enum ieee80211_sta_rx_bandwidth bw = IEEE80211_STA_RX_BW_20; - /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their @@ -1186,48 +1148,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) if (sdata->reserved_ready) goto out; - if (sdata->vif.bss_conf.chandef.width != - sdata->csa_chandef.width) { - /* - * For managed interface, we need to also update the AP - * station bandwidth and align the rate scale algorithm - * on the bandwidth change. Here we only consider the - * bandwidth of the new channel definition (as channel - * switch flow does not have the full HT/VHT/HE - * information), assuming that if additional changes are - * required they would be done as part of the processing - * of the next beacon from the AP. - */ - switch (sdata->csa_chandef.width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - default: - bw = IEEE80211_STA_RX_BW_20; - break; - case NL80211_CHAN_WIDTH_40: - bw = IEEE80211_STA_RX_BW_40; - break; - case NL80211_CHAN_WIDTH_80: - bw = IEEE80211_STA_RX_BW_80; - break; - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: - bw = IEEE80211_STA_RX_BW_160; - break; - } - - mgd_sta = sta_info_get(sdata, ifmgd->bssid); - sband = - local->hw.wiphy->bands[sdata->csa_chandef.chan->band]; - } - - if (sdata->vif.bss_conf.chandef.width > - sdata->csa_chandef.width) { - mgd_sta->sta.bandwidth = bw; - rate_control_rate_update(local, sband, mgd_sta, - IEEE80211_RC_BW_CHANGED); - } - ret = ieee80211_vif_use_reserved_context(sdata); if (ret) { sdata_info(sdata, @@ -1238,13 +1158,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) goto out; } - if (sdata->vif.bss_conf.chandef.width < - sdata->csa_chandef.width) { - mgd_sta->sta.bandwidth = bw; - rate_control_rate_update(local, sband, mgd_sta, - IEEE80211_RC_BW_CHANGED); - } - goto out; } -- cgit v1.2.3-58-ga151 From d8b261548dcf1058646cc48159c88d42d4b9a3b6 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Fri, 18 Jun 2021 13:41:35 +0300 Subject: mac80211: add to bss_conf if broadcast TWT is supported Add to struct ieee80211_bss_conf a twt_broadcast field. Set it to true if both STA and AP support broadcast TWT. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.f7c105237541.I50b302044e2b35e5ed4d3fb8bc7bd3d8bb89b1e1@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- net/mac80211/mlme.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c09cd0e4a6b3..5d3ce1bd5753 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2020 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #ifndef MAC80211_H @@ -526,6 +526,7 @@ struct ieee80211_fils_discovery { * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) * @twt_protected: does this BSS support protected TWT frames + * @twt_broadcast: does this BSS support broadcast TWT * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -642,6 +643,7 @@ struct ieee80211_bss_conf { bool twt_requester; bool twt_responder; bool twt_protected; + bool twt_broadcast; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 97efc20b9825..5e9f79beb8f3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3218,6 +3218,21 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, return 0; } +static bool ieee80211_twt_bcast_support(struct ieee80211_bss_conf *bss_conf, + struct ieee80211_supported_band *sband, + struct sta_info *sta) +{ + const struct ieee80211_sta_he_cap *own_he_cap = + ieee80211_get_he_sta_cap(sband); + + return bss_conf->he_support && + (sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_BCAST_TWT) && + own_he_cap && + (own_he_cap->he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_BCAST_TWT); +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len, @@ -3433,6 +3448,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->twt_protected = false; } + bss_conf->twt_broadcast = + ieee80211_twt_bcast_support(bss_conf, sband, sta); + if (bss_conf->he_support) { bss_conf->he_bss_color.color = le32_get_bits(elems->he_operation->he_oper_params, -- cgit v1.2.3-58-ga151 From dd3e4fc75b4ab8186a133cfe9d49666a2f8186e0 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Jun 2021 13:41:36 +0300 Subject: nl80211/cfg80211: add BSS color to NDP ranging parameters In NDP ranging, the initiator need to set the BSS color in the NDP to the BSS color of the responder. Add the BSS color as a parameter for NDP ranging. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.f097a6144b59.I27dec8b994df52e691925ea61be4dd4fa6d396c0@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++-- include/uapi/linux/nl80211.h | 6 +++++- net/wireless/pmsr.c | 12 ++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b3bc58ec9098..c6812945b4b8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #include @@ -3521,7 +3521,10 @@ struct cfg80211_pmsr_result { * If neither @trigger_based nor @non_trigger_based is set, * EDCA based ranging will be used. * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either - * @trigger_based or @non_trigger_based is set. + * @trigger_based or @non_trigger_based is set. + * @bss_color: the bss color of the responder. Optional. Set to zero to + * indicate the driver should set the BSS color. Only valid if + * @non_trigger_based or @trigger_based is set. * * See also nl80211 for the respective attribute documentation. */ @@ -3539,6 +3542,7 @@ struct cfg80211_pmsr_ftm_request_peer { u8 burst_duration; u8 ftms_per_burst; u8 ftmr_retries; + u8 bss_color; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f962c06e9818..771f238ccff1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -6912,6 +6912,9 @@ enum nl80211_peer_measurement_ftm_capa { * @NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK: negotiate for LMR feedback. Only * valid if either %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED or * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set. + * @NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR: optional. The BSS color of the + * responder. Only valid if %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED + * or %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED is set. * * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number @@ -6931,6 +6934,7 @@ enum nl80211_peer_measurement_ftm_req { NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED, NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK, + NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, /* keep last */ NUM_NL80211_PMSR_FTM_REQ_ATTR, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index d245968b74cb..328cf54bda82 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -168,6 +168,18 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]) { + if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR], + "FTM: BSS color set for EDCA based ranging"); + return -EINVAL; + } + + out->ftm.bss_color = + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); + } + return 0; } -- cgit v1.2.3-58-ga151 From 45daaa1318410794de956fb8e9d06aed2dbb23d0 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 18 Jun 2021 13:41:37 +0300 Subject: mac80211: Properly WARN on HW scan before restart The following race was possible: 1. The device driver requests HW restart. 2. A scan is requested from user space and is propagated to the driver. During this flow HW_SCANNING flag is set. 3. The thread that handles the HW restart is scheduled, and before starting the actual reconfiguration it checks that HW_SCANNING is not set. The flow does so without acquiring any lock, and thus the WARN fires. Fix this by checking that HW_SCANNING is on only after RTNL is acquired, i.e., user space scan request handling is no longer in transit. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.8238ab3e19ab.I2693c581c70251472b4f9089e37e06fb2c18268f@changeid Signed-off-by: Johannes Berg --- net/mac80211/main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 822ff388410e..cde142fa8cb3 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -257,14 +257,13 @@ static void ieee80211_restart_work(struct work_struct *work) /* wait for scan work complete */ flush_workqueue(local->workqueue); flush_work(&local->sched_scan_stopped_work); + flush_work(&local->radar_detected_work); + + rtnl_lock(); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); - flush_work(&local->radar_detected_work); - /* we might do interface manipulations, so need both */ - rtnl_lock(); - wiphy_lock(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { /* * XXX: there may be more work for other vif types and even -- cgit v1.2.3-58-ga151 From c74025f47ac855344d1188a4224a7af216843b22 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:44 +0300 Subject: mac80211: rearrange struct txq_info for fewer holes We can slightly decrease the size of struct txq_info by rearranging some fields for fewer holes, so do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.1bf019a1fe2e.Ib54622b8d6dc1a9a7dc484e573c073119450538b@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a7ce59150322..17068408b27d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #ifndef IEEE80211_I_H @@ -839,9 +839,12 @@ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; - struct sk_buff_head frags; - struct list_head schedule_order; + u16 schedule_round; + struct list_head schedule_order; + + struct sk_buff_head frags; + unsigned long flags; /* keep last! */ -- cgit v1.2.3-58-ga151 From 6516ee22f2a99efca7211ff61f23f778c988bfd4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:47 +0300 Subject: mac80211: improve AP disconnect message If the AP changes capability/bandwidth in some fashion, the message might be somewhat misleading and we don't know what really changed. Modify the message to speak about "caps/bw" instead of just "bandwidth", and print out the flags. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.dc22c48985fa.I4bf5fbc17ec783c21d4b50c8c35b1de390896ccd@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5e9f79beb8f3..6a599077e362 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -444,8 +444,8 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ)) || !cfg80211_chandef_valid(&chandef)) { sdata_info(sdata, - "AP %pM changed bandwidth in a way we can't support - disconnect\n", - ifmgd->bssid); + "AP %pM changed caps/bw in a way we can't support (0x%x/0x%x) - disconnect\n", + ifmgd->bssid, flags, ifmgd->flags); return -EINVAL; } -- cgit v1.2.3-58-ga151 From 64a8747238291c7c497517ab2590c473f708d9be Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:48 +0300 Subject: cfg80211: trace more information in assoc trace event Add more information to the assoc trace event so we can see more precisely what's going on and what options were used. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.86c58fca486d.Iabd8f036d2ef1d770fd20ed3ccd149f32154f430@changeid Signed-off-by: Johannes Berg --- net/wireless/rdev-ops.h | 12 +++++++++++- net/wireless/trace.h | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 8b1358d04ca2..b1d37f582dc6 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -464,8 +464,18 @@ static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { + const struct cfg80211_bss_ies *bss_ies; int ret; - trace_rdev_assoc(&rdev->wiphy, dev, req); + + /* + * Note: we might trace not exactly the data that's processed, + * due to races and the driver/mac80211 getting a newer copy. + */ + rcu_read_lock(); + bss_ies = rcu_dereference(req->bss->ies); + trace_rdev_assoc(&rdev->wiphy, dev, req, bss_ies); + rcu_read_unlock(); + ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 76b777d5903f..440bce5f0274 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1195,8 +1195,9 @@ TRACE_EVENT(rdev_auth, TRACE_EVENT(rdev_assoc, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_assoc_request *req), - TP_ARGS(wiphy, netdev, req), + struct cfg80211_assoc_request *req, + const struct cfg80211_bss_ies *bss_ies), + TP_ARGS(wiphy, netdev, req, bss_ies), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1204,6 +1205,17 @@ TRACE_EVENT(rdev_assoc, MAC_ENTRY(prev_bssid) __field(bool, use_mfp) __field(u32, flags) + __dynamic_array(u8, bss_elements, bss_ies->len) + __field(bool, bss_elements_bcon) + __field(u64, bss_elements_tsf) + __dynamic_array(u8, elements, req->ie_len) + __array(u8, ht_capa, sizeof(struct ieee80211_ht_cap)) + __array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap)) + __array(u8, vht_capa, sizeof(struct ieee80211_vht_cap)) + __array(u8, vht_capa_mask, sizeof(struct ieee80211_vht_cap)) + __dynamic_array(u8, fils_kek, req->fils_kek_len) + __dynamic_array(u8, fils_nonces, + req->fils_nonces ? 2 * FILS_NONCE_LEN : 0) ), TP_fast_assign( WIPHY_ASSIGN; @@ -1215,6 +1227,26 @@ TRACE_EVENT(rdev_assoc, MAC_ASSIGN(prev_bssid, req->prev_bssid); __entry->use_mfp = req->use_mfp; __entry->flags = req->flags; + if (bss_ies->len) + memcpy(__get_dynamic_array(bss_elements), + bss_ies->data, bss_ies->len); + __entry->bss_elements_bcon = bss_ies->from_beacon; + __entry->bss_elements_tsf = bss_ies->tsf; + if (req->ie) + memcpy(__get_dynamic_array(elements), + req->ie, req->ie_len); + memcpy(__entry->ht_capa, &req->ht_capa, sizeof(req->ht_capa)); + memcpy(__entry->ht_capa_mask, &req->ht_capa_mask, + sizeof(req->ht_capa_mask)); + memcpy(__entry->vht_capa, &req->vht_capa, sizeof(req->vht_capa)); + memcpy(__entry->vht_capa_mask, &req->vht_capa_mask, + sizeof(req->vht_capa_mask)); + if (req->fils_kek) + memcpy(__get_dynamic_array(fils_kek), + req->fils_kek, req->fils_kek_len); + if (req->fils_nonces) + memcpy(__get_dynamic_array(fils_nonces), + req->fils_nonces, 2 * FILS_NONCE_LEN); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", previous bssid: " MAC_PR_FMT ", use mfp: %s, flags: %u", -- cgit v1.2.3-58-ga151 From bac2fd3d753430032043098dd55543037e3f7a60 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:50 +0300 Subject: mac80211: remove use of ieee80211_get_he_sta_cap() All uses of ieee80211_get_he_sta_cap() were actually wrong, in net/mac80211/mlme.c they were wrong because that code is also used for P2P (which is a different interface type), in net/mac80211/main.c that should check all interface types. Fix all that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.ede114bc8b46.Ibcd9a5d98430e936344eb6d242ef8a65c2f59b74@changeid Signed-off-by: Johannes Berg --- net/mac80211/he.c | 4 +++- net/mac80211/main.c | 9 +++++++-- net/mac80211/mlme.c | 26 +++++++++++++++++--------- net/mac80211/util.c | 3 ++- 4 files changed, 29 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 0c0b970835ce..5984a9dac0bc 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -120,7 +120,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, memset(he_cap, 0, sizeof(*he_cap)); - if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + if (!he_cap_ie || + !ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) return; /* Make sure size is OK */ diff --git a/net/mac80211/main.c b/net/mac80211/main.c index cde142fa8cb3..95a8300da2d0 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1010,8 +1010,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; - if (!supp_he) - supp_he = !!ieee80211_get_he_sta_cap(sband); + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + supp_he = supp_he || (iftd && iftd->he_cap.has_he); + } /* HT, VHT, HE require QoS, thus >= 4 queues */ if (WARN_ON(local->hw.queues < IEEE80211_NUM_ACS && diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6a599077e362..9b9f6ab3b722 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -384,7 +384,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, /* don't check HE if we associated as non-HE station */ if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || - !ieee80211_get_he_sta_cap(sband)) + !ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) + he_oper = NULL; if (WARN_ON_ONCE(!sta)) @@ -642,7 +644,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); - he_cap = ieee80211_get_he_sta_cap(sband); + he_cap = ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); if (!he_cap || !reg_cap) return; @@ -3218,12 +3221,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, return 0; } -static bool ieee80211_twt_bcast_support(struct ieee80211_bss_conf *bss_conf, +static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *bss_conf, struct ieee80211_supported_band *sband, struct sta_info *sta) { const struct ieee80211_sta_he_cap *own_he_cap = - ieee80211_get_he_sta_cap(sband); + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); return bss_conf->he_support && (sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & @@ -3449,7 +3454,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } bss_conf->twt_broadcast = - ieee80211_twt_bcast_support(bss_conf, sband, sta); + ieee80211_twt_bcast_support(sdata, bss_conf, sband, sta); if (bss_conf->he_support) { bss_conf->he_bss_color.color = @@ -4851,11 +4856,13 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, } static bool -ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, +ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, const struct ieee80211_he_operation *he_op) { const struct ieee80211_sta_he_cap *sta_he_cap = - ieee80211_get_he_sta_cap(sband); + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); u16 ap_min_req_set; int i; @@ -4949,7 +4956,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!ieee80211_get_he_sta_cap(sband)) + if (!ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); @@ -5007,7 +5015,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, else he_oper = NULL; - if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + if (!ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ee5410bfe9ec..b352d1d87bf0 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1937,7 +1937,8 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, *offset = noffset; } - he_cap = ieee80211_get_he_sta_cap(sband); + he_cap = ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); if (he_cap) { pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) -- cgit v1.2.3-58-ga151 From ab4040df6efb87f92c7ec5bd65b5a093654d6a85 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 7 Jun 2021 23:00:47 +0800 Subject: mac80211: fix some spelling mistakes Fix some spelling mistakes in comments: freeed ==> freed addreses ==> addresses containging ==> containing capablity ==> capability sucess ==> success atleast ==> at least Signed-off-by: Zheng Yongjun Link: https://lore.kernel.org/r/20210607150047.2855962-1-zhengyongjun3@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 2 +- net/mac80211/mesh_hwmp.c | 2 +- net/mac80211/mesh_pathtbl.c | 2 +- net/mac80211/mesh_plink.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/rc80211_minstrel_ht.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 40492d1bd8fd..77080b4f87b8 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -134,7 +134,7 @@ struct mesh_path { * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr - * @walk_head: linked list containging all mesh_path objects + * @walk_head: linked list containing all mesh_path objects * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 3db514c4c63a..a05b615deb51 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1124,7 +1124,7 @@ enddiscovery: * forwarding information is found. * * Returns: 0 if the next hop was found and -ENOENT if the frame was queued. - * skb is freeed here if no mpath could be allocated. + * skb is freed here if no mpath could be allocated. */ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 620ecf922408..efbefcbac3ac 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -122,7 +122,7 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, hdr = (struct ieee80211_hdr *) skb->data; /* we preserve the previous mesh header and only add - * the new addreses */ + * the new addresses */ mshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); mshdr->flags = MESH_FLAGS_AE_A5_A6; memcpy(mshdr->eaddr1, hdr->addr3, ETH_ALEN); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index aca26df7587d..a6915847d78a 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -150,7 +150,7 @@ out: * mesh STA in a MBSS. Three HT protection modes are supported for now, non-HT * mixed mode, 20MHz-protection and no-protection mode. non-HT mixed mode is * selected if any non-HT peers are present in our MBSS. 20MHz-protection mode - * is selected if all peers in our 20/40MHz MBSS support HT and atleast one + * is selected if all peers in our 20/40MHz MBSS support HT and at least one * HT20 peer is present. Otherwise no-protection mode is selected. */ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 9b9f6ab3b722..b3123e00f118 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -585,7 +585,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; /* - * If some other vif is using the MU-MIMO capablity we cannot associate + * If some other vif is using the MU-MIMO capability we cannot associate * using MU-MIMO - this will lead to contradictions in the group-id * mechanism. * Ownership is defined since association request, in order to avoid diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 20f2e0bef96b..72b44d4c42d0 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -434,7 +434,7 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, unsigned int nsecs = 0, overhead = mi->overhead; unsigned int ampdu_len = 1; - /* do not account throughput if sucess prob is below 10% */ + /* do not account throughput if success prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; -- cgit v1.2.3-58-ga151 From aeddc05fa9cff35402fc569cc6e7fca6ee36bac1 Mon Sep 17 00:00:00 2001 From: Sosthène Guédon Date: Thu, 3 Jun 2021 19:39:39 +0200 Subject: nl80211: Fix typo pmsr->pmsr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was mis-spelled in the policy, fix that. Signed-off-by: Sosthène Guédon Link: https://lore.kernel.org/r/YLkT27RG0DaWLUot@arch.localdomain Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c62d61d8aa02..4c61cf66fe05 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -330,7 +330,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { }; static const struct nla_policy -nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { +nl80211_pmsr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = @@ -345,7 +345,7 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_PEERS] = - NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy), + NLA_POLICY_NESTED_ARRAY(nl80211_pmsr_peer_attr_policy), }; static const struct nla_policy -- cgit v1.2.3-58-ga151 From b767ecdaf98a999ef710f4f290bdd89257a90db0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:56 +0300 Subject: cfg80211: reg: improve bad regulatory warning There's a WARN_ON here but it says nothing, and the later dump of the regdomain aren't usually printed. As a first step, include the regdomain code in the WARN_ON message, just like in other similar instances. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.853ffdd6c62b.I63e37b2ab184ee3653686e4df4dd23eb303687d2@changeid Signed-off-by: Johannes Berg --- net/wireless/reg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0406ce7334fa..2f654a4fc53b 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3975,7 +3975,9 @@ static int __regulatory_set_wiphy_regd(struct wiphy *wiphy, "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n")) return -EPERM; - if (WARN(!is_valid_rd(rd), "Invalid regulatory domain detected\n")) { + if (WARN(!is_valid_rd(rd), + "Invalid regulatory domain detected: %c%c\n", + rd->alpha2[0], rd->alpha2[1])) { print_regdomain_info(rd); return -EINVAL; } -- cgit v1.2.3-58-ga151 From be989891e4f2ff5649bf22ab05a7cdd3a287e34b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:39 +0300 Subject: cfg80211: add cfg80211_any_usable_channels() This helper function checks if there are any usable channels on any of the given bands with the given properties (as expressed by disallowed channel flags). Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.2b613addaa85.Idaf8b859089490537878a7de5c7453a873a3f638@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 +++++++++++ net/wireless/chan.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b4ef45d49b0..481e4e24800f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -894,6 +894,17 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) return chandef->chan->max_power; } +/** + * cfg80211_any_usable_channels - check for usable channels + * @wiphy: the wiphy to check for + * @band_mask: which bands to check on + * @prohibited_flags: which channels to not consider usable, + * %IEEE80211_CHAN_DISABLED is always taken into account + */ +bool cfg80211_any_usable_channels(struct wiphy *wiphy, + unsigned long band_mask, + u32 prohibited_flags); + /** * enum survey_info_flags - survey information flags * diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 472c895823a4..869c43d4414c 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018-2020 Intel Corporation + * Copyright 2018-2021 Intel Corporation */ #include @@ -1339,3 +1339,34 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, WARN_ON(1); } } + +bool cfg80211_any_usable_channels(struct wiphy *wiphy, + unsigned long sband_mask, + u32 prohibited_flags) +{ + int idx; + + prohibited_flags |= IEEE80211_CHAN_DISABLED; + + for_each_set_bit(idx, &sband_mask, NUM_NL80211_BANDS) { + struct ieee80211_supported_band *sband = wiphy->bands[idx]; + int chanidx; + + if (!sband) + continue; + + for (chanidx = 0; chanidx < sband->n_channels; chanidx++) { + struct ieee80211_channel *chan; + + chan = &sband->channels[chanidx]; + + if (chan->flags & prohibited_flags) + continue; + + return true; + } + } + + return false; +} +EXPORT_SYMBOL(cfg80211_any_usable_channels); -- cgit v1.2.3-58-ga151 From 0bc47057b54b73e5f6d36bfc7c5c96e15be1f221 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:40 +0300 Subject: mac80211: conditionally advertise HE in probe requests While building probe requests, only enable HE capability if there are actually any channels in the band with HE enabled, otherwise we're not really capable. We're doing the same in association requests, so doing it here makes it consistent. This also makes HE not appear available if it isn't due to regulatory constraints. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.b5513f2af335.Ic01862678712ae4238cea43ad2185928865efad2@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b352d1d87bf0..fab4bb1948e3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1939,7 +1939,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, he_cap = ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(&sdata->vif)); - if (he_cap) { + if (he_cap && + cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + IEEE80211_CHAN_NO_HE)) { pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; -- cgit v1.2.3-58-ga151 From 1b7b3ac8ff3317cdcf07a1c413de9bdb68019c2b Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Fri, 18 Jun 2021 13:41:46 +0300 Subject: cfg80211: set custom regdomain after wiphy registration We used to set regulatory info before the registration of the device and then the regulatory info didn't get set, because the device isn't registered so there isn't a device to set the regulatory info for. So set the regulatory info after the device registration. Call reg_process_self_managed_hints() once again after the device registration because it does nothing before it. Signed-off-by: Miri Korenblit Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.c96eadcffe80.I86799c2c866b5610b4cf91115c21d8ceb525c5aa@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 8 ++++---- net/wireless/reg.c | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 41c15cc7791f..03323121ca50 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -928,9 +928,6 @@ int wiphy_register(struct wiphy *wiphy) return res; } - /* set up regulatory info */ - wiphy_regulatory_register(wiphy); - list_add_rcu(&rdev->list, &cfg80211_rdev_list); cfg80211_rdev_list_generation++; @@ -941,6 +938,9 @@ int wiphy_register(struct wiphy *wiphy) cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + /* set up regulatory info */ + wiphy_regulatory_register(wiphy); + if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f654a4fc53b..c2d0ff7f089f 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4051,6 +4051,7 @@ void wiphy_regulatory_register(struct wiphy *wiphy) wiphy_update_regulatory(wiphy, lr->initiator); wiphy_all_share_dfs_chan_state(wiphy); + reg_process_self_managed_hints(); } void wiphy_regulatory_deregister(struct wiphy *wiphy) -- cgit v1.2.3-58-ga151 From f4f8650588d35deafaa4a4e28cceb3557a71e711 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:52 +0300 Subject: cfg80211: allow advertising vendor-specific capabilities There may be cases where vendor-specific elements need to be used over the air. Rather than have driver or firmware add them and possibly cause problems that way, add them to the iftype-data band capabilities. This way we can advertise to userspace first, and use them in mac80211 next. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.e8c4f0347276.Iee5964682b3e9ec51fc1cd57a7c62383eaf6ddd7@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 5 +++++ 3 files changed, 15 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 481e4e24800f..c93a2cd77920 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -371,11 +371,18 @@ struct ieee80211_sta_he_cap { * @he_cap: holds the HE capabilities * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a * 6 GHz band channel (and 0 may be valid value). + * @vendor_elems: vendor element(s) to advertise + * @vendor_elems.data: vendor element(s) data + * @vendor_elems.len: vendor element(s) length */ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct { + const u8 *data; + unsigned int len; + } vendor_elems; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 771f238ccff1..db474994fa73 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3654,6 +3654,8 @@ enum nl80211_mpath_info { * defined * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), * given for all 6 GHz band channels + * @NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS: vendor element capabilities that are + * advertised on this band/for this iftype (binary) * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use */ enum nl80211_band_iftype_attr { @@ -3665,6 +3667,7 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4c61cf66fe05..50eb405b0690 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1731,6 +1731,11 @@ nl80211_send_iftype_data(struct sk_buff *msg, &iftdata->he_6ghz_capa)) return -ENOBUFS; + if (iftdata->vendor_elems.data && iftdata->vendor_elems.len && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, + iftdata->vendor_elems.len, iftdata->vendor_elems.data)) + return -ENOBUFS; + return 0; } -- cgit v1.2.3-58-ga151 From 9bd6a83e53a7a4d82f95b354856b64f4359cdddc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:53 +0300 Subject: mac80211: add vendor-specific capabilities to assoc request When sending an association request, add any vendor specific capabilities at the end of the frame. This way, mac80211 is still completely in charge of building the frame, but drivers can determine what should be added depending on the band and interface type. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.80d716d69a5f.I28097ff19be6b22aebdc33a72795d2662755d41f@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b3123e00f118..0559c6b6ee71 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2020 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #include @@ -681,6 +681,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) u32 rates = 0; __le16 listen_int; struct element *ext_capa = NULL; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sband_iftype_data *iftd; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) @@ -725,6 +727,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) } } + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + skb = alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + /* bit too much but doesn't matter */ 2 + assoc_data->ssid_len + /* SSID */ @@ -739,7 +743,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + - 9, /* WMM */ + 9 + /* WMM */ + (iftd ? iftd->vendor_elems.len : 0), GFP_KERNEL); if (!skb) return; @@ -1012,6 +1017,9 @@ skip_rates: ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); } + if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) + skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); + /* add any remaining custom (i.e. vendor specific here) IEs */ if (assoc_data->ie_len) { noffset = assoc_data->ie_len; -- cgit v1.2.3-58-ga151 From 52bb205213a8169cc40e1eba96483a9e488c17d3 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 18 Jun 2021 13:41:41 +0300 Subject: cfg80211: Support hidden AP discovery over 6GHz band To discover a hidden AP on the 6GHz band, the probe request sent to the AP needs to include the AP's SSID, as some APs would not respond with a probe response based only on short SSID match. To support hidden AP discovery over the 6GHz band, when constructing the specific 6GHz band scan also include SSIDs that were part of the original scan request, so these can be used in the probe requests transmitted during scan. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.218df9d3203c.Ice0f7a2f6a65f1f9710b7898591481baeefaf490@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a3941b19b516..f03c7ac8e184 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #include #include @@ -757,7 +757,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) } request = kzalloc(struct_size(request, channels, n_channels) + - sizeof(*request->scan_6ghz_params) * count, + sizeof(*request->scan_6ghz_params) * count + + sizeof(*request->ssids) * rdev_req->n_ssids, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); @@ -848,9 +849,18 @@ skip: if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; - rdev->int_scan_req = request; + /* + * Add the ssids from the parent scan request to the new scan + * request, so the driver would be able to use them in its + * probe requests to discover hidden APs on PSC channels. + */ + request->ssids = (void *)&request->channels[request->n_channels]; + request->n_ssids = rdev_req->n_ssids; + memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * + request->n_ssids); + /* * If this scan follows a previous scan, save the scan start * info from the first part of the scan -- cgit v1.2.3-58-ga151 From 7d29bc50b30e58102dd0e7a6beb1a72cc41029c5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:42 +0300 Subject: mac80211: always include HE 6GHz capability in probe request If HE/6GHz is available (thus we consider dot11HE6GOptionImplemented to be true), then always include the corresponding capability in the probe request as required by the spec. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.25ee4a54a7d0.I8cebd799c85524c8123a11941a104dbdefc03762@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index fab4bb1948e3..d1ecac00bddb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1945,8 +1945,18 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + } + + if (cfg80211_any_usable_channels(local->hw.wiphy, + BIT(NL80211_BAND_6GHZ), + IEEE80211_CHAN_NO_HE)) { + struct ieee80211_supported_band *sband6; + + sband6 = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + he_cap = ieee80211_get_he_iftype_cap(sband6, + ieee80211_vif_type_p2p(&sdata->vif)); - if (sband->band == NL80211_BAND_6GHZ) { + if (he_cap) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); -- cgit v1.2.3-58-ga151 From 15fae3410f1d879b18e08fe8ef293d538549dfcb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:55 +0300 Subject: mac80211: notify driver on mgd TX completion We have mgd_prepare_tx(), but sometimes drivers may want/need to take action when the exchange finishes, whether successfully or not. Add a notification to the driver on completion, i.e. call the new method mgd_complete_tx(). To unify the two scenarios, and to add more information, make both of them take a struct that has the duration (prepare only), subtype (both) and success (complete only). Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.5d94e78f6230.I6dc979606b6f28701b740d7aab725f7853a5a155@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath9k/main.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 6 +- drivers/net/wireless/realtek/rtw88/mac80211.c | 2 +- include/net/mac80211.h | 28 ++++++++- net/mac80211/driver-ops.h | 26 +++++++-- net/mac80211/mlme.c | 71 +++++++++++++++++------ net/mac80211/trace.h | 33 +++++++++-- 7 files changed, 133 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 97c3a53f9cef..139831539da3 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2654,7 +2654,7 @@ static void ath9k_unassign_vif_chanctx(struct ieee80211_hw *hw, static void ath9k_mgd_prepare_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 duration) + struct ieee80211_prep_tx_info *info) { struct ath_softc *sc = hw->priv; struct ath_common *common = ath9k_hw_common(sc->sc_ah); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 607d5d564928..cc78f306ac1a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -3306,14 +3306,14 @@ static int iwl_mvm_mac_conf_tx(struct ieee80211_hw *hw, static void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 req_duration) + struct ieee80211_prep_tx_info *info) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); u32 duration = IWL_MVM_TE_SESSION_PROTECTION_MAX_TIME_MS; u32 min_duration = IWL_MVM_TE_SESSION_PROTECTION_MIN_TIME_MS; - if (req_duration > duration) - duration = req_duration; + if (info->duration > duration) + duration = info->duration; mutex_lock(&mvm->mutex); /* Try really hard to protect the session and hear a beacon diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index 333df6b38113..d8718b253f0b 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -629,7 +629,7 @@ static void rtw_ops_sw_scan_complete(struct ieee80211_hw *hw, static void rtw_ops_mgd_prepare_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 duration) + struct ieee80211_prep_tx_info *info) { struct rtw_dev *rtwdev = hw->priv; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5d3ce1bd5753..9afbcac61c2a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3346,6 +3346,21 @@ enum ieee80211_reconfig_type { IEEE80211_RECONFIG_TYPE_SUSPEND, }; +/** + * struct ieee80211_prep_tx_info - prepare TX information + * @duration: if non-zero, hint about the required duration, + * only used with the mgd_prepare_tx() method. + * @subtype: frame subtype (auth, (re)assoc, deauth, disassoc) + * @success: whether the frame exchange was successful, only + * used with the mgd_complete_tx() method, and then only + * valid for auth and (re)assoc. + */ +struct ieee80211_prep_tx_info { + u16 duration; + u16 subtype; + u8 success:1; +}; + /** * struct ieee80211_ops - callbacks from mac80211 to the driver * @@ -3758,9 +3773,13 @@ enum ieee80211_reconfig_type { * frame in case that no beacon was heard from the AP/P2P GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. - * If duration is greater than zero, mac80211 hints to the driver the - * duration for which the operation is requested. + * Additional information is passed in the &struct ieee80211_prep_tx_info + * data. If duration there is greater than zero, mac80211 hints to the + * driver the duration for which the operation is requested. * The callback is optional and can (should!) sleep. + * @mgd_complete_tx: Notify the driver that the response frame for a previously + * transmitted frame announced with @mgd_prepare_tx was received, the data + * is filled similarly to @mgd_prepare_tx though the duration is not used. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's @@ -4111,7 +4130,10 @@ struct ieee80211_ops { void (*mgd_prepare_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 duration); + struct ieee80211_prep_tx_info *info); + void (*mgd_complete_tx)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 604ca59937f0..bcb7cc06db3d 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018 - 2019 Intel Corporation +* Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -821,7 +821,7 @@ drv_allow_buffered_frames(struct ieee80211_local *local, static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - u16 duration) + struct ieee80211_prep_tx_info *info) { might_sleep(); @@ -829,9 +829,27 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); - trace_drv_mgd_prepare_tx(local, sdata, duration); + trace_drv_mgd_prepare_tx(local, sdata, info->duration, + info->subtype, info->success); if (local->ops->mgd_prepare_tx) - local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration); + local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); + trace_drv_return_void(local); +} + +static inline void drv_mgd_complete_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_prep_tx_info *info) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); + + trace_drv_mgd_complete_tx(local, sdata, info->duration, + info->subtype, info->success); + if (local->ops->mgd_complete_tx) + local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0559c6b6ee71..a0572ce99826 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -683,6 +683,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) struct element *ext_capa = NULL; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); const struct ieee80211_sband_iftype_data *iftd; + struct ieee80211_prep_tx_info info = {}; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) @@ -784,12 +785,14 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) mgmt->u.reassoc_req.listen_interval = listen_int; memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_bssid, ETH_ALEN); + info.subtype = IEEE80211_STYPE_REASSOC_REQ; } else { skb_put(skb, 4); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ); mgmt->u.assoc_req.capab_info = cpu_to_le16(capab); mgmt->u.assoc_req.listen_interval = listen_int; + info.subtype = IEEE80211_STYPE_ASSOC_REQ; } /* SSID */ @@ -1037,7 +1040,7 @@ skip_rates: ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); ifmgd->assoc_req_ies_len = pos - ie_start; - drv_mgd_prepare_tx(local, sdata, 0); + drv_mgd_prepare_tx(local, sdata, &info); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -2256,6 +2259,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; u32 changed = 0; + struct ieee80211_prep_tx_info info = { + .subtype = stype, + }; sdata_assert_lock(sdata); @@ -2305,8 +2311,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, * driver requested so. */ if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && - !ifmgd->have_beacon) - drv_mgd_prepare_tx(sdata->local, sdata, 0); + !ifmgd->have_beacon) { + drv_mgd_prepare_tx(sdata->local, sdata, &info); + } ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, ifmgd->bssid, stype, reason, @@ -2317,6 +2324,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (tx) ieee80211_flush_queues(local, sdata, false); + drv_mgd_complete_tx(sdata->local, sdata, &info); + /* clear bssid only after building the needed mgmt frames */ eth_zero_addr(ifmgd->bssid); @@ -2867,6 +2876,9 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, u8 *pos; struct ieee802_11_elems elems; u32 tx_flags = 0; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; pos = mgmt->u.auth.variable; ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, @@ -2874,7 +2886,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, if (!elems.challenge) return; auth_data->expected_transaction = 4; - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -2927,6 +2939,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, }; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; sdata_assert_lock(sdata); @@ -2955,7 +2970,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, ifmgd->auth_data->expected_transaction); - return; + goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { @@ -2966,7 +2981,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || status_code == WLAN_STATUS_SAE_PK)))) - return; + goto notify_driver; sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -2974,7 +2989,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); - return; + goto notify_driver; } switch (ifmgd->auth_data->algorithm) { @@ -2996,10 +3011,11 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, default: WARN_ONCE(1, "invalid auth alg %d", ifmgd->auth_data->algorithm); - return; + goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; + info.success = 1; drv_event_callback(sdata->local, sdata, &event); if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && @@ -3013,6 +3029,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); +notify_driver: + drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ @@ -3634,6 +3652,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, }; + struct ieee80211_prep_tx_info info = {}; sdata_assert_lock(sdata); @@ -3663,6 +3682,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, aid = 0; /* TODO */ } + /* + * Note: this may not be perfect, AP might misbehave - if + * anyone needs to rely on perfect complete notification + * with the exact right subtype, then we need to track what + * we actually transmitted. + */ + info.subtype = reassoc ? IEEE80211_STYPE_REASSOC_REQ : + IEEE80211_STYPE_ASSOC_REQ; + sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", reassoc ? "Rea" : "A", mgmt->sa, @@ -3688,7 +3716,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); - return; + goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { @@ -3703,7 +3731,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, cbss); - return; + goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); @@ -3721,10 +3749,14 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->tx_conf[ac].uapsd) uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; + + info.success = 1; } cfg80211_rx_assoc_resp(sdata->dev, cbss, (u8 *)mgmt, len, uapsd_queues, ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len); +notify_driver: + drv_mgd_complete_tx(sdata->local, sdata, &info); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, @@ -4343,7 +4375,9 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) u32 tx_flags = 0; u16 trans = 1; u16 status = 0; - u16 prepare_tx_duration = 0; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; sdata_assert_lock(sdata); @@ -4366,10 +4400,9 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) } if (auth_data->algorithm == WLAN_AUTH_SAE) - prepare_tx_duration = - jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); + info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); - drv_mgd_prepare_tx(local, sdata, prepare_tx_duration); + drv_mgd_prepare_tx(local, sdata, &info); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->bss->bssid, auth_data->tries, @@ -5792,6 +5825,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_DEAUTH, + }; if (ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->bss->bssid, req->bssid)) { @@ -5800,7 +5836,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, @@ -5809,7 +5845,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); - + drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } @@ -5820,7 +5856,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, @@ -5844,6 +5880,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); + drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8fcc39056402..f6ef15366938 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH -* Copyright (C) 2018 - 2020 Intel Corporation +* Copyright (C) 2018 - 2021 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -1461,31 +1461,52 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames, TP_ARGS(local, sta, tids, num_frames, reason, more_data) ); -TRACE_EVENT(drv_mgd_prepare_tx, +DECLARE_EVENT_CLASS(mgd_prepare_complete_tx_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - u16 duration), + u16 duration, u16 subtype, bool success), - TP_ARGS(local, sdata, duration), + TP_ARGS(local, sdata, duration, subtype, success), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, duration) + __field(u16, subtype) + __field(u8, success) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->duration = duration; + __entry->subtype = subtype; + __entry->success = success; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " duration: %u", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration + LOCAL_PR_FMT VIF_PR_FMT " duration: %u, subtype:0x%x, success:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration, + __entry->subtype, __entry->success ) ); +DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_prepare_tx, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 duration, u16 subtype, bool success), + + TP_ARGS(local, sdata, duration, subtype, success) +); + +DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_complete_tx, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 duration, u16 subtype, bool success), + + TP_ARGS(local, sdata, duration, subtype, success) +); + DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), -- cgit v1.2.3-58-ga151 From 65be6aa36ded2d2e3bf5058f4d3385b5a2a7ef2e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Jun 2021 13:05:47 +0200 Subject: mac80211: add HE 6 GHz capability only if supported The HE 6 GHz capability should only be included if there are actually available channels on 6 GHz, and if that's the case we need to get it from the 6 GHz band data, not whatever other band we're on now. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.290bf5c87030.I178aff1c3a6e32456d4ac9238e4a2eb47d209ccd@changeid Link: https://lore.kernel.org/r/iwlwifi.20210618133832.05e935e8dd98.I83ff7eb2ae8ebdf2e30c4fa2461344d9e569f599@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d1ecac00bddb..05e96212b104 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * utilities for mac80211 */ @@ -2960,12 +2960,15 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, u8 *pos; u16 cap; - sband = ieee80211_get_sband(sdata); - if (!sband) + if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, + BIT(NL80211_BAND_6GHZ), + IEEE80211_CHAN_NO_HE)) return; + sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + iftd = ieee80211_get_sband_iftype_data(sband, iftype); - if (WARN_ON(!iftd)) + if (!iftd) return; /* Check for device HE 6 GHz capability before adding element */ -- cgit v1.2.3-58-ga151 From 6e899fa027addf2dd069714184c58a7c8c4b3030 Mon Sep 17 00:00:00 2001 From: Bassem Dawood Date: Sat, 27 Feb 2021 16:58:15 +1100 Subject: mac80211: Enable power save after receiving NULL packet ACK Trigger dynamic_ps_timer to re-evaluate power saving once a null function packet (with PM = 1) is ACKed, otherwise dynamic PS is not enabled at that point. Signed-off-by: Bassem Dawood Link: https://lore.kernel.org/r/20210227055815.14838-1-bassem@morsemicro.com [reformatting] Signed-off-by: Johannes Berg --- net/mac80211/status.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9baf185ee4c7..b6ef96a25eac 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -1006,12 +1006,11 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { - if (info->flags & IEEE80211_TX_STAT_ACK) { + if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; - } else - mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(10)); + mod_timer(&local->dynamic_ps_timer, + jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false); -- cgit v1.2.3-58-ga151 From 95f83ee8d857f006813755e89a126f1048b001e8 Mon Sep 17 00:00:00 2001 From: Abinaya Kalaiselvan Date: Wed, 23 Jun 2021 20:10:44 +0530 Subject: mac80211: fix NULL ptr dereference during mesh peer connection for non HE devices "sband->iftype_data" is not assigned with any value for non HE supported devices, which causes NULL pointer access during mesh peer connection in those devices. Fix this by accessing the pointer after HE capabilities condition check. Cc: stable@vger.kernel.org Fixes: 7f7aa94bcaf0 (mac80211: reduce peer HE MCS/NSS to own capabilities) Signed-off-by: Abinaya Kalaiselvan Link: https://lore.kernel.org/r/1624459244-4497-1-git-send-email-akalaise@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/he.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 5984a9dac0bc..c05af7018f79 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -111,7 +111,7 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; - struct ieee80211_sta_he_cap own_he_cap = sband->iftype_data->he_cap; + struct ieee80211_sta_he_cap own_he_cap; struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; u8 he_ppe_size; u8 mcs_nss_size; @@ -125,6 +125,8 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, ieee80211_vif_type_p2p(&sdata->vif))) return; + own_he_cap = sband->iftype_data->he_cap; + /* Make sure size is OK */ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); he_ppe_size = -- cgit v1.2.3-58-ga151 From 744757e46bf13ec3a7b3507d17ab3faab9516d43 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 23 Jun 2021 21:48:25 +0800 Subject: mac80211: remove iwlwifi specific workaround NDPs of null_response Remove the remaining workaround that is not removed by the commit e41eb3e408de ("mac80211: remove iwlwifi specific workaround that broke sta NDP tx") Fixes: 41cbb0f5a295 ("mac80211: add support for HE") Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210623134826.10318-1-pkshih@realtek.com Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 641a6657d0c9..e18c3855f616 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1398,11 +1398,6 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; - /* Don't send NDPs when STA is connected HE */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) - return; - if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | -- cgit v1.2.3-58-ga151 From 2832943c789aa6a89eb3d1cf1a466e817ae451a7 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 23 Jun 2021 21:48:26 +0800 Subject: Revert "mac80211: HE STA disassoc due to QOS NULL not sent" This reverts commit f39b07fdfb68 ("mac80211: HE STA disassoc due to QOS NULL not sent") Since iwlwifi specific workaround, which blocks to send NDP, is removed, we can revert this commit. Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20210623134826.10318-2-pkshih@realtek.com Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a0572ce99826..a00f11a33699 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2541,10 +2541,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) - ifmgd->probe_send_count--; - else - ieee80211_send_nullfunc(sdata->local, sdata, false); + ieee80211_send_nullfunc(sdata->local, sdata, false); } else { int ssid_len; -- cgit v1.2.3-58-ga151 From 2433647bc8d983a543e7d31b41ca2de1c7e2c198 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 23 Jun 2021 15:47:55 +0200 Subject: mac80211: Switch to a virtual time-based airtime scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This switches the airtime scheduler in mac80211 to use a virtual time-based scheduler instead of the round-robin scheduler used before. This has a couple of advantages: - No need to sync up the round-robin scheduler in firmware/hardware with the round-robin airtime scheduler. - If several stations are eligible for transmission we can schedule both of them; no need to hard-block the scheduling rotation until the head of the queue has used up its quantum. - The check of whether a station is eligible for transmission becomes simpler (in ieee80211_txq_may_transmit()). The drawback is that scheduling becomes slightly more expensive, as we need to maintain an rbtree of TXQs sorted by virtual time. This means that ieee80211_register_airtime() becomes O(logN) in the number of currently scheduled TXQs because it can change the order of the scheduled stations. We mitigate this overhead by only resorting when a station changes position in the tree, and hopefully N rarely grows too big (it's only TXQs currently backlogged, not all associated stations), so it shouldn't be too big of an issue. To prevent divisions in the fast path, we maintain both station sums and pre-computed reciprocals of the sums. This turns the fast-path operation into a multiplication, with divisions only happening as the number of active stations change (to re-compute the current sum of all active station weights). To prevent this re-computation of the reciprocal from happening too frequently, we use a time-based notion of station activity, instead of updating the weight every time a station gets scheduled or de-scheduled. As queues can oscillate between empty and occupied quite frequently, this can significantly cut down on the number of re-computations. It also has the added benefit of making the station airtime calculation independent on whether the queue happened to have drained at the time an airtime value was accounted. Co-developed-by: Yibo Zhao Signed-off-by: Yibo Zhao Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20210623134755.235545-1-toke@redhat.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 17 +- net/mac80211/cfg.c | 35 +++- net/mac80211/debugfs.c | 70 ++++++-- net/mac80211/debugfs_netdev.c | 32 +++- net/mac80211/debugfs_sta.c | 24 +-- net/mac80211/ieee80211_i.h | 182 +++++++++++++++++++-- net/mac80211/iface.c | 3 + net/mac80211/main.c | 11 +- net/mac80211/rx.c | 6 +- net/mac80211/sta_info.c | 67 +++++--- net/mac80211/sta_info.h | 11 +- net/mac80211/status.c | 19 +++ net/mac80211/tx.c | 366 ++++++++++++++++++++++++++++++------------ 13 files changed, 653 insertions(+), 190 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9afbcac61c2a..d8a1d09a2141 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6605,9 +6605,6 @@ static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac) { } -void __ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq, bool force); - /** * ieee80211_schedule_txq - schedule a TXQ for transmission * @@ -6620,11 +6617,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * The driver may call this function if it has buffered packets for * this TXQ internally. */ -static inline void -ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq) -{ - __ieee80211_schedule_txq(hw, txq, true); -} +void ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq() @@ -6636,12 +6629,8 @@ ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq) * The driver may set force=true if it has buffered packets for this TXQ * internally. */ -static inline void -ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, - bool force) -{ - __ieee80211_schedule_txq(hw, txq, force); -} +void ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, + bool force); /** * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0d29a9d1f910..84cc7733ea66 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1442,6 +1442,38 @@ static void sta_apply_mesh_params(struct ieee80211_local *local, #endif } +static void sta_apply_airtime_params(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ + u8 ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + struct airtime_sched_info *air_sched = &local->airtime[ac]; + struct airtime_info *air_info = &sta->airtime[ac]; + struct txq_info *txqi; + u8 tid; + + spin_lock_bh(&air_sched->lock); + for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) { + if (air_info->weight == params->airtime_weight || + !sta->sta.txq[tid] || + ac != ieee80211_ac_from_tid(tid)) + continue; + + airtime_weight_set(air_info, params->airtime_weight); + + txqi = to_txq_info(sta->sta.txq[tid]); + if (RB_EMPTY_NODE(&txqi->schedule_order)) + continue; + + ieee80211_update_airtime_weight(local, air_sched, + 0, true); + } + spin_unlock_bh(&air_sched->lock); + } +} + static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) @@ -1629,7 +1661,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta_apply_mesh_params(local, sta, params); if (params->airtime_weight) - sta->airtime_weight = params->airtime_weight; + sta_apply_airtime_params(local, sta, params); + /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index fc34ae2b604c..8dbfe325ee66 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -216,14 +216,14 @@ static ssize_t aql_txq_limit_read(struct file *file, "VI %u %u\n" "BE %u %u\n" "BK %u %u\n", - local->aql_txq_limit_low[IEEE80211_AC_VO], - local->aql_txq_limit_high[IEEE80211_AC_VO], - local->aql_txq_limit_low[IEEE80211_AC_VI], - local->aql_txq_limit_high[IEEE80211_AC_VI], - local->aql_txq_limit_low[IEEE80211_AC_BE], - local->aql_txq_limit_high[IEEE80211_AC_BE], - local->aql_txq_limit_low[IEEE80211_AC_BK], - local->aql_txq_limit_high[IEEE80211_AC_BK]); + local->airtime[IEEE80211_AC_VO].aql_txq_limit_low, + local->airtime[IEEE80211_AC_VO].aql_txq_limit_high, + local->airtime[IEEE80211_AC_VI].aql_txq_limit_low, + local->airtime[IEEE80211_AC_VI].aql_txq_limit_high, + local->airtime[IEEE80211_AC_BE].aql_txq_limit_low, + local->airtime[IEEE80211_AC_BE].aql_txq_limit_high, + local->airtime[IEEE80211_AC_BK].aql_txq_limit_low, + local->airtime[IEEE80211_AC_BK].aql_txq_limit_high); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } @@ -255,11 +255,11 @@ static ssize_t aql_txq_limit_write(struct file *file, if (ac >= IEEE80211_NUM_ACS) return -EINVAL; - q_limit_low_old = local->aql_txq_limit_low[ac]; - q_limit_high_old = local->aql_txq_limit_high[ac]; + q_limit_low_old = local->airtime[ac].aql_txq_limit_low; + q_limit_high_old = local->airtime[ac].aql_txq_limit_high; - local->aql_txq_limit_low[ac] = q_limit_low; - local->aql_txq_limit_high[ac] = q_limit_high; + local->airtime[ac].aql_txq_limit_low = q_limit_low; + local->airtime[ac].aql_txq_limit_high = q_limit_high; mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { @@ -382,6 +382,46 @@ static const struct file_operations force_tx_status_ops = { .llseek = default_llseek, }; +static ssize_t airtime_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[200]; + u64 v_t[IEEE80211_NUM_ACS]; + u64 wt[IEEE80211_NUM_ACS]; + int len = 0, ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + spin_lock_bh(&local->airtime[ac].lock); + v_t[ac] = local->airtime[ac].v_t; + wt[ac] = local->airtime[ac].weight_sum; + spin_unlock_bh(&local->airtime[ac].lock); + } + len = scnprintf(buf, sizeof(buf), + "\tVO VI BE BK\n" + "Virt-t\t%-10llu %-10llu %-10llu %-10llu\n" + "Weight\t%-10llu %-10llu %-10llu %-10llu\n", + v_t[0], + v_t[1], + v_t[2], + v_t[3], + wt[0], + wt[1], + wt[2], + wt[3]); + + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static const struct file_operations airtime_ops = { + .read = airtime_read, + .open = simple_open, + .llseek = default_llseek, +}; + #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -632,7 +672,11 @@ void debugfs_hw_add(struct ieee80211_local *local) if (local->ops->wake_tx_queue) DEBUGFS_ADD_MODE(aqm, 0600); - DEBUGFS_ADD_MODE(airtime_flags, 0600); + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { + DEBUGFS_ADD_MODE(airtime, 0600); + DEBUGFS_ADD_MODE(airtime_flags, 0600); + } DEBUGFS_ADD(aql_txq_limit); debugfs_create_u32("aql_threshold", 0600, diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f7aac8955681..db724fc10a5f 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -512,6 +512,34 @@ static ssize_t ieee80211_if_fmt_aqm( } IEEE80211_IF_FILE_R(aqm); +static ssize_t ieee80211_if_fmt_airtime( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_txq *txq = sdata->vif.txq; + struct airtime_info *air_info; + int len; + + if (!txq) + return 0; + + spin_lock_bh(&local->airtime[txq->ac].lock); + air_info = to_airtime_info(txq); + len = scnprintf(buf, + buflen, + "RX: %llu us\nTX: %llu us\nWeight: %u\n" + "Virt-T: %lld us\n", + air_info->rx_airtime, + air_info->tx_airtime, + air_info->weight, + air_info->v_t); + spin_unlock_bh(&local->airtime[txq->ac].lock); + + return len; +} + +IEEE80211_IF_FILE_R(airtime); + IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX); /* IBSS attributes */ @@ -657,8 +685,10 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) if (sdata->local->ops->wake_tx_queue && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_NAN) + sdata->vif.type != NL80211_IFTYPE_NAN) { DEBUGFS_ADD(aqm); + DEBUGFS_ADD(airtime); + } } static void add_sta_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 936c9dfa86c8..8be28cfd6f64 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -202,7 +202,7 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; - s64 deficit[IEEE80211_NUM_ACS]; + u64 v_t[IEEE80211_NUM_ACS]; ssize_t rv; int ac; @@ -210,18 +210,18 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; - deficit[ac] = sta->airtime[ac].deficit; - spin_unlock_bh(&local->active_txq_lock[ac]); + v_t[ac] = sta->airtime[ac].v_t; + spin_unlock_bh(&local->airtime[ac].lock); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" - "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", - rx_airtime, tx_airtime, sta->airtime_weight, - deficit[0], deficit[1], deficit[2], deficit[3]); + "Virt-T: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", + rx_airtime, tx_airtime, sta->airtime[0].weight, + v_t[0], v_t[1], v_t[2], v_t[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -236,11 +236,11 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); sta->airtime[ac].rx_airtime = 0; sta->airtime[ac].tx_airtime = 0; - sta->airtime[ac].deficit = sta->airtime_weight; - spin_unlock_bh(&local->active_txq_lock[ac]); + sta->airtime[ac].v_t = 0; + spin_unlock_bh(&local->airtime[ac].lock); } return count; @@ -263,10 +263,10 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf, return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); q_limit_l[ac] = sta->airtime[ac].aql_limit_low; q_limit_h[ac] = sta->airtime[ac].aql_limit_high; - spin_unlock_bh(&local->active_txq_lock[ac]); + spin_unlock_bh(&local->airtime[ac].lock); q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 17068408b27d..22549b95d1aa 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -831,20 +831,16 @@ enum txq_info_flags { * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin * @def_cvars: codel vars for @def_flow - * @frags: used to keep fragments created after dequeue * @schedule_order: used with ieee80211_local->active_txqs - * @schedule_round: counter to prevent infinite loops on TXQ scheduling + * @frags: used to keep fragments created after dequeue */ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; - - u16 schedule_round; - struct list_head schedule_order; + struct rb_node schedule_order; struct sk_buff_head frags; - unsigned long flags; /* keep last! */ @@ -921,6 +917,8 @@ struct ieee80211_sub_if_data { struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; + struct airtime_info airtime[IEEE80211_NUM_ACS]; + struct work_struct csa_finalize_work; bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; @@ -1133,6 +1131,44 @@ enum mac80211_scan_state { SCAN_ABORT, }; +/** + * struct airtime_sched_info - state used for airtime scheduling and AQL + * + * @lock: spinlock that protects all the fields in this struct + * @active_txqs: rbtree of currently backlogged queues, sorted by virtual time + * @schedule_pos: the current position maintained while a driver walks the tree + * with ieee80211_next_txq() + * @active_list: list of struct airtime_info structs that were active within + * the last AIRTIME_ACTIVE_DURATION (100 ms), used to compute + * weight_sum + * @last_weight_update: used for rate limiting walking active_list + * @last_schedule_time: tracks the last time a transmission was scheduled; used + * for catching up v_t if no stations are eligible for + * transmission. + * @v_t: global virtual time; queues with v_t < this are eligible for + * transmission + * @weight_sum: total sum of all active stations used for dividing airtime + * @weight_sum_reciprocal: reciprocal of weight_sum (to avoid divisions in fast + * path - see comment above + * IEEE80211_RECIPROCAL_DIVISOR_64) + * @aql_txq_limit_low: AQL limit when total outstanding airtime + * is < IEEE80211_AQL_THRESHOLD + * @aql_txq_limit_high: AQL limit when total outstanding airtime + * is > IEEE80211_AQL_THRESHOLD + */ +struct airtime_sched_info { + spinlock_t lock; + struct rb_root_cached active_txqs; + struct rb_node *schedule_pos; + struct list_head active_list; + u64 last_weight_update; + u64 last_schedule_activity; + u64 v_t; + u64 weight_sum; + u64 weight_sum_reciprocal; + u32 aql_txq_limit_low; + u32 aql_txq_limit_high; +}; DECLARE_STATIC_KEY_FALSE(aql_disable); struct ieee80211_local { @@ -1146,13 +1182,8 @@ struct ieee80211_local { struct codel_params cparams; /* protects active_txqs and txqi->schedule_order */ - spinlock_t active_txq_lock[IEEE80211_NUM_ACS]; - struct list_head active_txqs[IEEE80211_NUM_ACS]; - u16 schedule_round[IEEE80211_NUM_ACS]; - + struct airtime_sched_info airtime[IEEE80211_NUM_ACS]; u16 airtime_flags; - u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; - u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; u32 aql_threshold; atomic_t aql_total_pending_airtime; @@ -1566,6 +1597,125 @@ static inline bool txq_has_queue(struct ieee80211_txq *txq) return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); } +static inline struct airtime_info *to_airtime_info(struct ieee80211_txq *txq) +{ + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + + if (txq->sta) { + sta = container_of(txq->sta, struct sta_info, sta); + return &sta->airtime[txq->ac]; + } + + sdata = vif_to_sdata(txq->vif); + return &sdata->airtime[txq->ac]; +} + +/* To avoid divisions in the fast path, we keep pre-computed reciprocals for + * airtime weight calculations. There are two different weights to keep track + * of: The per-station weight and the sum of weights per phy. + * + * For the per-station weights (kept in airtime_info below), we use 32-bit + * reciprocals with a devisor of 2^19. This lets us keep the multiplications and + * divisions for the station weights as 32-bit operations at the cost of a bit + * of rounding error for high weights; but the choice of divisor keeps rounding + * errors <10% for weights <2^15, assuming no more than 8ms of airtime is + * reported at a time. + * + * For the per-phy sum of weights the values can get higher, so we use 64-bit + * operations for those with a 32-bit divisor, which should avoid any + * significant rounding errors. + */ +#define IEEE80211_RECIPROCAL_DIVISOR_64 0x100000000ULL +#define IEEE80211_RECIPROCAL_SHIFT_64 32 +#define IEEE80211_RECIPROCAL_DIVISOR_32 0x80000U +#define IEEE80211_RECIPROCAL_SHIFT_32 19 + +static inline void airtime_weight_set(struct airtime_info *air_info, u16 weight) +{ + if (air_info->weight == weight) + return; + + air_info->weight = weight; + if (weight) { + air_info->weight_reciprocal = + IEEE80211_RECIPROCAL_DIVISOR_32 / weight; + } else { + air_info->weight_reciprocal = 0; + } +} + +static inline void airtime_weight_sum_set(struct airtime_sched_info *air_sched, + int weight_sum) +{ + if (air_sched->weight_sum == weight_sum) + return; + + air_sched->weight_sum = weight_sum; + if (air_sched->weight_sum) { + air_sched->weight_sum_reciprocal = IEEE80211_RECIPROCAL_DIVISOR_64; + do_div(air_sched->weight_sum_reciprocal, air_sched->weight_sum); + } else { + air_sched->weight_sum_reciprocal = 0; + } +} + +/* A problem when trying to enforce airtime fairness is that we want to divide + * the airtime between the currently *active* stations. However, basing this on + * the instantaneous queue state of stations doesn't work, as queues tend to + * oscillate very quickly between empty and occupied, leading to the scheduler + * thinking only a single station is active when deciding whether to allow + * transmission (and thus not throttling correctly). + * + * To fix this we use a timer-based notion of activity: a station is considered + * active if it has been scheduled within the last 100 ms; we keep a separate + * list of all the stations considered active in this manner, and lazily update + * the total weight of active stations from this list (filtering the stations in + * the list by their 'last active' time). + * + * We add one additional safeguard to guard against stations that manage to get + * scheduled every 100 ms but don't transmit a lot of data, and thus don't use + * up any airtime. Such stations would be able to get priority for an extended + * period of time if they do start transmitting at full capacity again, and so + * we add an explicit maximum for how far behind a station is allowed to fall in + * the virtual airtime domain. This limit is set to a relatively high value of + * 20 ms because the main mechanism for catching up idle stations is the active + * state as described above; i.e., the hard limit should only be hit in + * pathological cases. + */ +#define AIRTIME_ACTIVE_DURATION (100 * NSEC_PER_MSEC) +#define AIRTIME_MAX_BEHIND 20000 /* 20 ms */ + +static inline bool airtime_is_active(struct airtime_info *air_info, u64 now) +{ + return air_info->last_scheduled >= now - AIRTIME_ACTIVE_DURATION; +} + +static inline void airtime_set_active(struct airtime_sched_info *air_sched, + struct airtime_info *air_info, u64 now) +{ + air_info->last_scheduled = now; + air_sched->last_schedule_activity = now; + list_move_tail(&air_info->list, &air_sched->active_list); +} + +static inline bool airtime_catchup_v_t(struct airtime_sched_info *air_sched, + u64 v_t, u64 now) +{ + air_sched->v_t = v_t; + return true; +} + +static inline void init_airtime_info(struct airtime_info *air_info, + struct airtime_sched_info *air_sched) +{ + atomic_set(&air_info->aql_tx_pending, 0); + air_info->aql_limit_low = air_sched->aql_txq_limit_low; + air_info->aql_limit_high = air_sched->aql_txq_limit_high; + airtime_weight_set(air_info, IEEE80211_DEFAULT_AIRTIME_WEIGHT); + INIT_LIST_HEAD(&air_info->list); +} + static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { return ether_addr_equal(raddr, addr) || @@ -1808,6 +1958,14 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); +void ieee80211_resort_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); +void ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge); +void ieee80211_update_airtime_weight(struct ieee80211_local *local, + struct airtime_sched_info *air_sched, + u64 now, bool force); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 68375ef56b4a..1e5e9fc45523 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1977,6 +1977,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, } } + for (i = 0; i < IEEE80211_NUM_ACS; i++) + init_airtime_info(&sdata->airtime[i], &local->airtime[i]); + ieee80211_set_default_queues(sdata); sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 95a8300da2d0..05f4c3c72619 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -705,10 +705,13 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, spin_lock_init(&local->queue_stop_reason_lock); for (i = 0; i < IEEE80211_NUM_ACS; i++) { - INIT_LIST_HEAD(&local->active_txqs[i]); - spin_lock_init(&local->active_txq_lock[i]); - local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; - local->aql_txq_limit_high[i] = + struct airtime_sched_info *air_sched = &local->airtime[i]; + + air_sched->active_txqs = RB_ROOT_CACHED; + INIT_LIST_HEAD(&air_sched->active_list); + spin_lock_init(&air_sched->lock); + air_sched->aql_txq_limit_low = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; + air_sched->aql_txq_limit_high = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a6400adf08bf..771921c057e8 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1578,12 +1578,8 @@ static void sta_ps_start(struct sta_info *sta) for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { struct ieee80211_txq *txq = sta->sta.txq[tid]; - struct txq_info *txqi = to_txq_info(txq); - spin_lock(&local->active_txq_lock[txq->ac]); - if (!list_empty(&txqi->schedule_order)) - list_del_init(&txqi->schedule_order); - spin_unlock(&local->active_txq_lock[txq->ac]); + ieee80211_unschedule_txq(&local->hw, txq, false); if (txq_has_queue(txq)) set_bit(tid, &sta->txq_buffered_tids); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index e18c3855f616..a5505ee51229 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -425,15 +425,11 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; - sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); - sta->airtime[i].deficit = sta->airtime_weight; - atomic_set(&sta->airtime[i].aql_tx_pending, 0); - sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; - sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; + init_airtime_info(&sta->airtime[i], &local->airtime[i]); } for (i = 0; i < IEEE80211_NUM_TIDS; i++) @@ -1892,24 +1888,59 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(ieee80211_sta_set_buffered); -void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, - u32 tx_airtime, u32 rx_airtime) +void ieee80211_register_airtime(struct ieee80211_txq *txq, + u32 tx_airtime, u32 rx_airtime) { - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - struct ieee80211_local *local = sta->sdata->local; - u8 ac = ieee80211_ac_from_tid(tid); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct ieee80211_local *local = sdata->local; + u64 weight_sum, weight_sum_reciprocal; + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; u32 airtime = 0; - if (sta->local->airtime_flags & AIRTIME_USE_TX) + air_sched = &local->airtime[txq->ac]; + air_info = to_airtime_info(txq); + + if (local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; - if (sta->local->airtime_flags & AIRTIME_USE_RX) + if (local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; - spin_lock_bh(&local->active_txq_lock[ac]); - sta->airtime[ac].tx_airtime += tx_airtime; - sta->airtime[ac].rx_airtime += rx_airtime; - sta->airtime[ac].deficit -= airtime; - spin_unlock_bh(&local->active_txq_lock[ac]); + /* Weights scale so the unit weight is 256 */ + airtime <<= 8; + + spin_lock_bh(&air_sched->lock); + + air_info->tx_airtime += tx_airtime; + air_info->rx_airtime += rx_airtime; + + if (air_sched->weight_sum) { + weight_sum = air_sched->weight_sum; + weight_sum_reciprocal = air_sched->weight_sum_reciprocal; + } else { + weight_sum = air_info->weight; + weight_sum_reciprocal = air_info->weight_reciprocal; + } + + /* Round the calculation of global vt */ + air_sched->v_t += (u64)((airtime + (weight_sum >> 1)) * + weight_sum_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_64; + air_info->v_t += (u32)((airtime + (air_info->weight >> 1)) * + air_info->weight_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_32; + ieee80211_resort_txq(&local->hw, txq); + + spin_unlock_bh(&air_sched->lock); +} + +void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, + u32 tx_airtime, u32 rx_airtime) +{ + struct ieee80211_txq *txq = pubsta->txq[tid]; + + if (!txq) + return; + + ieee80211_register_airtime(txq, tx_airtime, rx_airtime); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); @@ -2353,7 +2384,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { - sinfo->airtime_weight = sta->airtime_weight; + sinfo->airtime_weight = sta->airtime[0].weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0333072ebd98..ba2796782008 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -135,18 +135,25 @@ enum ieee80211_agg_stop_reason { #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) + struct airtime_info { u64 rx_airtime; u64 tx_airtime; - s64 deficit; + u64 v_t; + u64 last_scheduled; + struct list_head list; atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ u32 aql_limit_low; u32 aql_limit_high; + u32 weight_reciprocal; + u16 weight; }; void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed); +void ieee80211_register_airtime(struct ieee80211_txq *txq, + u32 tx_airtime, u32 rx_airtime); struct sta_info; @@ -515,7 +522,6 @@ struct ieee80211_fragment_cache { * @tid_seq: per-TID sequence numbers for sending to this STA * @airtime: per-AC struct airtime_info describing airtime statistics for this * station - * @airtime_weight: station weight for airtime fairness calculation purposes * @ampdu_mlme: A-MPDU state machine state * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry @@ -646,7 +652,6 @@ struct sta_info { u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; struct airtime_info airtime[IEEE80211_NUM_ACS]; - u16 airtime_weight; /* * Aggregation information, locked with lock. diff --git a/net/mac80211/status.c b/net/mac80211/status.c index b6ef96a25eac..bae321ff77f6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -970,6 +970,25 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); + } else if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { + struct ieee80211_sub_if_data *sdata; + struct ieee80211_txq *txq; + u32 airtime; + + /* Account airtime to multicast queue */ + sdata = ieee80211_sdata_from_skb(local, skb); + + if (sdata && (txq = sdata->vif.txq)) { + airtime = info->status.tx_time ?: + ieee80211_calc_expected_tx_airtime(hw, + &sdata->vif, + NULL, + skb->len, + false); + + ieee80211_register_airtime(txq, airtime, 0); + } } /* SNMP counters diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index caa7caa89ab9..e96981144358 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1449,7 +1450,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); - INIT_LIST_HEAD(&txqi->schedule_order); + RB_CLEAR_NODE(&txqi->schedule_order); txqi->txq.vif = &sdata->vif; @@ -1493,9 +1494,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, ieee80211_purge_tx_queue(&local->hw, &txqi->frags); spin_unlock_bh(&fq->lock); - spin_lock_bh(&local->active_txq_lock[txqi->txq.ac]); - list_del_init(&txqi->schedule_order); - spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); + ieee80211_unschedule_txq(&local->hw, &txqi->txq, true); } void ieee80211_txq_set_params(struct ieee80211_local *local) @@ -3783,102 +3782,259 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue); struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct airtime_sched_info *air_sched; + u64 now = ktime_get_boottime_ns(); struct ieee80211_txq *ret = NULL; - struct txq_info *txqi = NULL, *head = NULL; - bool found_eligible_txq = false; + struct airtime_info *air_info; + struct txq_info *txqi = NULL; + struct rb_node *node; + bool first = false; - spin_lock_bh(&local->active_txq_lock[ac]); + air_sched = &local->airtime[ac]; + spin_lock_bh(&air_sched->lock); - begin: - txqi = list_first_entry_or_null(&local->active_txqs[ac], - struct txq_info, - schedule_order); - if (!txqi) + node = air_sched->schedule_pos; + +begin: + if (!node) { + node = rb_first_cached(&air_sched->active_txqs); + first = true; + } else { + node = rb_next(node); + } + + if (!node) goto out; - if (txqi == head) { - if (!found_eligible_txq) - goto out; - else - found_eligible_txq = false; + txqi = container_of(node, struct txq_info, schedule_order); + air_info = to_airtime_info(&txqi->txq); + + if (air_info->v_t > air_sched->v_t && + (!first || !airtime_catchup_v_t(air_sched, air_info->v_t, now))) + goto out; + + if (!ieee80211_txq_airtime_check(hw, &txqi->txq)) { + first = false; + goto begin; } - if (!head) - head = txqi; + air_sched->schedule_pos = node; + air_sched->last_schedule_activity = now; + ret = &txqi->txq; +out: + spin_unlock_bh(&air_sched->lock); + return ret; +} +EXPORT_SYMBOL(ieee80211_next_txq); - if (txqi->txq.sta) { - struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); - bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); - s64 deficit = sta->airtime[txqi->txq.ac].deficit; +static void __ieee80211_insert_txq(struct rb_root_cached *root, + struct txq_info *txqi) +{ + struct rb_node **new = &root->rb_root.rb_node; + struct airtime_info *old_air, *new_air; + struct rb_node *parent = NULL; + struct txq_info *__txqi; + bool leftmost = true; + + while (*new) { + parent = *new; + __txqi = rb_entry(parent, struct txq_info, schedule_order); + old_air = to_airtime_info(&__txqi->txq); + new_air = to_airtime_info(&txqi->txq); + + if (new_air->v_t <= old_air->v_t) { + new = &parent->rb_left; + } else { + new = &parent->rb_right; + leftmost = false; + } + } - if (aql_check) - found_eligible_txq = true; + rb_link_node(&txqi->schedule_order, parent, new); + rb_insert_color_cached(&txqi->schedule_order, root, leftmost); +} - if (deficit < 0) - sta->airtime[txqi->txq.ac].deficit += - sta->airtime_weight; +void ieee80211_resort_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct airtime_info *air_info = to_airtime_info(txq); + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; - if (deficit < 0 || !aql_check) { - list_move_tail(&txqi->schedule_order, - &local->active_txqs[txqi->txq.ac]); - goto begin; + air_sched = &local->airtime[txq->ac]; + + lockdep_assert_held(&air_sched->lock); + + if (!RB_EMPTY_NODE(&txqi->schedule_order)) { + struct airtime_info *a_prev = NULL, *a_next = NULL; + struct txq_info *t_prev, *t_next; + struct rb_node *n_prev, *n_next; + + /* Erasing a node can cause an expensive rebalancing operation, + * so we check the previous and next nodes first and only remove + * and re-insert if the current node is not already in the + * correct position. + */ + if ((n_prev = rb_prev(&txqi->schedule_order)) != NULL) { + t_prev = container_of(n_prev, struct txq_info, + schedule_order); + a_prev = to_airtime_info(&t_prev->txq); } + + if ((n_next = rb_next(&txqi->schedule_order)) != NULL) { + t_next = container_of(n_next, struct txq_info, + schedule_order); + a_next = to_airtime_info(&t_next->txq); + } + + if ((!a_prev || a_prev->v_t <= air_info->v_t) && + (!a_next || a_next->v_t > air_info->v_t)) + return; + + if (air_sched->schedule_pos == &txqi->schedule_order) + air_sched->schedule_pos = n_prev; + + rb_erase_cached(&txqi->schedule_order, + &air_sched->active_txqs); + RB_CLEAR_NODE(&txqi->schedule_order); + __ieee80211_insert_txq(&air_sched->active_txqs, txqi); } +} +void ieee80211_update_airtime_weight(struct ieee80211_local *local, + struct airtime_sched_info *air_sched, + u64 now, bool force) +{ + struct airtime_info *air_info, *tmp; + u64 weight_sum = 0; + + if (unlikely(!now)) + now = ktime_get_boottime_ns(); - if (txqi->schedule_round == local->schedule_round[ac]) + lockdep_assert_held(&air_sched->lock); + + if (!force && (air_sched->last_weight_update < + now - AIRTIME_ACTIVE_DURATION)) + return; + + list_for_each_entry_safe(air_info, tmp, + &air_sched->active_list, list) { + if (airtime_is_active(air_info, now)) + weight_sum += air_info->weight; + else + list_del_init(&air_info->list); + } + airtime_weight_sum_set(air_sched, weight_sum); + air_sched->last_weight_update = now; +} + +void ieee80211_schedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) + __acquires(txq_lock) __releases(txq_lock) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; + u64 now = ktime_get_boottime_ns(); + struct airtime_info *air_info; + u8 ac = txq->ac; + bool was_active; + + air_sched = &local->airtime[ac]; + air_info = to_airtime_info(txq); + + spin_lock_bh(&air_sched->lock); + was_active = airtime_is_active(air_info, now); + airtime_set_active(air_sched, air_info, now); + + if (!RB_EMPTY_NODE(&txqi->schedule_order)) goto out; - list_del_init(&txqi->schedule_order); - txqi->schedule_round = local->schedule_round[ac]; - ret = &txqi->txq; + /* If the station has been inactive for a while, catch up its v_t so it + * doesn't get indefinite priority; see comment above the definition of + * AIRTIME_MAX_BEHIND. + */ + if ((!was_active && air_info->v_t < air_sched->v_t) || + air_info->v_t < air_sched->v_t - AIRTIME_MAX_BEHIND) + air_info->v_t = air_sched->v_t; + + ieee80211_update_airtime_weight(local, air_sched, now, !was_active); + __ieee80211_insert_txq(&air_sched->active_txqs, txqi); out: - spin_unlock_bh(&local->active_txq_lock[ac]); - return ret; + spin_unlock_bh(&air_sched->lock); } -EXPORT_SYMBOL(ieee80211_next_txq); +EXPORT_SYMBOL(ieee80211_schedule_txq); -void __ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq, - bool force) +static void __ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; - spin_lock_bh(&local->active_txq_lock[txq->ac]); - - if (list_empty(&txqi->schedule_order) && - (force || !skb_queue_empty(&txqi->frags) || - txqi->tin.backlog_packets)) { - /* If airtime accounting is active, always enqueue STAs at the - * head of the list to ensure that they only get moved to the - * back by the airtime DRR scheduler once they have a negative - * deficit. A station that already has a negative deficit will - * get immediately moved to the back of the list on the next - * call to ieee80211_next_txq(). - */ - if (txqi->txq.sta && local->airtime_flags && - wiphy_ext_feature_isset(local->hw.wiphy, - NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) - list_add(&txqi->schedule_order, - &local->active_txqs[txq->ac]); - else - list_add_tail(&txqi->schedule_order, - &local->active_txqs[txq->ac]); + air_sched = &local->airtime[txq->ac]; + air_info = to_airtime_info(&txqi->txq); + + lockdep_assert_held(&air_sched->lock); + + if (purge) { + list_del_init(&air_info->list); + ieee80211_update_airtime_weight(local, air_sched, 0, true); } - spin_unlock_bh(&local->active_txq_lock[txq->ac]); + if (RB_EMPTY_NODE(&txqi->schedule_order)) + return; + + if (air_sched->schedule_pos == &txqi->schedule_order) + air_sched->schedule_pos = rb_prev(&txqi->schedule_order); + + if (!purge) + airtime_set_active(air_sched, air_info, + ktime_get_boottime_ns()); + + rb_erase_cached(&txqi->schedule_order, + &air_sched->active_txqs); + RB_CLEAR_NODE(&txqi->schedule_order); +} + +void ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge) + __acquires(txq_lock) __releases(txq_lock) +{ + struct ieee80211_local *local = hw_to_local(hw); + + spin_lock_bh(&local->airtime[txq->ac].lock); + __ieee80211_unschedule_txq(hw, txq, purge); + spin_unlock_bh(&local->airtime[txq->ac].lock); +} + +void ieee80211_return_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, bool force) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + + spin_lock_bh(&local->airtime[txq->ac].lock); + + if (!RB_EMPTY_NODE(&txqi->schedule_order) && !force && + !txq_has_queue(txq)) + __ieee80211_unschedule_txq(hw, txq, false); + + spin_unlock_bh(&local->airtime[txq->ac].lock); } -EXPORT_SYMBOL(__ieee80211_schedule_txq); +EXPORT_SYMBOL(ieee80211_return_txq); DEFINE_STATIC_KEY_FALSE(aql_disable); bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { - struct sta_info *sta; + struct airtime_info *air_info = to_airtime_info(txq); struct ieee80211_local *local = hw_to_local(hw); if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) @@ -3893,15 +4049,12 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, if (unlikely(txq->tid == IEEE80211_NUM_TIDS)) return true; - sta = container_of(txq->sta, struct sta_info, sta); - if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < - sta->airtime[txq->ac].aql_limit_low) + if (atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_low) return true; if (atomic_read(&local->aql_total_pending_airtime) < local->aql_threshold && - atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < - sta->airtime[txq->ac].aql_limit_high) + atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_high) return true; return false; @@ -3911,60 +4064,59 @@ EXPORT_SYMBOL(ieee80211_txq_airtime_check); bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { + struct txq_info *first_txqi = NULL, *txqi = to_txq_info(txq); struct ieee80211_local *local = hw_to_local(hw); - struct txq_info *iter, *tmp, *txqi = to_txq_info(txq); - struct sta_info *sta; - u8 ac = txq->ac; + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; + struct rb_node *node = NULL; + bool ret = false; + u64 now; - spin_lock_bh(&local->active_txq_lock[ac]); - if (!txqi->txq.sta) - goto out; + if (!ieee80211_txq_airtime_check(hw, txq)) + return false; + + air_sched = &local->airtime[txq->ac]; + spin_lock_bh(&air_sched->lock); - if (list_empty(&txqi->schedule_order)) + if (RB_EMPTY_NODE(&txqi->schedule_order)) goto out; - list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac], - schedule_order) { - if (iter == txqi) - break; + now = ktime_get_boottime_ns(); - if (!iter->txq.sta) { - list_move_tail(&iter->schedule_order, - &local->active_txqs[ac]); - continue; - } - sta = container_of(iter->txq.sta, struct sta_info, sta); - if (sta->airtime[ac].deficit < 0) - sta->airtime[ac].deficit += sta->airtime_weight; - list_move_tail(&iter->schedule_order, &local->active_txqs[ac]); - } + /* Like in ieee80211_next_txq(), make sure the first station in the + * scheduling order is eligible for transmission to avoid starvation. + */ + node = rb_first_cached(&air_sched->active_txqs); + if (node) { + first_txqi = container_of(node, struct txq_info, + schedule_order); + air_info = to_airtime_info(&first_txqi->txq); - sta = container_of(txqi->txq.sta, struct sta_info, sta); - if (sta->airtime[ac].deficit >= 0) - goto out; + if (air_sched->v_t < air_info->v_t) + airtime_catchup_v_t(air_sched, air_info->v_t, now); + } - sta->airtime[ac].deficit += sta->airtime_weight; - list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]); - spin_unlock_bh(&local->active_txq_lock[ac]); + air_info = to_airtime_info(&txqi->txq); + if (air_info->v_t <= air_sched->v_t) { + air_sched->last_schedule_activity = now; + ret = true; + } - return false; out: - if (!list_empty(&txqi->schedule_order)) - list_del_init(&txqi->schedule_order); - spin_unlock_bh(&local->active_txq_lock[ac]); - - return true; + spin_unlock_bh(&air_sched->lock); + return ret; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct airtime_sched_info *air_sched = &local->airtime[ac]; - spin_lock_bh(&local->active_txq_lock[ac]); - local->schedule_round[ac]++; - spin_unlock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&air_sched->lock); + air_sched->schedule_pos = NULL; + spin_unlock_bh(&air_sched->lock); } EXPORT_SYMBOL(ieee80211_txq_schedule_start); -- cgit v1.2.3-58-ga151 From c4fef01ba4793a85b2d38a472bddd1e3b56d9585 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 22 Jun 2021 14:49:56 +0800 Subject: net: sched: implement TCQ_F_CAN_BYPASS for lockless qdisc Currently pfifo_fast has both TCQ_F_CAN_BYPASS and TCQ_F_NOLOCK flag set, but queue discipline by-pass does not work for lockless qdisc because skb is always enqueued to qdisc even when the qdisc is empty, see __dev_xmit_skb(). This patch calls sch_direct_xmit() to transmit the skb directly to the driver for empty lockless qdisc, which aviod enqueuing and dequeuing operation. As qdisc->empty is not reliable to indicate a empty qdisc because there is a time window between enqueuing and setting qdisc->empty. So we use the MISSED state added in commit a90c57f2cedd ("net: sched: fix packet stuck problem for lockless qdisc"), which indicate there is lock contention, suggesting that it is better not to do the qdisc bypass in order to avoid packet out of order problem. In order to make MISSED state reliable to indicate a empty qdisc, we need to ensure that testing and clearing of MISSED state is within the protection of qdisc->seqlock, only setting MISSED state can be done without the protection of qdisc->seqlock. A MISSED state testing is added without the protection of qdisc->seqlock to aviod doing unnecessary spin_trylock() for contention case. As the enqueuing is not within the protection of qdisc->seqlock, there is still a potential data race as mentioned by Jakub [1]: thread1 thread2 thread3 qdisc_run_begin() # true qdisc_run_begin(q) set(MISSED) pfifo_fast_dequeue clear(MISSED) # recheck the queue qdisc_run_end() enqueue skb1 qdisc empty # true qdisc_run_begin() # true sch_direct_xmit() # skb2 qdisc_run_begin() set(MISSED) When above happens, skb1 enqueued by thread2 is transmited after skb2 is transmited by thread3 because MISSED state setting and enqueuing is not under the qdisc->seqlock. If qdisc bypass is disabled, skb1 has better chance to be transmited quicker than skb2. This patch does not take care of the above data race, because we view this as similar as below: Even at the same time CPU1 and CPU2 write the skb to two socket which both heading to the same qdisc, there is no guarantee that which skb will hit the qdisc first, because there is a lot of factor like interrupt/softirq/cache miss/scheduling afffecting that. There are below cases that need special handling: 1. When MISSED state is cleared before another round of dequeuing in pfifo_fast_dequeue(), and __qdisc_run() might not be able to dequeue all skb in one round and call __netif_schedule(), which might result in a non-empty qdisc without MISSED set. In order to avoid this, the MISSED state is set for lockless qdisc and __netif_schedule() will be called at the end of qdisc_run_end. 2. The MISSED state also need to be set for lockless qdisc instead of calling __netif_schedule() directly when requeuing a skb for a similar reason. 3. For netdev queue stopped case, the MISSED case need clearing while the netdev queue is stopped, otherwise there may be unnecessary __netif_schedule() calling. So a new DRAINING state is added to indicate this case, which also indicate a non-empty qdisc. 4. As there is already netif_xmit_frozen_or_stopped() checking in dequeue_skb() and sch_direct_xmit(), which are both within the protection of qdisc->seqlock, but the same checking in __dev_xmit_skb() is without the protection, which might cause empty indication of a lockless qdisc to be not reliable. So remove the checking in __dev_xmit_skb(), and the checking in the protection of qdisc->seqlock seems enough to avoid the cpu consumption problem for netdev queue stopped case. 1. https://lkml.org/lkml/2021/5/29/215 Acked-by: Jakub Kicinski Tested-by: Vladimir Oltean # flexcan Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- include/net/sch_generic.h | 16 +++++++++++++--- net/core/dev.c | 27 +++++++++++++++++++++++++-- net/sched/sch_generic.c | 20 ++++++++++++++++---- 3 files changed, 54 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3ed6bcc4be72..177f240d59d4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -37,8 +37,15 @@ enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, __QDISC_STATE_MISSED, + __QDISC_STATE_DRAINING, }; +#define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) +#define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) + +#define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ + QDISC_STATE_DRAINING) + struct qdisc_size_table { struct rcu_head rcu; struct list_head list; @@ -145,6 +152,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } +static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) +{ + return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); +} + static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; @@ -206,10 +218,8 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) spin_unlock(&qdisc->seqlock); if (unlikely(test_bit(__QDISC_STATE_MISSED, - &qdisc->state))) { - clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + &qdisc->state))) __netif_schedule(qdisc); - } } else { write_seqcount_end(&qdisc->running); } diff --git a/net/core/dev.c b/net/core/dev.c index 50531a2d0b20..991d09b67bd9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3852,10 +3852,33 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { + if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && + qdisc_run_begin(q)) { + /* Retest nolock_qdisc_is_empty() within the protection + * of q->seqlock to protect from racing with requeuing. + */ + if (unlikely(!nolock_qdisc_is_empty(q))) { + rc = q->enqueue(skb, q, &to_free) & + NET_XMIT_MASK; + __qdisc_run(q); + qdisc_run_end(q); + + goto no_lock_out; + } + + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && + !nolock_qdisc_is_empty(q)) + __qdisc_run(q); + + qdisc_run_end(q); + return NET_XMIT_SUCCESS; + } + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - if (likely(!netif_xmit_frozen_or_stopped(txq))) - qdisc_run(q); + qdisc_run(q); +no_lock_out: if (unlikely(to_free)) kfree_skb_list(to_free); return rc; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e9c0afc8becc..9984ccc45946 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -52,6 +52,8 @@ static void qdisc_maybe_clear_missed(struct Qdisc *q, */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); + else + set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ @@ -164,9 +166,13 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) skb = next; } - if (lock) + + if (lock) { spin_unlock(lock); - __netif_schedule(q); + set_bit(__QDISC_STATE_MISSED, &q->state); + } else { + __netif_schedule(q); + } } static void try_bulk_dequeue_skb(struct Qdisc *q, @@ -409,7 +415,11 @@ void __qdisc_run(struct Qdisc *q) while (qdisc_restart(q, &packets)) { quota -= packets; if (quota <= 0) { - __netif_schedule(q); + if (q->flags & TCQ_F_NOLOCK) + set_bit(__QDISC_STATE_MISSED, &q->state); + else + __netif_schedule(q); + break; } } @@ -698,13 +708,14 @@ retry: if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && - test_bit(__QDISC_STATE_MISSED, &qdisc->state)) { + READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). */ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. @@ -1222,6 +1233,7 @@ static void dev_reset_queue(struct net_device *dev, spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } -- cgit v1.2.3-58-ga151 From d3e0f57501bde8a9585aff79afcffd99e6a5d91c Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 22 Jun 2021 14:49:57 +0800 Subject: net: sched: remove qdisc->empty for lockless qdisc As MISSED and DRAINING state are used to indicate a non-empty qdisc, qdisc->empty is not longer needed, so remove it. Acked-by: Jakub Kicinski Tested-by: Vladimir Oltean # flexcan Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- include/net/sch_generic.h | 13 +++---------- net/sched/sch_generic.c | 3 --- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 177f240d59d4..c99ffe9cc88f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,8 +117,6 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; - /* for NOLOCK qdisc, true if there are no enqueued skbs */ - bool empty; struct rcu_head rcu; /* private data */ @@ -165,7 +163,7 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) - return READ_ONCE(qdisc->empty); + return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } @@ -173,7 +171,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (spin_trylock(&qdisc->seqlock)) - goto nolock_empty; + return true; /* If the MISSED flag is set, it means other thread has * set the MISSED flag before second spin_trylock(), so @@ -195,12 +193,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) /* Retry again in case other CPU may not see the new flag * after it releases the lock at the end of qdisc_run_end(). */ - if (!spin_trylock(&qdisc->seqlock)) - return false; - -nolock_empty: - WRITE_ONCE(qdisc->empty, false); - return true; + return spin_trylock(&qdisc->seqlock); } else if (qdisc_is_running(qdisc)) { return false; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9984ccc45946..d9ac60ffe927 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -725,8 +725,6 @@ retry: need_retry = false; goto retry; - } else { - WRITE_ONCE(qdisc->empty, true); } return skb; @@ -927,7 +925,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - sch->empty = true; dev_hold(dev); refcount_set(&sch->refcnt, 1); -- cgit v1.2.3-58-ga151 From 55d444b310c64b084dcc62ba3e4dc3862269fb96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 23 Jun 2021 08:35:29 +0900 Subject: tcp: Add stats for socket migration. This commit adds two stats for the socket migration feature to evaluate the effectiveness: LINUX_MIB_TCPMIGRATEREQ(SUCCESS|FAILURE). If the migration fails because of the own_req race in receiving ACK and sending SYN+ACK paths, we do not increment the failure stat. Then another CPU is responsible for the req. Link: https://lore.kernel.org/bpf/CAK6E8=cgFKuGecTzSCSQ8z3YJ_163C0uwO9yRvfDSE7vOe9mJA@mail.gmail.com/ Suggested-by: Yuchung Cheng Signed-off-by: Kuniyuki Iwashima Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ net/core/sock_reuseport.c | 15 +++++++++++---- net/ipv4/inet_connection_sock.c | 15 +++++++++++++-- net/ipv4/proc.c | 2 ++ net/ipv4/tcp_minisocks.c | 3 +++ 5 files changed, 31 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 26fc60ce9298..904909d020e2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -290,6 +290,8 @@ enum LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ + LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ + LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ __LINUX_MIB_MAX }; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index de5ee3ae86d5..3f00a28fe762 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -6,6 +6,7 @@ * selecting the socket index from the array of available sockets. */ +#include #include #include #include @@ -536,7 +537,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk, socks = READ_ONCE(reuse->num_socks); if (unlikely(!socks)) - goto out; + goto failure; /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); @@ -546,13 +547,13 @@ struct sock *reuseport_migrate_sock(struct sock *sk, if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) goto select_by_hash; - goto out; + goto failure; } if (!skb) { skb = alloc_skb(0, GFP_ATOMIC); if (!skb) - goto out; + goto failure; allocated = true; } @@ -565,12 +566,18 @@ select_by_hash: if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); - if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { nsk = NULL; + goto failure; + } out: rcu_read_unlock(); return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; } EXPORT_SYMBOL(reuseport_migrate_sock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0eea878edc30..754013fa393b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -703,6 +703,8 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; @@ -876,9 +878,10 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); - goto drop; + goto no_ownership; } + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); @@ -887,17 +890,19 @@ static void reqsk_timer_handler(struct timer_list *t) return; } -drop: /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); +no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } +drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } @@ -1135,11 +1140,13 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { @@ -1188,8 +1195,12 @@ void inet_csk_listen_stop(struct sock *sk) refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6d46297a99f8..b0d3a09dc84e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -295,6 +295,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), + SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f258a4c0da71..0a4f3f16140a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -786,6 +786,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; -- cgit v1.2.3-58-ga151 From bcc3f2a829b9edbe3da5fb117ee5a63686d31834 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jun 2021 08:27:00 -0700 Subject: ipv6: exthdrs: do not blindly use init_net I see no reason why max_dst_opts_cnt and max_hbh_opts_cnt are fetched from the initial net namespace. The other sysctls (max_dst_opts_len & max_hbh_opts_len) are in fact already using the current ns. Note: it is not clear why ipv6_destopt_rcv() use two ways to get to the netns : 1) dev_net(dst->dev) Originally used to increment IPSTATS_MIB_INHDRERRORS 2) dev_net(skb->dev) Tom used this variant in his patch. Maybe this calls to use ipv6_skb_net() instead ? Fixes: 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options") Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Coco Li Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 56e479d158b7..6f7da8f3e2e5 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -306,7 +306,7 @@ fail_and_free: #endif if (ip6_parse_tlv(tlvprocdestopt_lst, skb, - init_net.ipv6.sysctl.max_dst_opts_cnt)) { + net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -1037,7 +1037,7 @@ fail_and_free: opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(tlvprochopopt_lst, skb, - init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); -- cgit v1.2.3-58-ga151 From 1321ed5e76488cfd7a5d3ee83254be9b7c1cc581 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 23 Jun 2021 16:43:13 +0300 Subject: devlink: Decrease refcnt of parent rate object on leaf destroy Port functions, like SFs, can be deleted by the user when its leaf rate object has parent node. In such case node refcnt won't be decreased which blocks the node from deletion later. Do simple refcnt decrease, since driver in cleanup stage. This: 1) assumes that driver took proper internal parent unset action; 2) allows to avoid nested callbacks call and deadlock. Fixes: d75559845078 ("devlink: Allow setting parent node of rate objects") Signed-off-by: Dmytro Linkin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 566ddd147633..ba27395d8fb3 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9275,6 +9275,8 @@ void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) mutex_lock(&devlink->lock); devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + if (devlink_rate->parent) + refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; mutex_unlock(&devlink->lock); -- cgit v1.2.3-58-ga151 From ff99324ded0176d28c3d8de7cac44580cf79d52a Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 23 Jun 2021 16:43:14 +0300 Subject: devlink: Remove eswitch mode check for mode set call When eswitch is disabled, querying its current mode results in error. Due to this when trying to set the eswitch mode for mlx5 devices, it fails to set the eswitch switchdev mode. Hence remove such check. Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Dmytro Linkin Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index ba27395d8fb3..153d432f6daf 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2709,17 +2709,6 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; - u16 old_mode; - int err; - - if (!devlink->ops->eswitch_mode_get) - return -EOPNOTSUPP; - err = devlink->ops->eswitch_mode_get(devlink, &old_mode); - if (err) - return err; - - if (old_mode == mode) - return 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { -- cgit v1.2.3-58-ga151 From a3e5e5797faad0db319d106afaa31b9020fac44f Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 23 Jun 2021 16:43:15 +0300 Subject: devlink: Protect rate list with lock while switching modes Devlink eswitch set command doesn't hold devlink->lock, which makes possible race condition between rate list traversing and others devlink rate KAPI calls, like devlink_rate_nodes_destroy(). Hold devlink lock while traversing the list. Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Dmytro Linkin Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 153d432f6daf..8fdd04f00fd7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2710,11 +2710,15 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, { struct devlink_rate *devlink_rate; + /* Take the lock to sync with devlink_rate_nodes_destroy() */ + mutex_lock(&devlink->lock); list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { + mutex_unlock(&devlink->lock); NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); return -EBUSY; } + mutex_unlock(&devlink->lock); return 0; } -- cgit v1.2.3-58-ga151 From aaf473d0100f64abc88560e2bea905805bcf2a8e Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Sun, 20 Jun 2021 14:38:42 +0200 Subject: can: j1939: j1939_sk_setsockopt(): prevent allocation of j1939 filter for optlen == 0 If optval != NULL and optlen == 0 are specified for SO_J1939_FILTER in j1939_sk_setsockopt(), memdup_sockptr() will return ZERO_PTR for 0 size allocation. The new filter will be mistakenly assigned ZERO_PTR. This patch checks for optlen != 0 and filter will be assigned NULL in case of optlen == 0. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/r/20210620123842.117975-1-nslusarek@gmx.net Signed-off-by: Norbert Slusarek Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index fce8bc8afeb7..e1a399821238 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -676,7 +676,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (!sockptr_is_null(optval)) { + if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; -- cgit v1.2.3-58-ga151 From ba47396e1c042619f1c038ad19493aef737677f5 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 16 Jun 2021 17:09:50 -0700 Subject: Revert "bpf: Check for BPF_F_ADJ_ROOM_FIXED_GSO when bpf_skb_change_proto" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e. See the followup commit for the reasoning why I believe the appropriate approach is to simply make this change without a flag, but it can basically be summarized as using this helper without the flag is bug-prone or outright buggy, and thus the default should be this new behaviour. As this commit has only made it into net-next/master, but not into any real release, such a backwards incompatible change is still ok. Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210617000953.2787453-1-zenczykowski@gmail.com --- net/core/filter.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0b13d8157a8f..243abf519efd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3235,7 +3235,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) return ret; } -static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3264,9 +3264,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) } /* Due to IPv6 header, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - + skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3278,7 +3276,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); @@ -3307,9 +3305,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) } /* Due to IPv4 header, MSS can be upgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_increase_gso_size(shinfo, len_diff); - + skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3321,17 +3317,17 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb, u64 flags) return 0; } -static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto, u64 flags) +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) - return bpf_skb_proto_4_to_6(skb, flags); + return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) - return bpf_skb_proto_6_to_4(skb, flags); + return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } @@ -3341,7 +3337,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, { int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO))) + if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork @@ -3361,7 +3357,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ - ret = bpf_skb_proto_xlat(skb, proto, flags); + ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } -- cgit v1.2.3-58-ga151 From 364745fbe981a4370f50274475da4675661104df Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 16 Jun 2021 17:09:51 -0700 Subject: bpf: Do not change gso_size during bpf_skb_change_proto() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is technically a backwards incompatible change in behaviour, but I'm going to argue that it is very unlikely to break things, and likely to fix *far* more then it breaks. In no particular order, various reasons follow: (a) I've long had a bug assigned to myself to debug a super rare kernel crash on Android Pixel phones which can (per stacktrace) be traced back to BPF clat IPv6 to IPv4 protocol conversion causing some sort of ugly failure much later on during transmit deep in the GSO engine, AFAICT precisely because of this change to gso_size, though I've never been able to manually reproduce it. I believe it may be related to the particular network offload support of attached USB ethernet dongle being used for tethering off of an IPv6-only cellular connection. The reason might be we end up with more segments than max permitted, or with a GSO packet with only one segment... (either way we break some assumption and hit a BUG_ON) (b) There is no check that the gso_size is > 20 when reducing it by 20, so we might end up with a negative (or underflowing) gso_size or a gso_size of 0. This can't possibly be good. Indeed this is probably somehow exploitable (or at least can result in a kernel crash) by delivering crafted packets and perhaps triggering an infinite loop or a divide by zero... As a reminder: gso_size (MSS) is related to MTU, but not directly derived from it: gso_size/MSS may be significantly smaller then one would get by deriving from local MTU. And on some NICs (which do loose MTU checking on receive, it may even potentially be larger, for example my work pc with 1500 MTU can receive 1520 byte frames [and sometimes does due to bugs in a vendor plat46 implementation]). Indeed even just going from 21 to 1 is potentially problematic because it increases the number of segments by a factor of 21 (think DoS, or some other crash due to too many segments). (c) It's always safe to not increase the gso_size, because it doesn't result in the max packet size increasing. So the skb_increase_gso_size() call was always unnecessary for correctness (and outright undesirable, see later). As such the only part which is potentially dangerous (ie. could cause backwards compatibility issues) is the removal of the skb_decrease_gso_size() call. (d) If the packets are ultimately destined to the local device, then there is absolutely no benefit to playing around with gso_size. It only matters if the packets will egress the device. ie. we're either forwarding, or transmitting from the device. (e) This logic only triggers for packets which are GSO. It does not trigger for skbs which are not GSO. It will not convert a non-GSO MTU sized packet into a GSO packet (and you don't even know what the MTU is, so you can't even fix it). As such your transmit path must *already* be able to handle an MTU 20 bytes larger then your receive path (for IPv4 to IPv6 translation) - and indeed 28 bytes larger due to IPv4 fragments. Thus removing the skb_decrease_gso_size() call doesn't actually increase the size of the packets your transmit side must be able to handle. ie. to handle non-GSO max-MTU packets, the IPv4/IPv6 device/ route MTUs must already be set correctly. Since for example with an IPv4 egress MTU of 1500, IPv4 to IPv6 translation will already build 1520 byte IPv6 frames, so you need a 1520 byte device MTU. This means if your IPv6 device's egress MTU is 1280, your IPv4 route must be 1260 (and actually 1252, because of the need to handle fragments). This is to handle normal non-GSO packets. Thus the reduction is simply not needed for GSO packets, because when they're correctly built, they will already be the right size. (f) TSO/GSO should be able to exactly undo GRO: the number of packets (TCP segments) should not be modified, so that TCP's MSS counting works correctly (this matters for congestion control). If protocol conversion changes the gso_size, then the number of TCP segments may increase or decrease. Packet loss after protocol conversion can result in partial loss of MSS segments that the sender sent. How's the sending TCP stack going to react to receiving ACKs/SACKs in the middle of the segments it sent? (g) skb_{decrease,increase}_gso_size() are already no-ops for GSO_BY_FRAGS case (besides triggering WARN_ON_ONCE). This means you already cannot guarantee that gso_size (and thus resulting packet MTU) is changed. ie. you must assume it won't be changed. (h) changing gso_size is outright buggy for UDP GSO packets, where framing matters (I believe that's also the case for SCTP, but it's already excluded by [g]). So the only remaining case is TCP, which also doesn't want it (see [f]). (i) see also the reasoning on the previous attempt at fixing this (commit fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e) which shows that the current behaviour causes TCP packet loss: In the forwarding path GRO -> BPF 6 to 4 -> GSO for TCP traffic, the coalesced packet payload can be > MSS, but < MSS + 20. bpf_skb_proto_6_to_4() will upgrade the MSS and it can be > the payload length. After then tcp_gso_segment checks for the payload length if it is <= MSS. The condition is causing the packet to be dropped. tcp_gso_segment(): [...] mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; [...] Thus changing the gso_size is simply a very bad idea. Increasing is unnecessary and buggy, and decreasing can go negative. Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/CANP3RGfjLikQ6dg=YpBU0OeHvyv7JOki7CyOUS9modaXAi-9vQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20210617000953.2787453-2-zenczykowski@gmail.com --- net/core/filter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 243abf519efd..ae92a8bada0f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3263,8 +3263,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) shinfo->gso_type |= SKB_GSO_TCPV6; } - /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3304,8 +3302,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) shinfo->gso_type |= SKB_GSO_TCPV4; } - /* Due to IPv4 header, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; -- cgit v1.2.3-58-ga151 From 0bc919d3e0b8149a60d2444c6a8e2b5974556522 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 16 Jun 2021 17:09:52 -0700 Subject: bpf: Support all gso types in bpf_skb_change_proto() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we no longer modify gso_size, it is now theoretically safe to not set SKB_GSO_DODGY and reset gso_segs to zero. This also means the skb_is_gso_tcp() check should no longer be necessary. Unfortunately we cannot remove the skb_{decrease,increase}_gso_size() helpers, as they are still used elsewhere: bpf_skb_net_grow() without BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_net_shrink() without BPF_F_ADJ_ROOM_FIXED_GSO net/core/lwt_bpf.c's handle_gso_type() Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210617000953.2787453-3-zenczykowski@gmail.com --- net/core/filter.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index ae92a8bada0f..d062053994c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3241,9 +3241,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -3255,17 +3252,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV4 needs to be changed into - * SKB_GSO_TCPV6. - */ + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } - - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -3280,9 +3271,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -3294,17 +3282,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV6 needs to be changed into - * SKB_GSO_TCPV4. - */ + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } - - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); -- cgit v1.2.3-58-ga151 From 782347b6bcad07ddb574422e01e22c92e05928c8 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:55 +0200 Subject: xdp: Add proper __rcu annotations to redirect map entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP_REDIRECT works by a three-step process: the bpf_redirect() and bpf_redirect_map() helpers will lookup the target of the redirect and store it (along with some other metadata) in a per-CPU struct bpf_redirect_info. Next, when the program returns the XDP_REDIRECT return code, the driver will call xdp_do_redirect() which will use the information thus stored to actually enqueue the frame into a bulk queue structure (that differs slightly by map type, but shares the same principle). Finally, before exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will flush all the different bulk queues, thus completing the redirect. Pointers to the map entries will be kept around for this whole sequence of steps, protected by RCU. However, there is no top-level rcu_read_lock() in the core code; instead drivers add their own rcu_read_lock() around the XDP portions of the code, but somewhat inconsistently as Martin discovered[0]. However, things still work because everything happens inside a single NAPI poll sequence, which means it's between a pair of calls to local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could document this intention by using rcu_dereference_check() with rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and lockdep to verify that everything is done correctly. This patch does just that: we add an __rcu annotation to the map entry pointers and remove the various comments explaining the NAPI poll assurance strewn through devmap.c in favour of a longer explanation in filter.c. The goal is to have one coherent documentation of the entire flow, and rely on the RCU annotations as a "standard" way of communicating the flow in the map code (which can additionally be understood by sparse and lockdep). The RCU annotation replacements result in a fairly straight-forward replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE() becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the proper constructs to cast the pointer back and forth between __rcu and __kernel address space (for the benefit of sparse). The one complication is that xskmap has a few constructions where double-pointers are passed back and forth; these simply all gain __rcu annotations, and only the final reference/dereference to the inner-most pointer gets changed. With this, everything can be run through sparse without eliciting complaints, and lockdep can verify correctness even without the use of rcu_read_lock() in the drivers. Subsequent patches will clean these up from the drivers. [0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/ [1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com --- include/linux/filter.h | 8 +++----- include/net/xdp_sock.h | 2 +- kernel/bpf/cpumap.c | 13 +++++++++---- kernel/bpf/devmap.c | 49 +++++++++++++++++++++---------------------------- net/core/filter.c | 28 ++++++++++++++++++++++++++++ net/xdp/xsk.c | 4 ++-- net/xdp/xsk.h | 4 ++-- net/xdp/xskmap.c | 29 +++++++++++++++++------------ 8 files changed, 83 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 688856e0b28a..472f97074da0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -763,11 +763,9 @@ DECLARE_BPF_DISPATCHER(xdp) static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { - /* Caller needs to hold rcu_read_lock() (!), otherwise program - * can be released while still running, or map elements could be - * freed early while still having concurrent users. XDP fastpath - * already takes rcu_read_lock() when fetching the program, so - * it's not necessary here anymore. + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. */ return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); } diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9c0722c6d7ac..fff069d2ed1b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -37,7 +37,7 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; + struct xdp_sock __rcu *xsk_map[]; }; struct xdp_sock { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a1a0c4e791c6..480e936c54d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -74,7 +74,7 @@ struct bpf_cpu_map_entry { struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ - struct bpf_cpu_map_entry **cpu_map; + struct bpf_cpu_map_entry __rcu **cpu_map; }; static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); @@ -469,7 +469,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, { struct bpf_cpu_map_entry *old_rcpu; - old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); @@ -551,7 +551,7 @@ static void cpu_map_free(struct bpf_map *map) for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; - rcpu = READ_ONCE(cmap->cpu_map[i]); + rcpu = rcu_dereference_raw(cmap->cpu_map[i]); if (!rcpu) continue; @@ -562,6 +562,10 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); @@ -570,7 +574,8 @@ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - rcpu = READ_ONCE(cmap->cpu_map[key]); + rcpu = rcu_dereference_check(cmap->cpu_map[key], + rcu_read_lock_bh_held()); return rcpu; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2a75e6c2d27d..2f6bd75cd682 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -73,7 +73,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ + struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -226,7 +226,7 @@ static void dev_map_free(struct bpf_map *map) for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; - dev = dtab->netdev_map[i]; + dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; @@ -259,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -410,15 +414,9 @@ out: trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } -/* __dev_flush is called from xdp_do_flush() which _must_ be signaled - * from the driver before returning from its napi->poll() routine. The poll() - * routine is called either from busy_poll context or net_rx_action signaled - * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the flush list - * is empty before completing to ensure all flush operations have completed. - * When drivers update the bpf program they may need to ensure any flush ops - * are also complete. Using synchronize_rcu or call_rcu will suffice for this - * because both wait for napi context to exit. +/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the + * driver before returning from its napi->poll() routine. See the comment above + * xdp_do_flush() in filter.c. */ void __dev_flush(void) { @@ -433,9 +431,9 @@ void __dev_flush(void) } } -/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or - * update happens in parallel here a dev_put won't happen until after reading - * the ifindex. +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -445,12 +443,14 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - obj = READ_ONCE(dtab->netdev_map[key]); + obj = rcu_dereference_check(dtab->netdev_map[key], + rcu_read_lock_bh_held()); return obj; } -/* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. +/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu + * variable access, and map elements stick around. See comment above + * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) @@ -735,14 +735,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - /* Use call_rcu() here to ensure any rcu critical sections have - * completed as well as any flush operations because call_rcu - * will wait for preempt-disable region to complete, NAPI in this - * context. And additionally, the driver tear down ensures all - * soft irqs are complete before removing the net device in the - * case of dev_put equals zero. - */ - old_dev = xchg(&dtab->netdev_map[k], NULL); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; @@ -851,7 +844,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, * Remembering the driver side flush operation will happen before the * net device is removed. */ - old_dev = xchg(&dtab->netdev_map[i], dev); + old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); @@ -1031,10 +1024,10 @@ static int dev_map_notification(struct notifier_block *notifier, for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; - dev = READ_ONCE(dtab->netdev_map[i]); + dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; - odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); + odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) call_rcu(&dev->rcu, __dev_map_entry_free); diff --git a/net/core/filter.c b/net/core/filter.c index d062053994c7..d22895caa164 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3897,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; +/* XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), + * which will flush all the different bulk queues, thus completing the + * redirect. + * + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that to hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ void xdp_do_flush(void) { __dev_flush(); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd62d4ba87a9..996da915f520 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs) } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, - struct xdp_sock ***map_entry) + struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; @@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ - struct xdp_sock **map_entry = NULL; + struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index edcf249ad1f1..a4bc4749faac 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 { struct xsk_map_node { struct list_head node; struct xsk_map *map; - struct xdp_sock **map_entry; + struct xdp_sock __rcu **map_entry; }; static inline struct xdp_sock *xdp_sk(struct sock *sk) @@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); + struct xdp_sock __rcu **map_entry); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 9df75ea4a567..2e48d0e094d9 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -12,7 +12,7 @@ #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; @@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) } static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; @@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); @@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - return READ_ONCE(m->xsk_map[key]); + return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - WARN_ON_ONCE(!rcu_read_lock_held()); return __xsk_map_lookup_elem(map, *(u32 *)key); } @@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; @@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); + old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; @@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, goto out; } xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); + rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -208,7 +212,8 @@ out: static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *old_xs; int k = *(u32 *)key; if (k >= map->max_entries) @@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); + old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); + if (rcu_access_pointer(*map_entry) == xs) { + rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); -- cgit v1.2.3-58-ga151 From 77151ccf10659d4066074f278402032f3265f0cc Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 24 Jun 2021 18:05:56 +0200 Subject: bpf, sched: Remove unneeded rcu_read_lock() around BPF program invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rcu_read_lock() call in cls_bpf and act_bpf are redundant: on the TX side, there's already a call to rcu_read_lock_bh() in __dev_queue_xmit(), and on RX there's a covering rcu_read_lock() in netif_receive_skb{,_list}_internal(). With the previous patches we also amended the lockdep checks in the map code to not require any particular RCU flavour, so we can just get rid of the rcu_read_lock()s. Suggested-by: Daniel Borkmann Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210624160609.292325-7-toke@redhat.com --- net/sched/act_bpf.c | 2 -- net/sched/cls_bpf.c | 3 --- 2 files changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e48e980c3b93..e409a0005717 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -43,7 +43,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - rcu_read_lock(); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); @@ -56,7 +55,6 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); - rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6e3e63db0e01..fa739efa59f4 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - /* Needed here for accessing maps. */ - rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; @@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, break; } - rcu_read_unlock(); return ret; } -- cgit v1.2.3-58-ga151 From e8b9eab99232c4e62ada9d7976c80fd5e8118289 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Wed, 23 Jun 2021 15:56:45 +0200 Subject: net: retrieve netns cookie via getsocketopt It's getting more common to run nested container environments for testing cloud software. One of such examples is Kind [1] which runs a Kubernetes cluster in Docker containers on a single host. Each container acts as a Kubernetes node, and thus can run any Pod (aka container) inside the former. This approach simplifies testing a lot, as it eliminates complicated VM setups. Unfortunately, such a setup breaks some functionality when cgroupv2 BPF programs are used for load-balancing. The load-balancer BPF program needs to detect whether a request originates from the host netns or a container netns in order to allow some access, e.g. to a service via a loopback IP address. Typically, the programs detect this by comparing netns cookies with the one of the init ns via a call to bpf_get_netns_cookie(NULL). However, in nested environments the latter cannot be used given the Kubernetes node's netns is outside the init ns. To fix this, we need to pass the Kubernetes node netns cookie to the program in a different way: by extending getsockopt() with a SO_NETNS_COOKIE option, the orchestrator which runs in the Kubernetes node netns can retrieve the cookie and pass it to the program instead. Thus, this is following up on Eric's commit 3d368ab87cf6 ("net: initialize net->net_cookie at netns setup") to allow retrieval via SO_NETNS_COOKIE. This is also in line in how we retrieve socket cookie via SO_COOKIE. [1] https://kind.sigs.k8s.io/ Signed-off-by: Lorenz Bauer Signed-off-by: Martynas Pumputis Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 7 +++++++ 6 files changed, 17 insertions(+) (limited to 'net') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 57420356ce4c..6b3daba60987 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -127,6 +127,8 @@ #define SO_PREFER_BUSY_POLL 69 #define SO_BUSY_POLL_BUDGET 70 +#define SO_NETNS_COOKIE 71 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 2d949969313b..cdf404a831b2 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -138,6 +138,8 @@ #define SO_PREFER_BUSY_POLL 69 #define SO_BUSY_POLL_BUDGET 70 +#define SO_NETNS_COOKIE 71 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index f60904329bbc..5b5351cdcb33 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -119,6 +119,8 @@ #define SO_PREFER_BUSY_POLL 0x4043 #define SO_BUSY_POLL_BUDGET 0x4044 +#define SO_NETNS_COOKIE 0x4045 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 848a22fbac20..92675dc380fa 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -120,6 +120,8 @@ #define SO_PREFER_BUSY_POLL 0x0048 #define SO_BUSY_POLL_BUDGET 0x0049 +#define SO_NETNS_COOKIE 0x0050 + #if !defined(__KERNEL__) diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 4dcd13d097a9..d588c244ec2f 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -122,6 +122,8 @@ #define SO_PREFER_BUSY_POLL 69 #define SO_BUSY_POLL_BUDGET 70 +#define SO_NETNS_COOKIE 71 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index ddfa88082a2b..a2337b37eba6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1635,6 +1635,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_bound_dev_if; break; + case SO_NETNS_COOKIE: + lv = sizeof(u64); + if (len != lv) + return -EINVAL; + v.val64 = sock_net(sk)->net_cookie; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit v1.2.3-58-ga151 From 6d123b81ac615072a8525c13c6c41b695270a15d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 23 Jun 2021 14:44:38 -0700 Subject: net: ip: avoid OOM kills with large UDP sends over loopback Dave observed number of machines hitting OOM on the UDP send path. The workload seems to be sending large UDP packets over loopback. Since loopback has MTU of 64k kernel will try to allocate an skb with up to 64k of head space. This has a good chance of failing under memory pressure. What's worse if the message length is <32k the allocation may trigger an OOM killer. This is entirely avoidable, we can use an skb with page frags. af_unix solves a similar problem by limiting the head length to SKB_MAX_ALLOC. This seems like a good and simple approach. It means that UDP messages > 16kB will now use fragments if underlying device supports SG, if extra allocator pressure causes regressions in real workloads we can switch to trying the large allocation first and falling back. v4: pre-calculate all the additions to alloclen so we can be sure it won't go over order-2 Reported-by: Dave Jones Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 32 ++++++++++++++++++-------------- net/ipv6/ip6_output.c | 32 +++++++++++++++++--------------- 2 files changed, 35 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c3efc7d658f6..8d8a8da3ae7e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1054,7 +1054,7 @@ static int __ip_append_data(struct sock *sk, unsigned int datalen; unsigned int fraglen; unsigned int fraggap; - unsigned int alloclen; + unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: @@ -1074,35 +1074,39 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; pagedlen = 0; + alloc_extra = hh_len + 15; + alloc_extra += exthdrlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length + fraggap) + alloc_extra += rt->dst.trailer_len; + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else if (!paged) + else if (!paged && + (fraglen + alloc_extra < SKB_MAX_ALLOC || + !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } - alloclen += exthdrlen; - - /* The last fragment gets additional space at tail. - * Note, with MSG_MORE we overallocate on fragments, - * because we have no idea what fragment will be - * the last. - */ - if (datalen == length + fraggap) - alloclen += rt->dst.trailer_len; + alloclen += alloc_extra; if (transhdrlen) { - skb = sock_alloc_send_skb(sk, - alloclen + hh_len + 15, + skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = alloc_skb(alloclen + hh_len + 15, + skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff4f9ebcf7f6..497974b4372a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1555,7 +1555,7 @@ emsgsize: unsigned int datalen; unsigned int fraglen; unsigned int fraggap; - unsigned int alloclen; + unsigned int alloclen, alloc_extra; unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ @@ -1582,17 +1582,28 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; pagedlen = 0; + alloc_extra = hh_len; + alloc_extra += dst_exthdrlen; + alloc_extra += rt->dst.trailer_len; + + /* We just reserve space for fragment header. + * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloc_extra += sizeof(struct frag_hdr); + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else if (!paged) + else if (!paged && + (fraglen + alloc_extra < SKB_MAX_ALLOC || + !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } - - alloclen += dst_exthdrlen; + alloclen += alloc_extra; if (datalen != length + fraggap) { /* @@ -1602,30 +1613,21 @@ alloc_new_skb: datalen += rt->dst.trailer_len; } - alloclen += rt->dst.trailer_len; fraglen = datalen + fragheaderlen; - /* - * We just reserve space for fragment header. - * Note: this may be overallocation if the message - * (without MSG_MORE) fits into the MTU. - */ - alloclen += sizeof(struct frag_hdr); - copy = datalen - transhdrlen - fraggap - pagedlen; if (copy < 0) { err = -EINVAL; goto error; } if (transhdrlen) { - skb = sock_alloc_send_skb(sk, - alloclen + hh_len, + skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = alloc_skb(alloclen + hh_len, + skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; -- cgit v1.2.3-58-ga151 From 624085a31c1ad6a80b1e53f686bf6ee92abbf6e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Jun 2021 03:07:20 -0700 Subject: ipv6: fix out-of-bound access in ip6_parse_tlv() First problem is that optlen is fetched without checking there is more than one byte to parse. Fix this by taking care of IPV6_TLV_PAD1 before fetching optlen (under appropriate sanity checks against len) Second problem is that IPV6_TLV_PADN checks of zero padding are performed before the check of remaining length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: c1412fce7ecc ("net/ipv6/exthdrs.c: Strict PadN option checking") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6f7da8f3e2e5..26882e165c9e 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -135,18 +135,23 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, len -= 2; while (len > 0) { - int optlen = nh[off + 1] + 2; - int i; + int optlen, i; - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; + if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; - break; + off++; + len--; + continue; + } + if (len < 2) + goto bad; + optlen = nh[off + 1] + 2; + if (optlen > len) + goto bad; - case IPV6_TLV_PADN: + if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. @@ -163,12 +168,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, if (nh[off + i] != 0) goto bad; } - break; - - default: /* Other TLV code so scan list */ - if (optlen > len) - goto bad; - + } else { tlv_count++; if (tlv_count > max_count) goto bad; @@ -188,7 +188,6 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, return false; padlen = 0; - break; } off += optlen; len -= optlen; -- cgit v1.2.3-58-ga151 From 0dac127c05579854405ef14480936b32371ddaed Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 24 Jun 2021 11:48:08 -0400 Subject: sctp: do black hole detection in search complete state Currently the PLPMUTD probe will stop for a long period (interval * 30) after it enters search complete state. If there's a pmtu change on the route path, it takes a long time to be aware if the ICMP TooBig packet is lost or filtered. As it says in rfc8899#section-4.3: "A DPLPMTUD method MUST NOT rely solely on this method." (ICMP PTB message). This patch is to enable the other method for search complete state: "A PL can use the DPLPMTUD probing mechanism to periodically generate probe packets of the size of the current PLPMTU." With this patch, the probe will continue with the current pmtu every 'interval' until the PMTU_RAISE_TIMER 'timeout', which we implement by adding raise_count to raise the probe size when it counts to 30 and removing the SCTP_PL_COMPLETE check for PMTU_RAISE_TIMER. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- net/sctp/transport.c | 11 ++++------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9eaa701cda23..c4a4c1754be8 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -987,7 +987,8 @@ struct sctp_transport { __u16 pmtu; __u16 probe_size; __u16 probe_high; - __u8 probe_count; + __u8 probe_count:3; + __u8 raise_count:5; __u8 state; } pl; /* plpmtud related */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index f27b856ea8ce..5f23804f21c7 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -213,15 +213,10 @@ void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) void sctp_transport_reset_probe_timer(struct sctp_transport *transport) { - int scale = 1; - if (timer_pending(&transport->probe_timer)) return; - if (transport->pl.state == SCTP_PL_COMPLETE && - transport->pl.probe_count == 1) - scale = 30; /* works as PMTU_RAISE_TIMER */ if (!mod_timer(&transport->probe_timer, - jiffies + transport->probe_interval * scale)) + jiffies + transport->probe_interval)) sctp_transport_hold(transport); } @@ -333,13 +328,15 @@ void sctp_transport_pl_recv(struct sctp_transport *t) t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { t->pl.probe_high = 0; + t->pl.raise_count = 0; t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ t->pl.probe_size = t->pl.pmtu; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } - } else if (t->pl.state == SCTP_PL_COMPLETE) { + } else if (t->pl.state == SCTP_PL_COMPLETE && ++t->pl.raise_count == 30) { + /* Raise probe_size again after 30 * interval in Search Complete */ t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ t->pl.probe_size += SCTP_PL_MIN_STEP; } -- cgit v1.2.3-58-ga151 From fea1d5b17f821b78abbdadb9cb6f28fe433b635e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 24 Jun 2021 11:48:09 -0400 Subject: sctp: send the next probe immediately once the last one is acked These is no need to wait for 'interval' period for the next probe if the last probe is already acked in search state. The 'interval' period waiting should be only for probe failure timeout and the current pmtu check when it's in search complete state. This change will shorten the probe time a lot in search state, and also fix the document accordingly. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 12 ++++++++---- net/sctp/sm_statefuns.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 8bff728b3a1e..b3fa522e4cd9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2835,10 +2835,14 @@ encap_port - INTEGER Default: 0 plpmtud_probe_interval - INTEGER - The time interval (in milliseconds) for sending PLPMTUD probe chunks. - These chunks are sent at the specified interval with a variable size - to probe the mtu of a given path between 2 endpoints. PLPMTUD will - be disabled when 0 is set, and other values for it must be >= 5000. + The time interval (in milliseconds) for the PLPMTUD probe timer, + which is configured to expire after this period to receive an + acknowledgment to a probe packet. This is also the time interval + between the probes for the current pmtu when the probe search + is done. + + PLPMTUD will be disabled when 0 is set, and other values for it + must be >= 5000. Default: 0 diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d29b579da904..09a8f23ec709 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1275,7 +1275,10 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, return SCTP_DISPOSITION_DISCARD; sctp_transport_pl_recv(link); - return SCTP_DISPOSITION_CONSUME; + if (link->pl.state == SCTP_PL_COMPLETE) + return SCTP_DISPOSITION_CONSUME; + + return sctp_sf_send_probe(net, ep, asoc, type, link, commands); } max_interval = link->hbinterval + link->rto; -- cgit v1.2.3-58-ga151 From c305b9e6d553f73b8434dd781736d180d63b1d64 Mon Sep 17 00:00:00 2001 From: zhang kai Date: Thu, 24 Jun 2021 11:09:14 +0800 Subject: ipv6: delete useless dst check in ip6_dst_lookup_tail parameter dst always points to null. Signed-off-by: zhang kai Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 497974b4372a..984050f35c61 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1055,13 +1055,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * ip6_route_output will fail given src=any saddr, though, so * that's why we try it again later. */ - if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + if (ipv6_addr_any(&fl6->saddr)) { struct fib6_info *from; struct rt6_info *rt; - bool had_dst = *dst != NULL; - if (!had_dst) - *dst = ip6_route_output(net, sk, fl6); + *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; rcu_read_lock(); @@ -1078,7 +1076,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * never existed and let the SA-enabled version take * over. */ - if (!had_dst && (*dst)->error) { + if ((*dst)->error) { dst_release(*dst); *dst = NULL; } -- cgit v1.2.3-58-ga151 From a196fa78a26571359740f701cf30d774eb8a72cb Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 23 Jun 2021 12:09:18 +0800 Subject: bpfilter: Specify the log level for the kmsg message Per the kmsg document [0], if we don't specify the log level with a prefix "" in the message string, the default log level will be applied to the message. Since the default level could be warning(4), this would make the log utility such as journalctl treat the message, "Started bpfilter", as a warning. To avoid confusion, this commit adds the prefix "<5>" to make the message always a notice. [0] https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg Fixes: 36c4357c63f3 ("net: bpfilter: print umh messages to /dev/kmsg") Reported-by: Martin Loviska Signed-off-by: Gary Lin Signed-off-by: Daniel Borkmann Acked-by: Dmitrii Banshchikov Link: https://lore.kernel.org/bpf/20210623040918.8683-1-glin@suse.com --- net/bpfilter/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 05e1cfc1e5cd..291a92546246 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -57,7 +57,7 @@ int main(void) { debug_f = fopen("/dev/kmsg", "w"); setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "Started bpfilter\n"); + fprintf(debug_f, "<5>Started bpfilter\n"); loop(); fclose(debug_f); return 0; -- cgit v1.2.3-58-ga151 From 17081633e22d83be928a779fd7acd04b247dec90 Mon Sep 17 00:00:00 2001 From: Guvenc Gulce Date: Fri, 25 Jun 2021 17:11:02 +0200 Subject: net/smc: Ensure correct state of the socket in send path When smc_sendmsg() is called before the SMC socket initialization has completed, smc_tx_sendmsg() will access un-initialized fields of the SMC socket which results in a null-pointer dereference. Fix this by checking the socket state first in smc_tx_sendmsg(). Fixes: e0e4b8fa5338 ("net/smc: Add SMC statistics support") Reported-by: syzbot+5dda108b672b54141857@syzkaller.appspotmail.com Reviewed-by: Karsten Graul Signed-off-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 075c4f4b41cf..289025cd545a 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -154,6 +154,9 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) goto out_err; } + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + if (len > conn->sndbuf_desc->len) SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); @@ -164,8 +167,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) SMC_STAT_INC(smc, urg_data_cnt); while (msg_data_left(msg)) { - if (sk->sk_state == SMC_INIT) - return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) -- cgit v1.2.3-58-ga151 From c469c9c9733cc92bef6d4bf2c0f5bea0550abf4d Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Fri, 2 Apr 2021 18:46:29 +0800 Subject: Bluetooth: 6lowpan: delete unneeded variable initialization Delete unneeded variable initialization. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 97617d02c8f9..d5befa061aa2 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -691,7 +691,7 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; - int err = 0; + int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, -- cgit v1.2.3-58-ga151 From 07d85dbe411a1194eef5b70f1a5d070ee1e226a5 Mon Sep 17 00:00:00 2001 From: Qiheng Lin Date: Sat, 10 Apr 2021 10:19:35 +0800 Subject: Bluetooth: use flexible-array member instead of zero-length array Fix the following coccicheck warning: net/bluetooth/msft.c:37:6-13: WARNING use flexible-array member instead net/bluetooth/msft.c:42:6-10: WARNING use flexible-array member instead net/bluetooth/msft.c:52:6-10: WARNING use flexible-array member instead Signed-off-by: Qiheng Lin Signed-off-by: Marcel Holtmann --- net/bluetooth/msft.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index e28f15439ce4..37a394786a94 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -34,12 +34,12 @@ struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; - __u8 pattern[0]; + __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; - __u8 data[0]; + __u8 data[]; }; struct msft_cp_le_monitor_advertisement { @@ -49,7 +49,7 @@ struct msft_cp_le_monitor_advertisement { __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; - __u8 data[0]; + __u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { -- cgit v1.2.3-58-ga151 From 1c6ed31b1696d9b5462ba5ce15b83f5ea955600c Mon Sep 17 00:00:00 2001 From: Yu Liu Date: Fri, 9 Apr 2021 15:04:06 -0700 Subject: Bluetooth: Return whether a connection is outbound When an MGMT_EV_DEVICE_CONNECTED event is reported back to the user space we will set the flags to tell if the established connection is outbound or not. This is useful for the user space to log better metrics and error messages. Reviewed-by: Miao-chen Chou Reviewed-by: Alain Michaud Signed-off-by: Yu Liu Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- include/net/bluetooth/mgmt.h | 1 + net/bluetooth/hci_event.c | 8 ++++---- net/bluetooth/l2cap_core.c | 2 +- net/bluetooth/mgmt.c | 6 +++++- 5 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c73ac52af186..8f5f390363f5 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1768,7 +1768,7 @@ void __mgmt_power_off(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, - u32 flags, u8 *name, u8 name_len); + u8 *name, u8 name_len); void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index a7cffb069565..a03c62b1dc2f 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -939,6 +939,7 @@ struct mgmt_ev_auth_failed { #define MGMT_DEV_FOUND_CONFIRM_NAME 0x01 #define MGMT_DEV_FOUND_LEGACY_PAIRING 0x02 #define MGMT_DEV_FOUND_NOT_CONNECTABLE 0x04 +#define MGMT_DEV_FOUND_INITIATED_CONN 0x08 #define MGMT_EV_DEVICE_FOUND 0x0012 struct mgmt_ev_device_found { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ea06b010ccad..59c5329354e1 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2069,7 +2069,7 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, if (conn && (conn->state == BT_CONFIG || conn->state == BT_CONNECTED) && !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, name, name_len); + mgmt_device_connected(hdev, conn, name, name_len); if (discov->state == DISCOVERY_STOPPED) return; @@ -3256,7 +3256,7 @@ static void hci_remote_features_evt(struct hci_dev *hdev, cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; @@ -4330,7 +4330,7 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; @@ -5204,7 +5204,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, } if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); conn->sec_level = BT_SECURITY_LOW; conn->handle = handle; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b6a88b8256c7..7d975cf98c20 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4237,7 +4237,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, hci_dev_lock(hdev); if (hci_dev_test_flag(hdev, HCI_MGMT) && !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) - mgmt_device_connected(hdev, hcon, 0, NULL, 0); + mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f290d0c54d32..f6e510d06bec 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8767,15 +8767,19 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, } void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, - u32 flags, u8 *name, u8 name_len) + u8 *name, u8 name_len) { char buf[512]; struct mgmt_ev_device_connected *ev = (void *) buf; u16 eir_len = 0; + u32 flags = 0; bacpy(&ev->addr.bdaddr, &conn->dst); ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type); + if (conn->out) + flags |= MGMT_DEV_FOUND_INITIATED_CONN; + ev->flags = __cpu_to_le32(flags); /* We must ensure that the EIR Data fields are ordered and -- cgit v1.2.3-58-ga151 From 3cfdf8fcaafa62a4123f92eb0f4a72650da3a479 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 13 Apr 2021 13:21:03 -0300 Subject: Bluetooth: cmtp: fix file refcount when cmtp_attach_device fails When cmtp_attach_device fails, cmtp_add_connection returns the error value which leads to the caller to doing fput through sockfd_put. But cmtp_session kthread, which is stopped in this path will also call fput, leading to a potential refcount underflow or a use-after-free. Add a refcount before we signal the kthread to stop. The kthread will try to grab the cmtp_session_sem mutex before doing the fput, which is held when get_file is called, so there should be no races there. Reported-by: Ryota Shiga Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Marcel Holtmann --- net/bluetooth/cmtp/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 07cfa3249f83..0a2d78e811cf 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -392,6 +392,11 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) if (!(session->flags & BIT(CMTP_LOOPBACK))) { err = cmtp_attach_device(session); if (err < 0) { + /* Caller will call fput in case of failure, and so + * will cmtp_session kthread. + */ + get_file(session->sock->file); + atomic_inc(&session->terminate); wake_up_interruptible(sk_sleep(session->sock->sk)); up_write(&cmtp_session_sem); -- cgit v1.2.3-58-ga151 From 4ef36a52b0e47c80bbfd69c0cce61c7ae9f541ed Mon Sep 17 00:00:00 2001 From: Yu Liu Date: Mon, 19 Apr 2021 16:53:30 -0700 Subject: Bluetooth: Fix the HCI to MGMT status conversion table 0x2B, 0x31 and 0x33 are reserved for future use but were not present in the HCI to MGMT conversion table, this caused the conversion to be incorrect for the HCI status code greater than 0x2A. Reviewed-by: Miao-chen Chou Signed-off-by: Yu Liu Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f6e510d06bec..a81cf1b8b2e8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -252,12 +252,15 @@ static const u8 mgmt_status_table[] = { MGMT_STATUS_TIMEOUT, /* Instant Passed */ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */ MGMT_STATUS_FAILED, /* Transaction Collision */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */ MGMT_STATUS_REJECTED, /* QoS Rejected */ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */ MGMT_STATUS_REJECTED, /* Insufficient Security */ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_BUSY, /* Role Switch Pending */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_FAILED, /* Slot Violation */ MGMT_STATUS_FAILED, /* Role Switch Failed */ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */ -- cgit v1.2.3-58-ga151 From de75cd0d9b2f3250d5f25846bb5632ccce6275f4 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Thu, 29 Apr 2021 10:24:22 -0700 Subject: Bluetooth: Add ncmd=0 recovery handling During command status or command complete event, the controller may set ncmd=0 indicating that it is not accepting any more commands. In such a case, host holds off sending any more commands to the controller. If the controller doesn't recover from such condition, host will wait forever, until the user decides that the Bluetooth is broken and may power cycles the Bluetooth. This patch triggers the hardware error to reset the controller and driver when it gets into such state as there is no other wat out. Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Manish Mandlik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 22 ++++++++++++++++++++++ net/bluetooth/hci_event.c | 29 +++++++++++++++++++---------- 4 files changed, 43 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ea4ae551c426..c4b0650fb9ae 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -339,6 +339,7 @@ enum { #define HCI_PAIRING_TIMEOUT msecs_to_jiffies(60000) /* 60 seconds */ #define HCI_INIT_TIMEOUT msecs_to_jiffies(10000) /* 10 seconds */ #define HCI_CMD_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8f5f390363f5..43b08bebae74 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -470,6 +470,7 @@ struct hci_dev { struct delayed_work service_cache; struct delayed_work cmd_timer; + struct delayed_work ncmd_timer; struct work_struct rx_work; struct work_struct cmd_work; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 25484bb0773d..572f2362ddb7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1730,6 +1730,7 @@ int hci_dev_do_close(struct hci_dev *hdev) } cancel_delayed_work(&hdev->power_off); + cancel_delayed_work(&hdev->ncmd_timer); hci_request_cancel_all(hdev); hci_req_sync_lock(hdev); @@ -2777,6 +2778,24 @@ static void hci_cmd_timeout(struct work_struct *work) queue_work(hdev->workqueue, &hdev->cmd_work); } +/* HCI ncmd timer function */ +static void hci_ncmd_timeout(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + ncmd_timer.work); + + bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); + + /* During HCI_INIT phase no events can be injected if the ncmd timer + * triggers since the procedure has its own timeout handling. + */ + if (test_bit(HCI_INIT, &hdev->flags)) + return; + + /* This is an irrecoverable state, inject hardware error event */ + hci_reset_dev(hdev); +} + struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { @@ -3841,6 +3860,7 @@ struct hci_dev *hci_alloc_dev(void) init_waitqueue_head(&hdev->suspend_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); + INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_request_setup(hdev); @@ -4078,6 +4098,8 @@ int hci_reset_dev(struct hci_dev *hdev) hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); + bt_dev_err(hdev, "Injecting HCI hardware error event"); + /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 59c5329354e1..18339ebc5959 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3268,6 +3268,23 @@ unlock: hci_dev_unlock(hdev); } +static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, + u16 opcode, u8 ncmd) +{ + if (opcode != HCI_OP_NOP) + cancel_delayed_work(&hdev->cmd_timer); + + if (!test_bit(HCI_RESET, &hdev->flags)) { + if (ncmd) { + cancel_delayed_work(&hdev->ncmd_timer); + atomic_set(&hdev->cmd_cnt, 1); + } else { + schedule_delayed_work(&hdev->ncmd_timer, + HCI_NCMD_TIMEOUT); + } + } +} + static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, u16 *opcode, u8 *status, hci_req_complete_t *req_complete, @@ -3630,11 +3647,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - if (*opcode != HCI_OP_NOP) - cancel_delayed_work(&hdev->cmd_timer); - - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) - atomic_set(&hdev->cmd_cnt, 1); + handle_cmd_cnt_and_timer(hdev, *opcode, ev->ncmd); hci_req_cmd_complete(hdev, *opcode, *status, req_complete, req_complete_skb); @@ -3735,11 +3748,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - if (*opcode != HCI_OP_NOP) - cancel_delayed_work(&hdev->cmd_timer); - - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) - atomic_set(&hdev->cmd_cnt, 1); + handle_cmd_cnt_and_timer(hdev, *opcode, ev->ncmd); /* Indicate request completion if the command failed. Also, if * we're not waiting for a special event and we get a success -- cgit v1.2.3-58-ga151 From b0e56db78744000a26b03fb442d6f944f68a8386 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:49:28 +0800 Subject: Bluetooth: 6lowpan: remove unused function Fix the following clang warning: net/bluetooth/6lowpan.c:913:20: warning: unused function 'bdaddr_type' [-Wunused-function]. net/bluetooth/6lowpan.c:106:35: warning: unused function 'peer_lookup_ba' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index d5befa061aa2..29c96bc5733f 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -103,34 +103,6 @@ static inline bool peer_del(struct lowpan_btle_dev *dev, return false; } -static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev, - bdaddr_t *ba, __u8 type) -{ - struct lowpan_peer *peer; - - BT_DBG("peers %d addr %pMR type %d", atomic_read(&dev->peer_count), - ba, type); - - rcu_read_lock(); - - list_for_each_entry_rcu(peer, &dev->peers, list) { - BT_DBG("dst addr %pMR dst type %d", - &peer->chan->dst, peer->chan->dst_type); - - if (bacmp(&peer->chan->dst, ba)) - continue; - - if (type == peer->chan->dst_type) { - rcu_read_unlock(); - return peer; - } - } - - rcu_read_unlock(); - - return NULL; -} - static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { @@ -907,14 +879,6 @@ static const struct l2cap_ops bt_6lowpan_chan_ops = { .set_shutdown = l2cap_chan_no_set_shutdown, }; -static inline __u8 bdaddr_type(__u8 type) -{ - if (type == ADDR_LE_DEV_PUBLIC) - return BDADDR_LE_PUBLIC; - else - return BDADDR_LE_RANDOM; -} - static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; -- cgit v1.2.3-58-ga151 From 06d213d8a89a6f55b708422c3dda2b22add10748 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Thu, 8 Apr 2021 22:31:59 +0530 Subject: Bluetooth: Fix alt settings for incoming SCO with transparent coding format For incoming SCO connection with transparent coding format, alt setting of CVSD is getting applied instead of Transparent. Before fix: < HCI Command: Accept Synchron.. (0x01|0x0029) plen 21 #2196 [hci0] 321.342548 Address: 1C:CC:D6:E2:EA:80 (Xiaomi Communications Co Ltd) Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 13 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x003f HV1 may be used HV2 may be used HV3 may be used EV3 may be used EV4 may be used EV5 may be used > HCI Event: Command Status (0x0f) plen 4 #2197 [hci0] 321.343585 Accept Synchronous Connection Request (0x01|0x0029) ncmd 1 Status: Success (0x00) > HCI Event: Synchronous Connect Comp.. (0x2c) plen 17 #2198 [hci0] 321.351666 Status: Success (0x00) Handle: 257 Address: 1C:CC:D6:E2:EA:80 (Xiaomi Communications Co Ltd) Link type: eSCO (0x02) Transmission interval: 0x0c Retransmission window: 0x04 RX packet length: 60 TX packet length: 60 Air mode: Transparent (0x03) ........ > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2336 [hci0] 321.383655 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #2337 [hci0] 321.389558 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2338 [hci0] 321.393615 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2339 [hci0] 321.393618 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2340 [hci0] 321.393618 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #2341 [hci0] 321.397070 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2342 [hci0] 321.403622 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2343 [hci0] 321.403625 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2344 [hci0] 321.403625 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2345 [hci0] 321.403625 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #2346 [hci0] 321.404569 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #2347 [hci0] 321.412091 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2348 [hci0] 321.413626 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2349 [hci0] 321.413630 > SCO Data RX: Handle 257 flags 0x00 dlen 48 #2350 [hci0] 321.413630 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #2351 [hci0] 321.419674 After fix: < HCI Command: Accept Synchronou.. (0x01|0x0029) plen 21 #309 [hci0] 49.439693 Address: 1C:CC:D6:E2:EA:80 (Xiaomi Communications Co Ltd) Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 13 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x003f HV1 may be used HV2 may be used HV3 may be used EV3 may be used EV4 may be used EV5 may be used > HCI Event: Command Status (0x0f) plen 4 #310 [hci0] 49.440308 Accept Synchronous Connection Request (0x01|0x0029) ncmd 1 Status: Success (0x00) > HCI Event: Synchronous Connect Complete (0x2c) plen 17 #311 [hci0] 49.449308 Status: Success (0x00) Handle: 257 Address: 1C:CC:D6:E2:EA:80 (Xiaomi Communications Co Ltd) Link type: eSCO (0x02) Transmission interval: 0x0c Retransmission window: 0x04 RX packet length: 60 TX packet length: 60 Air mode: Transparent (0x03) < SCO Data TX: Handle 257 flags 0x00 dlen 60 #312 [hci0] 49.450421 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #313 [hci0] 49.457927 > HCI Event: Max Slots Change (0x1b) plen 3 #314 [hci0] 49.460345 Handle: 256 Max slots: 5 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #315 [hci0] 49.465453 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #316 [hci0] 49.470502 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #317 [hci0] 49.470519 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #318 [hci0] 49.472996 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #319 [hci0] 49.480412 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #320 [hci0] 49.480492 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #321 [hci0] 49.487989 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #322 [hci0] 49.490303 < SCO Data TX: Handle 257 flags 0x00 dlen 60 #323 [hci0] 49.495496 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #324 [hci0] 49.500304 > SCO Data RX: Handle 257 flags 0x00 dlen 60 #325 [hci0] 49.500311 Signed-off-by: Kiran K Signed-off-by: Lokendra Singh Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 18339ebc5959..d73950441938 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4413,12 +4413,12 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); - switch (conn->setting & SCO_AIRMODE_MASK) { - case SCO_AIRMODE_CVSD: + switch (ev->air_mode) { + case 0x02: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); break; - case SCO_AIRMODE_TRANSP: + case 0x03: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); break; -- cgit v1.2.3-58-ga151 From 0ea9fd001a14ebc294f112b0361a4e601551d508 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 14 May 2021 15:14:52 +0800 Subject: Bluetooth: Shutdown controller after workqueues are flushed or cancelled Rfkill block and unblock Intel USB Bluetooth [8087:0026] may make it stops working: [ 509.691509] Bluetooth: hci0: HCI reset during shutdown failed [ 514.897584] Bluetooth: hci0: MSFT filter_enable is already on [ 530.044751] usb 3-10: reset full-speed USB device number 5 using xhci_hcd [ 545.660350] usb 3-10: device descriptor read/64, error -110 [ 561.283530] usb 3-10: device descriptor read/64, error -110 [ 561.519682] usb 3-10: reset full-speed USB device number 5 using xhci_hcd [ 566.686650] Bluetooth: hci0: unexpected event for opcode 0x0500 [ 568.752452] Bluetooth: hci0: urb 0000000096cd309b failed to resubmit (113) [ 578.797955] Bluetooth: hci0: Failed to read MSFT supported features (-110) [ 586.286565] Bluetooth: hci0: urb 00000000c522f633 failed to resubmit (113) [ 596.215302] Bluetooth: hci0: Failed to read MSFT supported features (-110) Or kernel panics because other workqueues already freed skb: [ 2048.663763] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 2048.663775] #PF: supervisor read access in kernel mode [ 2048.663779] #PF: error_code(0x0000) - not-present page [ 2048.663782] PGD 0 P4D 0 [ 2048.663787] Oops: 0000 [#1] SMP NOPTI [ 2048.663793] CPU: 3 PID: 4491 Comm: rfkill Tainted: G W 5.13.0-rc1-next-20210510+ #20 [ 2048.663799] Hardware name: HP HP EliteBook 850 G8 Notebook PC/8846, BIOS T76 Ver. 01.01.04 12/02/2020 [ 2048.663801] RIP: 0010:__skb_ext_put+0x6/0x50 [ 2048.663814] Code: 8b 1b 48 85 db 75 db 5b 41 5c 5d c3 be 01 00 00 00 e8 de 13 c0 ff eb e7 be 02 00 00 00 e8 d2 13 c0 ff eb db 0f 1f 44 00 00 55 <8b> 07 48 89 e5 83 f8 01 74 14 b8 ff ff ff ff f0 0f c1 07 83 f8 01 [ 2048.663819] RSP: 0018:ffffc1d105b6fd80 EFLAGS: 00010286 [ 2048.663824] RAX: 0000000000000000 RBX: ffff9d9ac5649000 RCX: 0000000000000000 [ 2048.663827] RDX: ffffffffc0d1daf6 RSI: 0000000000000206 RDI: 0000000000000000 [ 2048.663830] RBP: ffffc1d105b6fd98 R08: 0000000000000001 R09: ffff9d9ace8ceac0 [ 2048.663834] R10: ffff9d9ace8ceac0 R11: 0000000000000001 R12: ffff9d9ac5649000 [ 2048.663838] R13: 0000000000000000 R14: 00007ffe0354d650 R15: 0000000000000000 [ 2048.663843] FS: 00007fe02ab19740(0000) GS:ffff9d9e5f8c0000(0000) knlGS:0000000000000000 [ 2048.663849] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2048.663853] CR2: 0000000000000000 CR3: 0000000111a52004 CR4: 0000000000770ee0 [ 2048.663856] PKRU: 55555554 [ 2048.663859] Call Trace: [ 2048.663865] ? skb_release_head_state+0x5e/0x80 [ 2048.663873] kfree_skb+0x2f/0xb0 [ 2048.663881] btusb_shutdown_intel_new+0x36/0x60 [btusb] [ 2048.663905] hci_dev_do_close+0x48c/0x5e0 [bluetooth] [ 2048.663954] ? __cond_resched+0x1a/0x50 [ 2048.663962] hci_rfkill_set_block+0x56/0xa0 [bluetooth] [ 2048.664007] rfkill_set_block+0x98/0x170 [ 2048.664016] rfkill_fop_write+0x136/0x1e0 [ 2048.664022] vfs_write+0xc7/0x260 [ 2048.664030] ksys_write+0xb1/0xe0 [ 2048.664035] ? exit_to_user_mode_prepare+0x37/0x1c0 [ 2048.664042] __x64_sys_write+0x1a/0x20 [ 2048.664048] do_syscall_64+0x40/0xb0 [ 2048.664055] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 2048.664060] RIP: 0033:0x7fe02ac23c27 [ 2048.664066] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [ 2048.664070] RSP: 002b:00007ffe0354d638 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 2048.664075] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe02ac23c27 [ 2048.664078] RDX: 0000000000000008 RSI: 00007ffe0354d650 RDI: 0000000000000003 [ 2048.664081] RBP: 0000000000000000 R08: 0000559b05998440 R09: 0000559b05998440 [ 2048.664084] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 [ 2048.664086] R13: 0000000000000000 R14: ffffffff00000000 R15: 00000000ffffffff So move the shutdown callback to a place where workqueues are either flushed or cancelled to resolve the issue. Signed-off-by: Kai-Heng Feng Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 572f2362ddb7..cdf147899c50 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1721,14 +1721,6 @@ int hci_dev_do_close(struct hci_dev *hdev) BT_DBG("%s %p", hdev->name, hdev); - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - test_bit(HCI_UP, &hdev->flags)) { - /* Execute vendor specific shutdown routine */ - if (hdev->shutdown) - hdev->shutdown(hdev); - } - cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); @@ -1806,6 +1798,14 @@ int hci_dev_do_close(struct hci_dev *hdev) clear_bit(HCI_INIT, &hdev->flags); } + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + hdev->shutdown(hdev); + } + /* flush cmd work */ flush_work(&hdev->cmd_work); -- cgit v1.2.3-58-ga151 From 1c58e933aba23f68c0d3f192f7cc6eed8fabd694 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Tue, 18 May 2021 16:54:36 +0200 Subject: Bluetooth: Remove spurious error message Even with rate limited reporting this is very spammy and since it is remote device that is providing bogus data there is no need to report this as error. Since real_len variable was used only to allow conditional error message it is now also removed. [72454.143336] bt_err_ratelimited: 10 callbacks suppressed [72454.143337] Bluetooth: hci0: advertising data len corrected [72454.296314] Bluetooth: hci0: advertising data len corrected [72454.892329] Bluetooth: hci0: advertising data len corrected [72455.051319] Bluetooth: hci0: advertising data len corrected [72455.357326] Bluetooth: hci0: advertising data len corrected [72455.663295] Bluetooth: hci0: advertising data len corrected [72455.787278] Bluetooth: hci0: advertising data len corrected [72455.942278] Bluetooth: hci0: advertising data len corrected [72456.094276] Bluetooth: hci0: advertising data len corrected [72456.249137] Bluetooth: hci0: advertising data len corrected [72459.416333] bt_err_ratelimited: 13 callbacks suppressed [72459.416334] Bluetooth: hci0: advertising data len corrected [72459.721334] Bluetooth: hci0: advertising data len corrected [72460.011317] Bluetooth: hci0: advertising data len corrected [72460.327171] Bluetooth: hci0: advertising data len corrected [72460.638294] Bluetooth: hci0: advertising data len corrected [72460.946350] Bluetooth: hci0: advertising data len corrected [72461.225320] Bluetooth: hci0: advertising data len corrected [72461.690322] Bluetooth: hci0: advertising data len corrected [72462.118318] Bluetooth: hci0: advertising data len corrected [72462.427319] Bluetooth: hci0: advertising data len corrected [72464.546319] bt_err_ratelimited: 7 callbacks suppressed [72464.546319] Bluetooth: hci0: advertising data len corrected [72464.857318] Bluetooth: hci0: advertising data len corrected [72465.163332] Bluetooth: hci0: advertising data len corrected [72465.278331] Bluetooth: hci0: advertising data len corrected [72465.432323] Bluetooth: hci0: advertising data len corrected [72465.891334] Bluetooth: hci0: advertising data len corrected [72466.045334] Bluetooth: hci0: advertising data len corrected [72466.197321] Bluetooth: hci0: advertising data len corrected [72466.340318] Bluetooth: hci0: advertising data len corrected [72466.498335] Bluetooth: hci0: advertising data len corrected [72469.803299] bt_err_ratelimited: 10 callbacks suppressed Signed-off-by: Szymon Janc Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=203753 Cc: stable@vger.kernel.org Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d73950441938..04b2509b7cb5 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5450,7 +5450,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct hci_conn *conn; bool match; u32 flags; - u8 *ptr, real_len; + u8 *ptr; switch (type) { case LE_ADV_IND: @@ -5481,14 +5481,10 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, break; } - real_len = ptr - data; - - /* Adjust for actual length */ - if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", - len, real_len); - len = real_len; - } + /* Adjust for actual length. This handles the case when remote + * device is advertising with incorrect data length. + */ + len = ptr - data; /* If the direct address is present, then this report is from * a LE Direct Advertising Report event. In that case it is -- cgit v1.2.3-58-ga151 From 1fa20d7d4aad02206e84b74915819fbe9f81dab3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 19 May 2021 13:41:50 -0700 Subject: Bluetooth: L2CAP: Fix invalid access if ECRED Reconfigure fails The use of l2cap_chan_del is not safe under a loop using list_for_each_entry. Reported-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 7d975cf98c20..f3b70fa348ab 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6248,7 +6248,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) { - struct l2cap_chan *chan; + struct l2cap_chan *chan, *tmp; struct l2cap_ecred_conn_rsp *rsp = (void *) data; u16 result; @@ -6262,7 +6262,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, if (!result) return 0; - list_for_each_entry(chan, &conn->chan_l, list) { + list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { if (chan->ident != cmd->ident) continue; -- cgit v1.2.3-58-ga151 From de895b43932cb47e69480540be7eca289af24f23 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 19 May 2021 13:41:51 -0700 Subject: Bluetooth: L2CAP: Fix invalid access on ECRED Connection response The use of l2cap_chan_del is not safe under a loop using list_for_each_entry. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index f3b70fa348ab..9ebb85df4db4 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6066,7 +6066,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, struct l2cap_ecred_conn_rsp *rsp = (void *) data; struct hci_conn *hcon = conn->hcon; u16 mtu, mps, credits, result; - struct l2cap_chan *chan; + struct l2cap_chan *chan, *tmp; int err = 0, sec_level; int i = 0; @@ -6085,7 +6085,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, cmd_len -= sizeof(*rsp); - list_for_each_entry(chan, &conn->chan_l, list) { + list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { u16 dcid; if (chan->ident != cmd->ident || -- cgit v1.2.3-58-ga151 From c615943ef0525fdaea631ca42ded446e11389062 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 20 May 2021 21:32:35 +0800 Subject: Bluetooth: RFCOMM: Use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO helper instead of plain DEVICE_ATTR, which makes the code a bit shorter and easier to read. Signed-off-by: YueHaibing Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/tty.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index a58584949a95..8cb53e10a985 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -198,20 +198,22 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev) hci_dev_put(hdev); } -static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) +static ssize_t address_show(struct device *tty_dev, + struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%pMR\n", &dev->dst); } -static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) +static ssize_t channel_show(struct device *tty_dev, + struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%d\n", dev->channel); } -static DEVICE_ATTR(address, 0444, show_address, NULL); -static DEVICE_ATTR(channel, 0444, show_channel, NULL); +static DEVICE_ATTR_RO(address); +static DEVICE_ATTR_RO(channel); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) -- cgit v1.2.3-58-ga151 From 79699a7056ff784524d1baa387f30ddf98e14a1c Mon Sep 17 00:00:00 2001 From: Sathish Narasimman Date: Thu, 20 May 2021 17:12:01 +0530 Subject: Bluetooth: Translate additional address type during le_conn_comp When using controller based address resolution, then the destination address type during le_conn_complete uses 0x02 & 0x03 if controller resolves the destination address(RPA). These address types need to be converted back into either 0x00 0r 0x01 Signed-off-by: Sathish Narasimman Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 04b2509b7cb5..7c9482449228 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5196,6 +5196,23 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = irk->addr_type; } + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. These types need to be + * converted back into either public address or random address type + */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { + switch (conn->dst_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + conn->dst_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + conn->dst_type = ADDR_LE_DEV_RANDOM; + break; + } + } + if (status) { hci_le_conn_failed(conn, status); goto unlock; -- cgit v1.2.3-58-ga151 From c32d624640fd2254ec40e76e4a176e75de77ee09 Mon Sep 17 00:00:00 2001 From: Yun-Hao Chung Date: Thu, 20 May 2021 13:12:09 +0800 Subject: Bluetooth: disable filter dup when scan for adv monitor Disable duplicates filter when scanning for advertisement monitor for the following reasons. The scanning includes active scan and passive scan. For HW pattern filtering (ex. MSFT), Realtek and Qualcomm controllers ignore RSSI_Sampling_Period when the duplicates filter is enabled. For SW pattern filtering, when we're not doing interleaved scanning, it is necessary to disable duplicates filter, otherwise hosts can only receive one advertisement and it's impossible to know if a peer is still in range. Signed-off-by: Yun-Hao Chung Reviewed-by: Archie Pusaka Reviewed-by: Manish Mandlik Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 46 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index fa9125b782f8..3465862429fb 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -932,7 +932,7 @@ static bool scan_use_rpa(struct hci_dev *hdev) static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy, - bool addr_resolv) + bool filter_dup, bool addr_resolv) { struct hci_dev *hdev = req->hdev; @@ -997,7 +997,7 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); ext_enable_cp.enable = LE_SCAN_ENABLE; - ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + ext_enable_cp.filter_dup = filter_dup; hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(ext_enable_cp), &ext_enable_cp); @@ -1016,7 +1016,7 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(&enable_cp, 0, sizeof(enable_cp)); enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + enable_cp.filter_dup = filter_dup; hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), &enable_cp); } @@ -1053,6 +1053,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) u8 own_addr_type; u8 filter_policy; u16 window, interval; + /* Default is to enable duplicates filter */ + u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; /* Background scanning should run with address resolution */ bool addr_resolv = true; @@ -1106,6 +1108,20 @@ void hci_req_add_le_passive_scan(struct hci_request *req) } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; + + /* Disable duplicates filter when scanning for advertisement + * monitor for the following reasons. + * + * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm + * controllers ignore RSSI_Sampling_Period when the duplicates + * filter is enabled. + * + * For SW pattern filtering, when we're not doing interleaved + * scanning, it is necessary to disable duplicates filter, + * otherwise hosts can only receive one advertisement and it's + * impossible to know if a peer is still in range. + */ + filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; @@ -1113,7 +1129,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, - own_addr_type, filter_policy, addr_resolv); + own_addr_type, filter_policy, filter_dup, + addr_resolv); } static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) @@ -3135,6 +3152,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) u8 own_addr_type; /* White list is not used for discovery */ u8 filter_policy = 0x00; + /* Default is to enable duplicates filter */ + u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; /* Discovery doesn't require controller address resolution */ bool addr_resolv = false; int err; @@ -3159,9 +3178,26 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; + if (hci_is_adv_monitoring(hdev)) { + /* Duplicate filter should be disabled when some advertisement + * monitor is activated, otherwise AdvMon can only receive one + * advertisement for one peer(*) during active scanning, and + * might report loss to these peers. + * + * Note that different controllers have different meanings of + * |duplicate|. Some of them consider packets with the same + * address as duplicate, and others consider packets with the + * same address and the same RSSI as duplicate. Although in the + * latter case we don't need to disable duplicate filter, but + * it is common to have active scanning for a short period of + * time, the power impact should be neglectable. + */ + filter_dup = LE_SCAN_FILTER_DUP_DISABLE; + } + hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, - filter_policy, addr_resolv); + filter_policy, filter_dup, addr_resolv); return 0; } -- cgit v1.2.3-58-ga151 From 02ce2c2c24024aade65a8d91d6a596651eaf2d0a Mon Sep 17 00:00:00 2001 From: Tedd Ho-Jeong An Date: Wed, 26 May 2021 10:36:22 -0700 Subject: Bluetooth: mgmt: Fix the command returns garbage parameter value When the Get Device Flags command fails, it returns the error status with the parameters filled with the garbage values. Although the parameters are not used, it is better to fill with zero than the random values. Signed-off-by: Tedd Ho-Jeong An Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a81cf1b8b2e8..5616e8afb22e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4061,6 +4061,8 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + memset(&rp, 0, sizeof(rp)); + if (cp->addr.type == BDADDR_BREDR) { br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, &cp->addr.bdaddr, -- cgit v1.2.3-58-ga151 From 799acb9347915bfe4eac0ff2345b468f0a1ca207 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 28 May 2021 11:45:02 -0700 Subject: Bluetooth: mgmt: Fix slab-out-of-bounds in tlv_data_is_valid This fixes parsing of LTV entries when the length is 0. Found with: tools/mgmt-tester -s "Add Advertising - Success (ScRsp only)" Add Advertising - Success (ScRsp only) - run Sending Add Advertising (0x003e) Test condition added, total 1 [ 11.004577] ================================================================== [ 11.005292] BUG: KASAN: slab-out-of-bounds in tlv_data_is_valid+0x87/0xe0 [ 11.005984] Read of size 1 at addr ffff888002c695b0 by task mgmt-tester/87 [ 11.006711] [ 11.007176] [ 11.007429] Allocated by task 87: [ 11.008151] [ 11.008438] The buggy address belongs to the object at ffff888002c69580 [ 11.008438] which belongs to the cache kmalloc-64 of size 64 [ 11.010526] The buggy address is located 48 bytes inside of [ 11.010526] 64-byte region [ffff888002c69580, ffff888002c695c0) [ 11.012423] The buggy address belongs to the page: [ 11.013291] [ 11.013544] Memory state around the buggy address: [ 11.014359] ffff888002c69480: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 11.015453] ffff888002c69500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 11.016232] >ffff888002c69580: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc [ 11.017010] ^ [ 11.017547] ffff888002c69600: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc [ 11.018296] ffff888002c69680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 11.019116] ================================================================== Fixes: 2bb36870e8cb2 ("Bluetooth: Unify advertising instance flags check") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 5616e8afb22e..896f0c482912 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7590,6 +7590,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; + if (!cur_len) + continue; + if (data[i + 1] == EIR_FLAGS && (!is_adv_data || flags_managed(adv_flags))) return false; -- cgit v1.2.3-58-ga151 From 6397729bb74df3918187c5e96fb0f63c5f5292d9 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Mon, 31 May 2021 16:37:22 +0800 Subject: Bluetooth: use inclusive language to describe CPB This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced when describing the connectionless peripheral broadcast feature: master -> central slave -> peripheral Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 26 +++++++++++++------------- include/net/bluetooth/hci_core.h | 4 ++-- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 16 ++++++++-------- 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4eea590cd432..ece96ccc42de 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -36,7 +36,7 @@ #define HCI_MAX_AMP_ASSOC_SIZE 672 -#define HCI_MAX_CSB_DATA_SIZE 252 +#define HCI_MAX_CPB_DATA_SIZE 252 /* HCI dev events */ #define HCI_DEV_REG 1 @@ -472,10 +472,10 @@ enum { #define LMP_EXTFEATURES 0x80 /* Extended LMP features */ -#define LMP_CSB_MASTER 0x01 -#define LMP_CSB_SLAVE 0x02 -#define LMP_SYNC_TRAIN 0x04 -#define LMP_SYNC_SCAN 0x08 +#define LMP_CPB_CENTRAL 0x01 +#define LMP_CPB_PERIPHERAL 0x02 +#define LMP_SYNC_TRAIN 0x04 +#define LMP_SYNC_SCAN 0x08 #define LMP_SC 0x01 #define LMP_PING 0x02 @@ -877,17 +877,17 @@ struct hci_rp_logical_link_cancel { __u8 flow_spec_id; } __packed; -#define HCI_OP_SET_CSB 0x0441 -struct hci_cp_set_csb { +#define HCI_OP_SET_CPB 0x0441 +struct hci_cp_set_cpb { __u8 enable; __u8 lt_addr; __u8 lpo_allowed; __le16 packet_type; __le16 interval_min; __le16 interval_max; - __le16 csb_sv_tout; + __le16 cpb_sv_tout; } __packed; -struct hci_rp_set_csb { +struct hci_rp_set_cpb { __u8 status; __u8 lt_addr; __le16 interval; @@ -1184,14 +1184,14 @@ struct hci_rp_delete_reserved_lt_addr { __u8 lt_addr; } __packed; -#define HCI_OP_SET_CSB_DATA 0x0c76 -struct hci_cp_set_csb_data { +#define HCI_OP_SET_CPB_DATA 0x0c76 +struct hci_cp_set_cpb_data { __u8 lt_addr; __u8 fragment; __u8 data_length; - __u8 data[HCI_MAX_CSB_DATA_SIZE]; + __u8 data[HCI_MAX_CPB_DATA_SIZE]; } __packed; -struct hci_rp_set_csb_data { +struct hci_rp_set_cpb_data { __u8 status; __u8 lt_addr; } __packed; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 43b08bebae74..c9ec06997e1c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1394,8 +1394,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT) /* ----- Extended LMP capabilities ----- */ -#define lmp_csb_master_capable(dev) ((dev)->features[2][0] & LMP_CSB_MASTER) -#define lmp_csb_slave_capable(dev) ((dev)->features[2][0] & LMP_CSB_SLAVE) +#define lmp_cpb_central_capable(dev) ((dev)->features[2][0] & LMP_CPB_CENTRAL) +#define lmp_cpb_peripheral_capable(dev) ((dev)->features[2][0] & LMP_CPB_PERIPHERAL) #define lmp_sync_train_capable(dev) ((dev)->features[2][0] & LMP_SYNC_TRAIN) #define lmp_sync_scan_capable(dev) ((dev)->features[2][0] & LMP_SYNC_SCAN) #define lmp_sc_capable(dev) ((dev)->features[2][1] & LMP_SC) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 0ceb72d32208..ea0f9cdaa6b1 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1842,7 +1842,7 @@ u32 hci_conn_get_phy(struct hci_conn *conn) /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: * Table 6.2: Packets defined for synchronous, asynchronous, and - * CSB logical transport types. + * CPB logical transport types. */ switch (conn->type) { case SCO_LINK: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index cdf147899c50..5735171e2e23 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -545,24 +545,24 @@ static void hci_set_event_mask_page_2(struct hci_request *req) u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; bool changed = false; - /* If Connectionless Slave Broadcast master role is supported + /* If Connectionless Peripheral Broadcast central role is supported * enable all necessary events for it. */ - if (lmp_csb_master_capable(hdev)) { + if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ - events[2] |= 0x10; /* Slave Page Response Timeout */ - events[2] |= 0x20; /* CSB Channel Map Change */ + events[2] |= 0x10; /* Peripheral Page Response Timeout */ + events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } - /* If Connectionless Slave Broadcast slave role is supported + /* If Connectionless Peripheral Broadcast peripheral role is supported * enable all necessary events for it. */ - if (lmp_csb_slave_capable(hdev)) { + if (lmp_cpb_peripheral_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ - events[2] |= 0x02; /* CSB Receive */ - events[2] |= 0x04; /* CSB Timeout */ + events[2] |= 0x02; /* CPB Receive */ + events[2] |= 0x04; /* CPB Timeout */ events[2] |= 0x08; /* Truncated Page Complete */ changed = true; } -- cgit v1.2.3-58-ga151 From ef365da1803de7891589c75304c8c36bb7cf4b98 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Mon, 31 May 2021 16:37:23 +0800 Subject: Bluetooth: use inclusive language in HCI LE features This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: master -> central slave -> peripheral Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 +++--- net/bluetooth/hci_event.c | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ece96ccc42de..3abd6273a189 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -489,7 +489,7 @@ enum { /* LE features */ #define HCI_LE_ENCRYPTION 0x01 #define HCI_LE_CONN_PARAM_REQ_PROC 0x02 -#define HCI_LE_SLAVE_FEATURES 0x08 +#define HCI_LE_PERIPHERAL_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_LL_PRIVACY 0x40 @@ -498,8 +498,8 @@ enum { #define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_CHAN_SEL_ALG2 0x40 -#define HCI_LE_CIS_MASTER 0x10 -#define HCI_LE_CIS_SLAVE 0x20 +#define HCI_LE_CIS_CENTRAL 0x10 +#define HCI_LE_CIS_PERIPHERAL 0x20 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7c9482449228..e289187075b9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5243,17 +5243,17 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - /* The remote features procedure is defined for master + /* The remote features procedure is defined for central * role only. So only in case of an initiated connection * request the remote features. * - * If the local controller supports slave-initiated features - * exchange, then requesting the remote features in slave + * If the local controller supports peripheral-initiated features + * exchange, then requesting the remote features in peripheral * role is possible. Otherwise just transition into the * connected state without requesting the remote features. */ if (conn->out || - (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { struct hci_cp_le_read_remote_features cp; cp.handle = __cpu_to_le16(conn->handle); @@ -5774,7 +5774,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, if (conn->state == BT_CONFIG) { __u8 status; - /* If the local controller supports slave-initiated + /* If the local controller supports peripheral-initiated * features exchange, but the remote controller does * not, then it is possible that the error code 0x1a * for unsupported remote feature gets returned. @@ -5783,8 +5783,8 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, * transition into connected state and mark it as * successful. */ - if ((hdev->le_features[0] & HCI_LE_SLAVE_FEATURES) && - !conn->out && ev->status == 0x1a) + if (!conn->out && ev->status == 0x1a && + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) status = 0x00; else status = ev->status; -- cgit v1.2.3-58-ga151 From fad646e16d3cafd67d3cfff8e66f77401190957e Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Mon, 31 May 2021 16:37:25 +0800 Subject: Bluetooth: use inclusive language in SMP This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: master -> initiator slave -> responder Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 10 +++---- net/bluetooth/smp.c | 66 +++++++++++++++++++++++--------------------- net/bluetooth/smp.h | 6 ++-- 4 files changed, 43 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index a03c62b1dc2f..23a0524061b7 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -202,7 +202,7 @@ struct mgmt_cp_load_link_keys { struct mgmt_ltk_info { struct mgmt_addr_info addr; __u8 type; - __u8 master; + __u8 initiator; __u8 enc_size; __le16 ediv; __le64 rand; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 896f0c482912..fac972415d68 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6169,7 +6169,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, static bool ltk_is_valid(struct mgmt_ltk_info *key) { - if (key->master != 0x00 && key->master != 0x01) + if (key->initiator != 0x00 && key->initiator != 0x01) return false; switch (key->addr.type) { @@ -6247,11 +6247,11 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; - type = key->master ? SMP_LTK : SMP_LTK_SLAVE; + type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_AUTHENTICATED: authenticated = 0x01; - type = key->master ? SMP_LTK : SMP_LTK_SLAVE; + type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_P256_UNAUTH: authenticated = 0x00; @@ -8649,7 +8649,7 @@ static u8 mgmt_ltk_type(struct smp_ltk *ltk) { switch (ltk->type) { case SMP_LTK: - case SMP_LTK_SLAVE: + case SMP_LTK_RESPONDER: if (ltk->authenticated) return MGMT_LTK_AUTHENTICATED; return MGMT_LTK_UNAUTHENTICATED; @@ -8695,7 +8695,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) ev.key.rand = key->rand; if (key->type == SMP_LTK) - ev.key.master = 1; + ev.key.initiator = 1; /* Make sure we copy only the significant bytes based on the * encryption key size, and set the rest of the value to zeroes. diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4d93c6c32a71..6197f8ae53ab 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -111,9 +111,9 @@ struct smp_chan { u8 id_addr_type; u8 irk[16]; struct smp_csrk *csrk; - struct smp_csrk *slave_csrk; + struct smp_csrk *responder_csrk; struct smp_ltk *ltk; - struct smp_ltk *slave_ltk; + struct smp_ltk *responder_ltk; struct smp_irk *remote_irk; u8 *link_key; unsigned long flags; @@ -753,7 +753,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) mgmt_smp_complete(hcon, complete); kfree_sensitive(smp->csrk); - kfree_sensitive(smp->slave_csrk); + kfree_sensitive(smp->responder_csrk); kfree_sensitive(smp->link_key); crypto_free_shash(smp->tfm_cmac); @@ -776,9 +776,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kfree_rcu(smp->ltk, rcu); } - if (smp->slave_ltk) { - list_del_rcu(&smp->slave_ltk->list); - kfree_rcu(smp->slave_ltk, rcu); + if (smp->responder_ltk) { + list_del_rcu(&smp->responder_ltk->list); + kfree_rcu(smp->responder_ltk, rcu); } if (smp->remote_irk) { @@ -979,7 +979,7 @@ static u8 smp_random(struct smp_chan *smp) int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, - conn->hcon->out ? "master" : "slave"); + conn->hcon->out ? "initiator" : "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, @@ -1021,8 +1021,8 @@ static u8 smp_random(struct smp_chan *smp) else auth = 0; - /* Even though there's no _SLAVE suffix this is the - * slave STK we're adding for later lookup (the master + /* Even though there's no _RESPONDER suffix this is the + * responder STK we're adding for later lookup (the initiator * STK never needs to be stored). */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, @@ -1077,10 +1077,10 @@ static void smp_notify_keys(struct l2cap_conn *conn) mgmt_new_csrk(hdev, smp->csrk, persistent); } - if (smp->slave_csrk) { - smp->slave_csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->slave_csrk, persistent); + if (smp->responder_csrk) { + smp->responder_csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { @@ -1089,10 +1089,10 @@ static void smp_notify_keys(struct l2cap_conn *conn) mgmt_new_ltk(hdev, smp->ltk, persistent); } - if (smp->slave_ltk) { - smp->slave_ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->slave_ltk, persistent); + if (smp->responder_ltk) { + smp->responder_ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->responder_ltk, persistent); } if (smp->link_key) { @@ -1272,7 +1272,7 @@ static void smp_distribute_keys(struct smp_chan *smp) if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; - struct smp_cmd_master_ident ident; + struct smp_cmd_initiator_ident ident; struct smp_ltk *ltk; u8 authenticated; __le16 ediv; @@ -1293,14 +1293,15 @@ static void smp_distribute_keys(struct smp_chan *smp) authenticated = hcon->sec_level == BT_SECURITY_HIGH; ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, - SMP_LTK_SLAVE, authenticated, enc.ltk, + SMP_LTK_RESPONDER, authenticated, enc.ltk, smp->enc_key_size, ediv, rand); - smp->slave_ltk = ltk; + smp->responder_ltk = ltk; ident.ediv = ediv; ident.rand = rand; - smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); + smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident), + &ident); *keydist &= ~SMP_DIST_ENC_KEY; } @@ -1343,7 +1344,7 @@ static void smp_distribute_keys(struct smp_chan *smp) csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED; memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } - smp->slave_csrk = csrk; + smp->responder_csrk = csrk; smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); @@ -2048,7 +2049,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) struct smp_cmd_pairing *req, *rsp; u8 auth; - /* The issue is only observed when we're in slave role */ + /* The issue is only observed when we're in responder role */ if (hcon->out) return SMP_UNSPECIFIED; @@ -2084,7 +2085,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; - bt_dev_dbg(hdev, "conn %p %s", conn, hcon->out ? "master" : "slave"); + bt_dev_dbg(hdev, "conn %p %s", conn, + hcon->out ? "initiator" : "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; @@ -2251,7 +2253,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; - /* We never store STKs for master role, so clear this flag */ + /* We never store STKs for initiator role, so clear this flag */ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); return true; @@ -2467,7 +2469,7 @@ int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, /* Set keys to NULL to make sure smp_failure() does not try to * remove and free already invalidated rcu list entries. */ smp->ltk = NULL; - smp->slave_ltk = NULL; + smp->responder_ltk = NULL; smp->remote_irk = NULL; if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) @@ -2503,7 +2505,7 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) return SMP_INVALID_PARAMS; } - SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); + SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT); skb_pull(skb, sizeof(*rp)); @@ -2512,9 +2514,9 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) +static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_cmd_master_ident *rp = (void *) skb->data; + struct smp_cmd_initiator_ident *rp = (void *)skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; @@ -2913,7 +2915,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } - /* Slave sends DHKey check as response to master */ + /* Responder sends DHKey check as response to initiator */ sc_dhkey_check(smp); } @@ -3000,8 +3002,8 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) reason = smp_cmd_encrypt_info(conn, skb); break; - case SMP_CMD_MASTER_IDENT: - reason = smp_cmd_master_ident(conn, skb); + case SMP_CMD_INITIATOR_IDENT: + reason = smp_cmd_initiator_ident(conn, skb); break; case SMP_CMD_IDENT_INFO: diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index fc35a8bf358e..87a59ec2c9f0 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -79,8 +79,8 @@ struct smp_cmd_encrypt_info { __u8 ltk[16]; } __packed; -#define SMP_CMD_MASTER_IDENT 0x07 -struct smp_cmd_master_ident { +#define SMP_CMD_INITIATOR_IDENT 0x07 +struct smp_cmd_initiator_ident { __le16 ediv; __le64 rand; } __packed; @@ -146,7 +146,7 @@ struct smp_cmd_keypress_notify { enum { SMP_STK, SMP_LTK, - SMP_LTK_SLAVE, + SMP_LTK_RESPONDER, SMP_LTK_P256, SMP_LTK_P256_DEBUG, }; -- cgit v1.2.3-58-ga151 From 67ffb1857a182d90c0e7db16752b556d6cf3944f Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Mon, 31 May 2021 16:37:28 +0800 Subject: Bluetooth: use inclusive language in comments This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: slave -> peripheral blacklisted -> blocked Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 6 +++--- net/bluetooth/hidp/core.c | 2 +- net/bluetooth/mgmt.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e289187075b9..43c324c46c0b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5404,7 +5404,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_DIRECT: /* Only devices advertising with ADV_DIRECT_IND are * triggering a connection attempt. This is allowing - * incoming connections from slave devices. + * incoming connections from peripheral devices. */ if (adv_type != LE_ADV_DIRECT_IND) return NULL; @@ -5412,8 +5412,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connections from slave device are - * accepted and also outgoing connections to slave + * that incoming connections from peripheral device are + * accepted and also outgoing connections to peripheral * devices are established when found. */ break; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 0db48c812662..96fedef14723 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -794,7 +794,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; - /* True if device is blacklisted in drivers/hid/hid-quirks.c */ + /* True if device is blocked in drivers/hid/hid-quirks.c */ if (hid_ignore(hid)) { hid_destroy_device(session->hid); session->hid = NULL; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index fac972415d68..80ec35c67ea5 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2959,7 +2959,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* When pairing a new device, it is expected to remember * this device for future connections. Adding the connection * parameter information ahead of time allows tracking - * of the slave preferred values and will speed up any + * of the peripheral preferred values and will speed up any * further connection establishment. * * If connection parameters already exist, then they -- cgit v1.2.3-58-ga151 From 8c8ca05d3291d5e77eccf8f87106506a90aa82a2 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:40:54 +0800 Subject: Bluetooth: bnep: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/bnep/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 43c284158f63..72f47b372705 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -126,8 +126,8 @@ static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len f[i].start = get_unaligned_be16(data++); f[i].end = get_unaligned_be16(data++); - BT_DBG("proto filter start %d end %d", - f[i].start, f[i].end); + BT_DBG("proto filter start %u end %u", + f[i].start, f[i].end); } if (i < BNEP_MAX_PROTO_FILTERS) @@ -266,7 +266,7 @@ static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) break; } - BT_DBG("type 0x%x len %d", h->type, h->len); + BT_DBG("type 0x%x len %u", h->type, h->len); switch (h->type & BNEP_TYPE_MASK) { case BNEP_EXT_CONTROL: @@ -424,7 +424,7 @@ static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) int len = 0, il = 0; u8 type = 0; - BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type); + BT_DBG("skb %p dev %p type %u", skb, skb->dev, skb->pkt_type); if (!skb->dev) { /* Control frame sent by us */ -- cgit v1.2.3-58-ga151 From b442a8533b02b44bafa81b67a3571b2b106fcc88 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:40:55 +0800 Subject: Bluetooth: cmtp: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/cmtp/capi.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index eb41556002e3..f3bedc3b613a 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -74,7 +74,7 @@ static struct cmtp_application *cmtp_application_add(struct cmtp_session *sessio { struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL); - BT_DBG("session %p application %p appl %d", session, app, appl); + BT_DBG("session %p application %p appl %u", session, app, appl); if (!app) return NULL; @@ -135,7 +135,7 @@ static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb) { struct cmtp_scb *scb = (void *) skb->cb; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); scb->id = -1; scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3); @@ -152,7 +152,7 @@ static void cmtp_send_interopmsg(struct cmtp_session *session, struct sk_buff *skb; unsigned char *s; - BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum); + BT_DBG("session %p subcmd 0x%02x appl %u msgnum %u", session, subcmd, appl, msgnum); skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC); if (!skb) { @@ -188,7 +188,7 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s __u16 appl, msgnum, func, info; __u32 controller; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); switch (CAPIMSG_SUBCOMMAND(skb->data)) { case CAPI_CONF: @@ -321,7 +321,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) __u16 appl; __u32 contr; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); if (skb->len < CAPI_MSG_BASELEN) return; @@ -344,7 +344,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) appl = application->appl; CAPIMSG_SETAPPID(skb->data, appl); } else { - BT_ERR("Can't find application with id %d", appl); + BT_ERR("Can't find application with id %u", appl); kfree_skb(skb); return; } @@ -385,8 +385,8 @@ static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_ unsigned char buf[8]; int err = 0, nconn, want = rp->level3cnt; - BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d", - ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); + BT_DBG("ctrl %p appl %u level3cnt %u datablkcnt %u datablklen %u", + ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); application = cmtp_application_add(session, appl); if (!application) { @@ -450,7 +450,7 @@ static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl) struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *application; - BT_DBG("ctrl %p appl %d", ctrl, appl); + BT_DBG("ctrl %p appl %u", ctrl, appl); application = cmtp_application_get(session, CMTP_APPLID, appl); if (!application) { @@ -483,7 +483,7 @@ static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) application = cmtp_application_get(session, CMTP_APPLID, appl); if ((!application) || (application->state != BT_CONNECTED)) { - BT_ERR("Can't find application with id %d", appl); + BT_ERR("Can't find application with id %u", appl); return CAPI_ILLAPPNR; } @@ -515,7 +515,7 @@ static int cmtp_proc_show(struct seq_file *m, void *v) seq_printf(m, "ctrl %d\n", session->num); list_for_each_entry(app, &session->applications, list) { - seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); + seq_printf(m, "appl %u -> %u\n", app->appl, app->mapping); } return 0; -- cgit v1.2.3-58-ga151 From 093dabb4f1aff982f7ef1cebf4e24be3fe47bcdb Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:40:56 +0800 Subject: Bluetooth: hidp: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/hidp/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 96fedef14723..80848dfc01db 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -508,7 +508,7 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) { int done_with_skb = 1; - BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); + BT_DBG("session %p skb %p len %u param 0x%02x", session, skb, skb->len, param); switch (param) { case HIDP_DATA_RTYPE_INPUT: @@ -553,7 +553,7 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session, unsigned char hdr, type, param; int free_skb = 1; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); @@ -589,7 +589,7 @@ static void hidp_recv_intr_frame(struct hidp_session *session, { unsigned char hdr; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); -- cgit v1.2.3-58-ga151 From 658d5d8080b5ec6184402d3cf37c2070e4d9b6db Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:40:58 +0800 Subject: Bluetooth: 6lowpan: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 29c96bc5733f..fd164a248569 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -167,7 +167,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { - BT_DBG("dst addr %pMR dst type %d ip %pI6c", + BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); @@ -478,7 +478,7 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) local_skb = skb_clone(skb, GFP_ATOMIC); - BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); @@ -521,7 +521,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) if (err) { if (lowpan_cb(skb)->chan) { - BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); @@ -790,7 +790,7 @@ static void chan_close_cb(struct l2cap_chan *chan) BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); - BT_DBG("chan %p orig refcnt %d", chan, + BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); @@ -904,7 +904,7 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; - BT_DBG("conn %p dst type %d", conn, dst_type); + BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) @@ -936,7 +936,7 @@ static struct l2cap_chan *bt_6lowpan_listen(void) atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); - BT_DBG("chan %p src type %d", chan, chan->src_type); + BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { @@ -977,7 +977,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, *conn = (struct l2cap_conn *)hcon->l2cap_data; - BT_DBG("conn %p dst %pMR type %d", *conn, &hcon->dst, hcon->dst_type); + BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } @@ -1119,7 +1119,7 @@ static ssize_t lowpan_control_write(struct file *fp, return -EALREADY; } - BT_DBG("conn %p dst %pMR type %d user %d", conn, + BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } -- cgit v1.2.3-58-ga151 From fad48d848cf64d4673474c9ebcb9f6fbf66aa3b8 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:40:59 +0800 Subject: Bluetooth: a2mp: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/a2mp.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 463bad58478b..1fcc482397c3 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -120,7 +120,7 @@ static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*rej)) return -EINVAL; - BT_DBG("ident %d reason %d", hdr->ident, le16_to_cpu(rej->reason)); + BT_DBG("ident %u reason %d", hdr->ident, le16_to_cpu(rej->reason)); skb_pull(skb, sizeof(*rej)); @@ -219,7 +219,7 @@ static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb, cl = (void *) skb->data; while (len >= sizeof(*cl)) { - BT_DBG("Remote AMP id %d type %d status %d", cl->id, cl->type, + BT_DBG("Remote AMP id %u type %u status %u", cl->id, cl->type, cl->status); if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) { @@ -273,7 +273,7 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cl *cl = (void *) skb->data; while (skb->len >= sizeof(*cl)) { - BT_DBG("Controller id %d type %d status %d", cl->id, cl->type, + BT_DBG("Controller id %u type %u status %u", cl->id, cl->type, cl->status); cl = skb_pull(skb, sizeof(*cl)); } @@ -302,7 +302,7 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("id %d", req->id); + BT_DBG("id %u", req->id); hdev = hci_dev_get(req->id); if (!hdev || hdev->dev_type != HCI_AMP) { @@ -344,7 +344,7 @@ static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*rsp)) return -EINVAL; - BT_DBG("id %d status 0x%2.2x", rsp->id, rsp->status); + BT_DBG("id %u status 0x%2.2x", rsp->id, rsp->status); if (rsp->status) return -EINVAL; @@ -373,7 +373,7 @@ static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("id %d", req->id); + BT_DBG("id %u", req->id); /* Make sure that other request is not processed */ tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC); @@ -423,7 +423,7 @@ static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, assoc_len = len - sizeof(*rsp); - BT_DBG("id %d status 0x%2.2x assoc len %zu", rsp->id, rsp->status, + BT_DBG("id %u status 0x%2.2x assoc len %zu", rsp->id, rsp->status, assoc_len); if (rsp->status) @@ -457,7 +457,7 @@ static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (!hcon) goto done; - BT_DBG("Created hcon %p: loc:%d -> rem:%d", hcon, hdev->id, rsp->id); + BT_DBG("Created hcon %p: loc:%u -> rem:%u", hcon, hdev->id, rsp->id); mgr->bredr_chan->remote_amp_id = rsp->id; @@ -481,7 +481,7 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id); + BT_DBG("local_id %u, remote_id %u", req->local_id, req->remote_id); memset(&rsp, 0, sizeof(rsp)); @@ -562,7 +562,7 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id); + BT_DBG("local_id %u remote_id %u", req->local_id, req->remote_id); memset(&rsp, 0, sizeof(rsp)); @@ -599,7 +599,7 @@ send_rsp: static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cmd *hdr) { - BT_DBG("ident %d code 0x%2.2x", hdr->ident, hdr->code); + BT_DBG("ident %u code 0x%2.2x", hdr->ident, hdr->code); skb_pull(skb, le16_to_cpu(hdr->len)); return 0; @@ -620,7 +620,7 @@ static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) hdr = (void *) skb->data; len = le16_to_cpu(hdr->len); - BT_DBG("code 0x%2.2x id %d len %u", hdr->code, hdr->ident, len); + BT_DBG("code 0x%2.2x id %u len %u", hdr->code, hdr->ident, len); skb_pull(skb, sizeof(*hdr)); -- cgit v1.2.3-58-ga151 From 610850bebc5baaf92d113247387b9fcab187259f Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:41:00 +0800 Subject: Bluetooth: amp: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/amp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index be2d469d6369..2134f92bd7ac 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -78,7 +78,7 @@ struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id) { struct amp_ctrl *ctrl; - BT_DBG("mgr %p id %d", mgr, id); + BT_DBG("mgr %p id %u", mgr, id); mutex_lock(&mgr->amp_ctrls_lock); list_for_each_entry(ctrl, &mgr->amp_ctrls, list) { @@ -179,7 +179,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Legacy key */ if (conn->key_type < 3) { - bt_dev_err(hdev, "legacy key type %d", conn->key_type); + bt_dev_err(hdev, "legacy key type %u", conn->key_type); return -EACCES; } @@ -257,7 +257,7 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) struct hci_request req; int err; - BT_DBG("%s handle %d", hdev->name, phy_handle); + BT_DBG("%s handle %u", hdev->name, phy_handle); cp.phy_handle = phy_handle; cp.max_len = cpu_to_le16(hdev->amp_assoc_size); -- cgit v1.2.3-58-ga151 From 85d6728421c9b2797dea3a20f213dd44d9f8d7cd Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:41:02 +0800 Subject: Bluetooth: mgmt: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 16 ++++++++-------- net/bluetooth/mgmt_config.c | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 80ec35c67ea5..22f9f52c5ae6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4279,7 +4279,7 @@ int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) done: hci_dev_unlock(hdev); - bt_dev_dbg(hdev, "add monitor %d complete, status %d", + bt_dev_dbg(hdev, "add monitor %d complete, status %u", rp.monitor_handle, status); return err; @@ -4504,7 +4504,7 @@ int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) done: hci_dev_unlock(hdev); - bt_dev_dbg(hdev, "remove monitor %d complete, status %d", + bt_dev_dbg(hdev, "remove monitor %d complete, status %u", rp.monitor_handle, status); return err; @@ -4834,7 +4834,7 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5090,7 +5090,7 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5303,7 +5303,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, static void enable_advertising_instance(struct hci_dev *hdev, u8 status, u16 opcode) { - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); } static void set_advertising_complete(struct hci_dev *hdev, u8 status, @@ -6347,7 +6347,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, handle = __le16_to_cpu(cp->handle); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) { - bt_dev_err(hdev, "unknown handle (%d) in conn_info response", + bt_dev_err(hdev, "unknown handle (%u) in conn_info response", handle); goto unlock; } @@ -7654,7 +7654,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, struct adv_info *adv_instance, *n; u8 instance; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -8184,7 +8184,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 1deb0ca7a929..6ef701c27da4 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -146,7 +146,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, const u16 type = le16_to_cpu(TO_TLV(buffer)->type); if (buffer_left < exp_len) { - bt_dev_warn(hdev, "invalid len left %d, exp >= %d", + bt_dev_warn(hdev, "invalid len left %u, exp >= %u", buffer_left, exp_len); return mgmt_cmd_status(sk, hdev->id, @@ -198,7 +198,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, } if (exp_type_len && len != exp_type_len) { - bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", + bt_dev_warn(hdev, "invalid length %d, exp %zu for type %u", len, exp_type_len, type); return mgmt_cmd_status(sk, hdev->id, -- cgit v1.2.3-58-ga151 From 496bdeeeda09e84f469f47e66f6d38d3735f802f Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:41:03 +0800 Subject: Bluetooth: msft: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/msft.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 37a394786a94..b4bfae41e8a5 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -311,7 +311,7 @@ static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, monitor = idr_find(&hdev->adv_monitors_idr, msft->pending_add_handle); if (!monitor) { - bt_dev_err(hdev, "msft add advmon: monitor %d is not found!", + bt_dev_err(hdev, "msft add advmon: monitor %u is not found!", msft->pending_add_handle); status = HCI_ERROR_UNSPECIFIED; goto unlock; -- cgit v1.2.3-58-ga151 From 79dbeafe5ef162ede87c916054755a987e93e542 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:41:04 +0800 Subject: Bluetooth: sco: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/sco.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 3bd41563f118..d9a4e88dacbb 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -310,7 +310,7 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) if (!sk) goto drop; - BT_DBG("sk %p len %d", sk, skb->len); + BT_DBG("sk %p len %u", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; @@ -905,7 +905,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, opts.mtu = sco_pi(sk)->conn->mtu; - BT_DBG("mtu %d", opts.mtu); + BT_DBG("mtu %u", opts.mtu); len = min_t(unsigned int, len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) @@ -1167,7 +1167,7 @@ static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; - BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); + BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; @@ -1196,7 +1196,7 @@ void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) if (!conn) goto drop; - BT_DBG("conn %p len %d", conn, skb->len); + BT_DBG("conn %p len %u", conn, skb->len); if (skb->len) { sco_recv_frame(conn, skb); -- cgit v1.2.3-58-ga151 From 83b4b19551411c83bbcf677718ab5d9f60d982f6 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Thu, 3 Jun 2021 15:41:05 +0800 Subject: Bluetooth: smp: Use the correct print format According to Documentation/core-api/printk-formats.rst, Use the correct print format. Printing an unsigned int value should use %u instead of %d. Otherwise printk() might end up displaying negative numbers. Signed-off-by: Kai Ye Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 6197f8ae53ab..b9413a78993a 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -859,7 +859,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, memset(smp->tk, 0, sizeof(smp->tk)); clear_bit(SMP_FLAG_TK_VALID, &smp->flags); - bt_dev_dbg(hcon->hdev, "auth:%d lcl:%d rem:%d", auth, local_io, + bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io, remote_io); /* If neither side wants MITM, either "just" confirm an incoming @@ -925,7 +925,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); - bt_dev_dbg(hcon->hdev, "PassKey: %d", passkey); + bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } @@ -1655,7 +1655,7 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); - bt_dev_dbg(conn->hcon->hdev, "PassKey: %d", value); + bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: -- cgit v1.2.3-58-ga151 From 74be523ce6bed0531e4f31c3e1387909589e9bfe Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 4 Jun 2021 16:26:25 +0800 Subject: Bluetooth: use inclusive language in HCI role comments This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: master -> initiator (for smp) or central (everything else) slave -> responder (for smp) or peripheral (everything else) The #define preprocessor terms are unchanged for now to not disturb dependent APIs. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 8 ++++---- net/bluetooth/hci_event.c | 6 +++--- net/bluetooth/l2cap_core.c | 2 +- net/bluetooth/smp.c | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ea0f9cdaa6b1..2b5059a56cda 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -257,7 +257,7 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); - /* When we are master of an established connection and it enters + /* When we are central of an established connection and it enters * the disconnect timeout, then go ahead and try to read the * current clock offset. Processing of the result is done * within the event handling and hci_clock_offset_evt function. @@ -1109,9 +1109,9 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, hci_req_init(&req, hdev); - /* Disable advertising if we're active. For master role + /* Disable advertising if we're active. For central role * connections most controllers will refuse to connect if - * advertising is enabled, and for slave role connections we + * advertising is enabled, and for peripheral role connections we * anyway have to disable it in order to start directed * advertising. Any registered advertisements will be * re-enabled after the connection attempt is finished. @@ -1119,7 +1119,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_pause_adv_instances(&req); - /* If requested to connect as slave use directed advertising */ + /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning most controllers are unable * to initiate advertising. Simply reject the attempt. diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 43c324c46c0b..da013d485f14 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2795,9 +2795,9 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&cp.bdaddr, &ev->bdaddr); if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) - cp.role = 0x00; /* Become master */ + cp.role = 0x00; /* Become central */ else - cp.role = 0x01; /* Remain slave */ + cp.role = 0x01; /* Remain peripheral */ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else if (!(flags & HCI_PROTO_DEFER)) { @@ -5131,7 +5131,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = bdaddr_type; /* If we didn't have a hci_conn object previously - * but we're in master role this must be something + * but we're in central role this must be something * initiated using a white list. Since white list based * connections are not "first class citizens" we don't * have full tracking of them. Therefore, we go ahead diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9ebb85df4db4..b76c5d00b082 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1691,7 +1691,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) if (hcon->out) smp_conn_security(hcon, hcon->pending_sec_level); - /* For LE slave connections, make sure the connection interval + /* For LE peripheral connections, make sure the connection interval * is in the range of the minimum and maximum interval that has * been configured for this connection. If not, then trigger * the connection update procedure. diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index b9413a78993a..11f853d0500f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -909,8 +909,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->pending_sec_level = BT_SECURITY_HIGH; } - /* If both devices have Keyoard-Display I/O, the master - * Confirms and the slave Enters the passkey. + /* If both devices have Keyboard-Display I/O, the initiator + * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { if (hcon->role == HCI_ROLE_MASTER) @@ -3083,7 +3083,7 @@ static void bredr_pairing(struct l2cap_chan *chan) if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; - /* Only master may initiate SMP over BR/EDR */ + /* Only initiator may initiate SMP over BR/EDR */ if (hcon->role != HCI_ROLE_MASTER) return; -- cgit v1.2.3-58-ga151 From 39bc74ca0119025e3cc24b97ebd964b5c605aa83 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 4 Jun 2021 16:26:26 +0800 Subject: Bluetooth: use inclusive language when tracking connections This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: master -> central slave -> peripheral Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 6 +++--- net/bluetooth/hci_event.c | 4 ++-- net/bluetooth/hci_request.c | 17 +++++++++-------- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c9ec06997e1c..fe5f3a9d9924 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -122,7 +122,7 @@ struct hci_conn_hash { unsigned int amp_num; unsigned int sco_num; unsigned int le_num; - unsigned int le_num_slave; + unsigned int le_num_peripheral; }; struct bdaddr_list { @@ -894,7 +894,7 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case LE_LINK: h->le_num++; if (c->role == HCI_ROLE_SLAVE) - h->le_num_slave++; + h->le_num_peripheral++; break; case SCO_LINK: case ESCO_LINK: @@ -920,7 +920,7 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case LE_LINK: h->le_num--; if (c->role == HCI_ROLE_SLAVE) - h->le_num_slave--; + h->le_num_peripheral--; break; case SCO_LINK: case ESCO_LINK: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index da013d485f14..e479dc44e572 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5384,9 +5384,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, return NULL; /* Most controller will fail if we try to create new connections - * while we have an existing one in slave role. + * while we have an existing one in peripheral role. */ - if (hdev->conn_hash.le_num_slave > 0 && + if (hdev->conn_hash.le_num_peripheral > 0 && (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || !(hdev->le_states[3] & 0x10))) return NULL; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 3465862429fb..a5d55175176e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1519,13 +1519,14 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) if (hci_conn_num(hdev, LE_LINK) == 0) return true; - /* Check le_states if there is any connection in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) { - /* Slave connection state and non connectable mode bit 20. */ + /* Check le_states if there is any connection in peripheral role. */ + if (hdev->conn_hash.le_num_peripheral > 0) { + /* Peripheral connection state and non connectable mode bit 20. + */ if (!connectable && !(hdev->le_states[2] & 0x10)) return false; - /* Slave connection state and connectable mode bit 38 + /* Peripheral connection state and connectable mode bit 38 * and scannable bit 21. */ if (connectable && (!(hdev->le_states[4] & 0x40) || @@ -1533,13 +1534,13 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) return false; } - /* Check le_states if there is any connection in master role. */ - if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_slave) { - /* Master connection state and non connectable mode bit 18. */ + /* Check le_states if there is any connection in central role. */ + if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { + /* Central connection state and non connectable mode bit 18. */ if (!connectable && !(hdev->le_states[2] & 0x02)) return false; - /* Master connection state and connectable mode bit 35 and + /* Central connection state and connectable mode bit 35 and * scannable 19. */ if (connectable && (!(hdev->le_states[4] & 0x08) || -- cgit v1.2.3-58-ga151 From 3d4f9c00492b4e21641e5140a5e78cb50b58d60b Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 4 Jun 2021 16:26:27 +0800 Subject: Bluetooth: use inclusive language when filtering devices This patch replaces some non-inclusive terms based on the appropriate language mapping table compiled by the Bluetooth SIG: https://specificationrefs.bluetooth.com/language-mapping/Appropriate_Language_Mapping_Table.pdf Specifically, these terms are replaced: blacklist -> reject list whitelist -> accept list Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 16 ++++---- include/net/bluetooth/hci_core.h | 8 ++-- net/bluetooth/hci_core.c | 24 +++++------ net/bluetooth/hci_debugfs.c | 8 ++-- net/bluetooth/hci_event.c | 70 +++++++++++++++---------------- net/bluetooth/hci_request.c | 89 ++++++++++++++++++++-------------------- net/bluetooth/hci_sock.c | 12 +++--- net/bluetooth/l2cap_core.c | 4 +- net/bluetooth/mgmt.c | 14 +++---- 9 files changed, 123 insertions(+), 122 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3abd6273a189..2dc947341502 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1505,7 +1505,7 @@ struct hci_cp_le_set_scan_enable { } __packed; #define HCI_LE_USE_PEER_ADDR 0x00 -#define HCI_LE_USE_WHITELIST 0x01 +#define HCI_LE_USE_ACCEPT_LIST 0x01 #define HCI_OP_LE_CREATE_CONN 0x200d struct hci_cp_le_create_conn { @@ -1525,22 +1525,22 @@ struct hci_cp_le_create_conn { #define HCI_OP_LE_CREATE_CONN_CANCEL 0x200e -#define HCI_OP_LE_READ_WHITE_LIST_SIZE 0x200f -struct hci_rp_le_read_white_list_size { +#define HCI_OP_LE_READ_ACCEPT_LIST_SIZE 0x200f +struct hci_rp_le_read_accept_list_size { __u8 status; __u8 size; } __packed; -#define HCI_OP_LE_CLEAR_WHITE_LIST 0x2010 +#define HCI_OP_LE_CLEAR_ACCEPT_LIST 0x2010 -#define HCI_OP_LE_ADD_TO_WHITE_LIST 0x2011 -struct hci_cp_le_add_to_white_list { +#define HCI_OP_LE_ADD_TO_ACCEPT_LIST 0x2011 +struct hci_cp_le_add_to_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; -#define HCI_OP_LE_DEL_FROM_WHITE_LIST 0x2012 -struct hci_cp_le_del_from_white_list { +#define HCI_OP_LE_DEL_FROM_ACCEPT_LIST 0x2012 +struct hci_cp_le_del_from_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index fe5f3a9d9924..212f46806ce7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -327,7 +327,7 @@ struct hci_dev { __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; __u8 le_features[8]; - __u8 le_white_list_size; + __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; __u8 le_states[8]; @@ -522,14 +522,14 @@ struct hci_dev { struct hci_conn_hash conn_hash; struct list_head mgmt_pending; - struct list_head blacklist; - struct list_head whitelist; + struct list_head reject_list; + struct list_head accept_list; struct list_head uuids; struct list_head link_keys; struct list_head long_term_keys; struct list_head identity_resolving_keys; struct list_head remote_oob_data; - struct list_head le_white_list; + struct list_head le_accept_list; struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5735171e2e23..2560ed2f144d 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -749,14 +749,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) } if (hdev->commands[26] & 0x40) { - /* Read LE White List Size */ - hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, + /* Read LE Accept List Size */ + hci_req_add(req, HCI_OP_LE_READ_ACCEPT_LIST_SIZE, 0, NULL); } if (hdev->commands[26] & 0x80) { - /* Clear LE White List */ - hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); + /* Clear LE Accept List */ + hci_req_add(req, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL); } if (hdev->commands[34] & 0x40) { @@ -3713,13 +3713,13 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* Suspend consists of two actions: * - First, disconnect everything and make the controller not * connectable (disabling scanning) - * - Second, program event filter/whitelist and enable scan + * - Second, program event filter/accept list and enable scan */ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); if (!ret) state = BT_SUSPEND_DISCONNECT; - /* Only configure whitelist if disconnect succeeded and wake + /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) { @@ -3827,14 +3827,14 @@ struct hci_dev *hci_alloc_dev(void) mutex_init(&hdev->req_lock); INIT_LIST_HEAD(&hdev->mgmt_pending); - INIT_LIST_HEAD(&hdev->blacklist); - INIT_LIST_HEAD(&hdev->whitelist); + INIT_LIST_HEAD(&hdev->reject_list); + INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); - INIT_LIST_HEAD(&hdev->le_white_list); + INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); @@ -4047,8 +4047,8 @@ void hci_unregister_dev(struct hci_dev *hdev) destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); - hci_bdaddr_list_clear(&hdev->blacklist); - hci_bdaddr_list_clear(&hdev->whitelist); + hci_bdaddr_list_clear(&hdev->reject_list); + hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); @@ -4056,7 +4056,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 47f4f21fbc1a..841393389f7b 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -125,7 +125,7 @@ static int device_list_show(struct seq_file *f, void *ptr) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->whitelist, list) + list_for_each_entry(b, &hdev->accept_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); list_for_each_entry(p, &hdev->le_conn_params, list) { seq_printf(f, "%pMR (type %u) %u\n", &p->addr, p->addr_type, @@ -144,7 +144,7 @@ static int blacklist_show(struct seq_file *f, void *p) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->blacklist, list) + list_for_each_entry(b, &hdev->reject_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); hci_dev_unlock(hdev); @@ -784,7 +784,7 @@ static int white_list_show(struct seq_file *f, void *ptr) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->le_white_list, list) + list_for_each_entry(b, &hdev->le_accept_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); hci_dev_unlock(hdev); @@ -1195,7 +1195,7 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &force_static_address_fops); debugfs_create_u8("white_list_size", 0444, hdev->debugfs, - &hdev->le_white_list_size); + &hdev->le_accept_list_size); debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, &white_list_fops); debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e479dc44e572..98ec486743ba 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -236,7 +236,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) hdev->ssp_debug_mode = 0; - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); } @@ -1492,21 +1492,21 @@ static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, hdev->le_num_of_adv_sets = rp->num_of_sets; } -static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_read_accept_list_size(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_rp_le_read_white_list_size *rp = (void *) skb->data; + struct hci_rp_le_read_accept_list_size *rp = (void *)skb->data; BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); if (rp->status) return; - hdev->le_white_list_size = rp->size; + hdev->le_accept_list_size = rp->size; } -static void hci_cc_le_clear_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_clear_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -1515,13 +1515,13 @@ static void hci_cc_le_clear_white_list(struct hci_dev *hdev, if (status) return; - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); } -static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_add_to_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_cp_le_add_to_white_list *sent; + struct hci_cp_le_add_to_accept_list *sent; __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1529,18 +1529,18 @@ static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, if (status) return; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST); + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); if (!sent) return; - hci_bdaddr_list_add(&hdev->le_white_list, &sent->bdaddr, - sent->bdaddr_type); + hci_bdaddr_list_add(&hdev->le_accept_list, &sent->bdaddr, + sent->bdaddr_type); } -static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_del_from_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_cp_le_del_from_white_list *sent; + struct hci_cp_le_del_from_accept_list *sent; __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1548,11 +1548,11 @@ static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, if (status) return; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST); + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST); if (!sent) return; - hci_bdaddr_list_del(&hdev->le_white_list, &sent->bdaddr, + hci_bdaddr_list_del(&hdev->le_accept_list, &sent->bdaddr, sent->bdaddr_type); } @@ -2367,7 +2367,7 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, /* We don't want the connection attempt to stick around * indefinitely since LE doesn't have a page timeout concept * like BR/EDR. Set a timer for any connection that doesn't use - * the white list for connecting. + * the accept list for connecting. */ if (filter_policy == HCI_LE_USE_PEER_ADDR) queue_delayed_work(conn->hdev->workqueue, @@ -2623,7 +2623,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * only used during suspend. */ if (ev->link_type == ACL_LINK && - hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr, BDADDR_BREDR)) { conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, @@ -2745,19 +2745,19 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (hci_bdaddr_list_lookup(&hdev->blacklist, &ev->bdaddr, + if (hci_bdaddr_list_lookup(&hdev->reject_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); return; } - /* Require HCI_CONNECTABLE or a whitelist entry to accept the + /* Require HCI_CONNECTABLE or an accept list entry to accept the * connection. These features are only touched through mgmt so * only do the checks if HCI_MGMT is set. */ if (hci_dev_test_flag(hdev, HCI_MGMT) && !hci_dev_test_flag(hdev, HCI_CONNECTABLE) && - !hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, &ev->bdaddr, + !hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); return; @@ -3538,20 +3538,20 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_scan_enable(hdev, skb); break; - case HCI_OP_LE_READ_WHITE_LIST_SIZE: - hci_cc_le_read_white_list_size(hdev, skb); + case HCI_OP_LE_READ_ACCEPT_LIST_SIZE: + hci_cc_le_read_accept_list_size(hdev, skb); break; - case HCI_OP_LE_CLEAR_WHITE_LIST: - hci_cc_le_clear_white_list(hdev, skb); + case HCI_OP_LE_CLEAR_ACCEPT_LIST: + hci_cc_le_clear_accept_list(hdev, skb); break; - case HCI_OP_LE_ADD_TO_WHITE_LIST: - hci_cc_le_add_to_white_list(hdev, skb); + case HCI_OP_LE_ADD_TO_ACCEPT_LIST: + hci_cc_le_add_to_accept_list(hdev, skb); break; - case HCI_OP_LE_DEL_FROM_WHITE_LIST: - hci_cc_le_del_from_white_list(hdev, skb); + case HCI_OP_LE_DEL_FROM_ACCEPT_LIST: + hci_cc_le_del_from_accept_list(hdev, skb); break; case HCI_OP_LE_READ_SUPPORTED_STATES: @@ -5132,7 +5132,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, /* If we didn't have a hci_conn object previously * but we're in central role this must be something - * initiated using a white list. Since white list based + * initiated using an accept list. Since accept list based * connections are not "first class citizens" we don't * have full tracking of them. Therefore, we go ahead * with a "best effort" approach of determining the @@ -5224,7 +5224,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, addr_type = BDADDR_LE_RANDOM; /* Drop the connection if the device is blocked */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) { + if (hci_bdaddr_list_lookup(&hdev->reject_list, &conn->dst, addr_type)) { hci_conn_drop(conn); goto unlock; } @@ -5380,7 +5380,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, return NULL; /* Ignore if the device is blocked */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, addr, addr_type)) + if (hci_bdaddr_list_lookup(&hdev->reject_list, addr, addr_type)) return NULL; /* Most controller will fail if we try to create new connections diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index a5d55175176e..f7a9d97f3e84 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -745,17 +745,17 @@ void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) } } -static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, - u8 bdaddr_type) +static void del_from_accept_list(struct hci_request *req, bdaddr_t *bdaddr, + u8 bdaddr_type) { - struct hci_cp_le_del_from_white_list cp; + struct hci_cp_le_del_from_accept_list cp; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); - bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cp.bdaddr, + bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from accept list", &cp.bdaddr, cp.bdaddr_type); - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_DEL_FROM_ACCEPT_LIST, sizeof(cp), &cp); if (use_ll_privacy(req->hdev) && hci_dev_test_flag(req->hdev, HCI_ENABLE_LL_PRIVACY)) { @@ -774,31 +774,31 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, } } -/* Adds connection to white list if needed. On error, returns -1. */ -static int add_to_white_list(struct hci_request *req, - struct hci_conn_params *params, u8 *num_entries, - bool allow_rpa) +/* Adds connection to accept list if needed. On error, returns -1. */ +static int add_to_accept_list(struct hci_request *req, + struct hci_conn_params *params, u8 *num_entries, + bool allow_rpa) { - struct hci_cp_le_add_to_white_list cp; + struct hci_cp_le_add_to_accept_list cp; struct hci_dev *hdev = req->hdev; - /* Already in white list */ - if (hci_bdaddr_list_lookup(&hdev->le_white_list, ¶ms->addr, + /* Already in accept list */ + if (hci_bdaddr_list_lookup(&hdev->le_accept_list, ¶ms->addr, params->addr_type)) return 0; /* Select filter policy to accept all advertising */ - if (*num_entries >= hdev->le_white_list_size) + if (*num_entries >= hdev->le_accept_list_size) return -1; - /* White list can not be used with RPAs */ + /* Accept list can not be used with RPAs */ if (!allow_rpa && !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) { return -1; } - /* During suspend, only wakeable devices can be in whitelist */ + /* During suspend, only wakeable devices can be in accept list */ if (hdev->suspended && !hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, params->current_flags)) return 0; @@ -807,9 +807,9 @@ static int add_to_white_list(struct hci_request *req, cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, ¶ms->addr); - bt_dev_dbg(hdev, "Add %pMR (0x%x) to whitelist", &cp.bdaddr, + bt_dev_dbg(hdev, "Add %pMR (0x%x) to accept list", &cp.bdaddr, cp.bdaddr_type); - hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_ADD_TO_ACCEPT_LIST, sizeof(cp), &cp); if (use_ll_privacy(hdev) && hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) { @@ -837,15 +837,15 @@ static int add_to_white_list(struct hci_request *req, return 0; } -static u8 update_white_list(struct hci_request *req) +static u8 update_accept_list(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct hci_conn_params *params; struct bdaddr_list *b; u8 num_entries = 0; bool pend_conn, pend_report; - /* We allow whitelisting even with RPAs in suspend. In the worst case, - * we won't be able to wake from devices that use the privacy1.2 + /* We allow usage of accept list even with RPAs in suspend. In the worst + * case, we won't be able to wake from devices that use the privacy1.2 * features. Additionally, once we support privacy1.2 and IRK * offloading, we can update this to also check for those conditions. */ @@ -855,13 +855,13 @@ static u8 update_white_list(struct hci_request *req) hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) allow_rpa = true; - /* Go through the current white list programmed into the + /* Go through the current accept list programmed into the * controller one by one and check if that address is still * in the list of pending connections or list of devices to * report. If not present in either list, then queue the * command to remove it from the controller. */ - list_for_each_entry(b, &hdev->le_white_list, list) { + list_for_each_entry(b, &hdev->le_accept_list, list) { pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); @@ -870,14 +870,14 @@ static u8 update_white_list(struct hci_request *req) b->bdaddr_type); /* If the device is not likely to connect or report, - * remove it from the whitelist. + * remove it from the accept list. */ if (!pend_conn && !pend_report) { - del_from_white_list(req, &b->bdaddr, b->bdaddr_type); + del_from_accept_list(req, &b->bdaddr, b->bdaddr_type); continue; } - /* White list can not be used with RPAs */ + /* Accept list can not be used with RPAs */ if (!allow_rpa && !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { @@ -887,27 +887,27 @@ static u8 update_white_list(struct hci_request *req) num_entries++; } - /* Since all no longer valid white list entries have been + /* Since all no longer valid accept list entries have been * removed, walk through the list of pending connections * and ensure that any new device gets programmed into * the controller. * * If the list of the devices is larger than the list of - * available white list entries in the controller, then + * available accept list entries in the controller, then * just abort and return filer policy value to not use the - * white list. + * accept list. */ list_for_each_entry(params, &hdev->pend_le_conns, action) { - if (add_to_white_list(req, params, &num_entries, allow_rpa)) + if (add_to_accept_list(req, params, &num_entries, allow_rpa)) return 0x00; } /* After adding all new pending connections, walk through * the list of pending reports and also add these to the - * white list if there is still space. Abort if space runs out. + * accept list if there is still space. Abort if space runs out. */ list_for_each_entry(params, &hdev->pend_le_reports, action) { - if (add_to_white_list(req, params, &num_entries, allow_rpa)) + if (add_to_accept_list(req, params, &num_entries, allow_rpa)) return 0x00; } @@ -921,7 +921,7 @@ static u8 update_white_list(struct hci_request *req) hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) return 0x00; - /* Select filter policy to use white list */ + /* Select filter policy to use accept list */ return 0x01; } @@ -1078,20 +1078,20 @@ void hci_req_add_le_passive_scan(struct hci_request *req) return; bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); - /* Adding or removing entries from the white list must + /* Adding or removing entries from the accept list must * happen before enabling scanning. The controller does - * not allow white list modification while scanning. + * not allow accept list modification while scanning. */ - filter_policy = update_white_list(req); + filter_policy = update_accept_list(req); /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support * for handling directed advertising. * - * So instead of using filter polices 0x00 (no whitelist) - * and 0x01 (whitelist enabled) use the new filter policies - * 0x02 (no whitelist) and 0x03 (whitelist enabled). + * So instead of using filter polices 0x00 (no accept list) + * and 0x01 (accept list enabled) use the new filter policies + * 0x02 (no accept list) and 0x03 (accept list enabled). */ if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) @@ -1127,7 +1127,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) interval = hdev->le_scan_interval; } - bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); + bt_dev_dbg(hdev, "LE passive scan with accept list = %d", + filter_policy); hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, own_addr_type, filter_policy, filter_dup, addr_resolv); @@ -1180,7 +1181,7 @@ static void hci_req_set_event_filter(struct hci_request *req) /* Always clear event filter when starting */ hci_req_clear_event_filter(req); - list_for_each_entry(b, &hdev->whitelist, list) { + list_for_each_entry(b, &hdev->accept_list, list) { if (!hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, b->current_flags)) continue; @@ -2623,11 +2624,11 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, return 0; } -static bool disconnected_whitelist_entries(struct hci_dev *hdev) +static bool disconnected_accept_list_entries(struct hci_dev *hdev) { struct bdaddr_list *b; - list_for_each_entry(b, &hdev->whitelist, list) { + list_for_each_entry(b, &hdev->accept_list, list) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); @@ -2659,7 +2660,7 @@ void __hci_req_update_scan(struct hci_request *req) return; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || - disconnected_whitelist_entries(hdev)) + disconnected_accept_list_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; @@ -3151,7 +3152,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; - /* White list is not used for discovery */ + /* Accept list is not used for discovery */ u8 filter_policy = 0x00; /* Default is to enable duplicates filter */ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index e8d53af7c6a6..b04a5a02ecf3 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -892,7 +892,7 @@ static int hci_sock_release(struct socket *sock) return 0; } -static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) +static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; @@ -902,14 +902,14 @@ static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_bdaddr_list_add(&hdev->blacklist, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } -static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) +static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; @@ -919,7 +919,7 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_bdaddr_list_del(&hdev->blacklist, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); @@ -959,12 +959,12 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_add(hdev, (void __user *)arg); + return hci_sock_reject_list_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_del(hdev, (void __user *)arg); + return hci_sock_reject_list_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b76c5d00b082..77ba68209dbd 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7662,7 +7662,7 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) * at least ensure that we ignore incoming data from them. */ if (hcon->type == LE_LINK && - hci_bdaddr_list_lookup(&hcon->hdev->blacklist, &hcon->dst, + hci_bdaddr_list_lookup(&hcon->hdev->reject_list, &hcon->dst, bdaddr_dst_type(hcon))) { kfree_skb(skb); return; @@ -8119,7 +8119,7 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) dst_type = bdaddr_dst_type(hcon); /* If device is blocked, do not create channels for it */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) + if (hci_bdaddr_list_lookup(&hdev->reject_list, &hcon->dst, dst_type)) return; /* Find fixed channels and notify them of the new connection. We diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 22f9f52c5ae6..d1bf5a55ff85 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4064,7 +4064,7 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, memset(&rp, 0, sizeof(rp)); if (cp->addr.type == BDADDR_BREDR) { - br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (!br_params) @@ -4132,7 +4132,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { - br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); @@ -5209,7 +5209,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_bdaddr_list_add(&hdev->blacklist, &cp->addr.bdaddr, + err = hci_bdaddr_list_add(&hdev->reject_list, &cp->addr.bdaddr, cp->addr.type); if (err < 0) { status = MGMT_STATUS_FAILED; @@ -5245,7 +5245,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_bdaddr_list_del(&hdev->blacklist, &cp->addr.bdaddr, + err = hci_bdaddr_list_del(&hdev->reject_list, &cp->addr.bdaddr, cp->addr.type); if (err < 0) { status = MGMT_STATUS_INVALID_PARAMS; @@ -6736,7 +6736,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_bdaddr_list_add_with_flags(&hdev->whitelist, + err = hci_bdaddr_list_add_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type, 0); if (err) @@ -6834,7 +6834,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, } if (cp->addr.type == BDADDR_BREDR) { - err = hci_bdaddr_list_del(&hdev->whitelist, + err = hci_bdaddr_list_del(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (err) { @@ -6905,7 +6905,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - list_for_each_entry_safe(b, btmp, &hdev->whitelist, list) { + list_for_each_entry_safe(b, btmp, &hdev->accept_list, list) { device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type); list_del(&b->list); kfree(b); -- cgit v1.2.3-58-ga151 From c9ed0a7077306f9d41d74fb006ab5dbada8349c5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 9 Jun 2021 11:09:27 -0700 Subject: Bluetooth: Fix Set Extended (Scan Response) Data These command do have variable length and the length can go up to 251, so this changes the struct to not use a fixed size and then when creating the PDU only the actual length of the data send to the controller. Fixes: a0fb3726ba551 ("Bluetooth: Use Set ext adv/scan rsp data if controller supports") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 +++-- include/net/bluetooth/hci_core.h | 8 +++---- net/bluetooth/hci_request.c | 51 +++++++++++++++++++++++----------------- 3 files changed, 37 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 2dc947341502..b80415011dcd 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1775,13 +1775,15 @@ struct hci_cp_ext_adv_set { __u8 max_events; } __packed; +#define HCI_MAX_EXT_AD_LENGTH 251 + #define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 struct hci_cp_le_set_ext_adv_data { __u8 handle; __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[HCI_MAX_AD_LENGTH]; + __u8 data[]; } __packed; #define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 @@ -1790,7 +1792,7 @@ struct hci_cp_le_set_ext_scan_rsp_data { __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[HCI_MAX_AD_LENGTH]; + __u8 data[]; } __packed; #define LE_SET_ADV_DATA_OP_COMPLETE 0x03 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 212f46806ce7..a53e94459ecd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -228,9 +228,9 @@ struct adv_info { __u16 remaining_time; __u16 duration; __u16 adv_data_len; - __u8 adv_data[HCI_MAX_AD_LENGTH]; + __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; __u16 scan_rsp_len; - __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; __s8 tx_power; __u32 min_interval; __u32 max_interval; @@ -551,9 +551,9 @@ struct hci_dev { DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); __s8 adv_tx_power; - __u8 adv_data[HCI_MAX_AD_LENGTH]; + __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 adv_data_len; - __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; __u8 scan_rsp_data_len; struct list_head adv_instances; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f7a9d97f3e84..1d14adc023e9 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1716,30 +1716,33 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) return; if (ext_adv_capable(hdev)) { - struct hci_cp_le_set_ext_scan_rsp_data cp; + struct { + struct hci_cp_le_set_ext_scan_rsp_data cp; + u8 data[HCI_MAX_EXT_AD_LENGTH]; + } pdu; - memset(&cp, 0, sizeof(cp)); + memset(&pdu, 0, sizeof(pdu)); if (instance) len = create_instance_scan_rsp_data(hdev, instance, - cp.data); + pdu.data); else - len = create_default_scan_rsp_data(hdev, cp.data); + len = create_default_scan_rsp_data(hdev, pdu.data); if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) + !memcmp(pdu.data, hdev->scan_rsp_data, len)) return; - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + memcpy(hdev->scan_rsp_data, pdu.data, len); hdev->scan_rsp_data_len = len; - cp.handle = instance; - cp.length = len; - cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu.cp.handle = instance; + pdu.cp.length = len; + pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp), - &cp); + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, + sizeof(pdu.cp) + len, &pdu.cp); } else { struct hci_cp_le_set_scan_rsp_data cp; @@ -1862,26 +1865,30 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) return; if (ext_adv_capable(hdev)) { - struct hci_cp_le_set_ext_adv_data cp; + struct { + struct hci_cp_le_set_ext_adv_data cp; + u8 data[HCI_MAX_EXT_AD_LENGTH]; + } pdu; - memset(&cp, 0, sizeof(cp)); + memset(&pdu, 0, sizeof(pdu)); - len = create_instance_adv_data(hdev, instance, cp.data); + len = create_instance_adv_data(hdev, instance, pdu.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) + memcmp(pdu.data, hdev->adv_data, len) == 0) return; - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + memcpy(hdev->adv_data, pdu.data, len); hdev->adv_data_len = len; - cp.length = len; - cp.handle = instance; - cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu.cp.length = len; + pdu.cp.handle = instance; + pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, + sizeof(pdu.cp) + len, &pdu.cp); } else { struct hci_cp_le_set_adv_data cp; -- cgit v1.2.3-58-ga151 From 43e59cb7e6077110c4622e61a188e7703e8c7e36 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 15 Jun 2021 21:23:35 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to recent changes. Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d1bf5a55ff85..3663f880df11 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -40,7 +40,7 @@ #include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 20 +#define MGMT_REVISION 21 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3-58-ga151 From 23837a6d7a1a61818ed94a6b8af552d6cf7d32d5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 22 Jun 2021 20:59:02 -0700 Subject: Bluetooth: Fix handling of HCI_LE_Advertising_Set_Terminated event Error status of this event means that it has ended due reasons other than a connection: 'If advertising has terminated as a result of the advertising duration elapsing, the Status parameter shall be set to the error code Advertising Timeout (0x3C).' 'If advertising has terminated because the Max_Extended_Advertising_Events was reached, the Status parameter shall be set to the error code Limit Reached (0x43).' Fixes: acf0aeae431a0 ("Bluetooth: Handle ADv set terminated event") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 98ec486743ba..1c3018202564 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5322,8 +5322,19 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); - if (ev->status) + if (ev->status) { + struct adv_info *adv; + + adv = hci_find_adv_instance(hdev, ev->handle); + if (!adv) + return; + + /* Remove advertising as it has been terminated */ + hci_remove_adv_instance(hdev, ev->handle); + mgmt_advertising_removed(NULL, hdev, ev->handle); + return; + } conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle)); if (conn) { -- cgit v1.2.3-58-ga151 From fade56410c22cacafb1be9f911a0afd3701d8366 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 25 Jun 2021 19:21:39 +0300 Subject: net: lwtunnel: handle MTU calculation in forwading Commit 14972cbd34ff ("net: lwtunnel: Handle fragmentation") moved fragmentation logic away from lwtunnel by carry encap headroom and use it in output MTU calculation. But the forwarding part was not covered and created difference in MTU for output and forwarding and further to silent drops on ipv4 forwarding path. Fix it by taking into account lwtunnel encap headroom. The same commit also introduced difference in how to treat RTAX_MTU in IPv4 and IPv6 where latter explicitly removes lwtunnel encap headroom from route MTU. Make IPv4 version do the same. Fixes: 14972cbd34ff ("net: lwtunnel: Handle fragmentation") Suggested-by: David Ahern Signed-off-by: Vadim Fedorenko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 12 ++++++++---- include/net/ip6_route.h | 16 ++++++++++++---- net/ipv4/route.c | 3 ++- 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index e20874059f82..d9683bef8684 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -31,6 +31,7 @@ #include #include #include +#include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -445,22 +446,25 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; + if (!mtu) + mtu = min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); - return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + unsigned int mtu; + if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f51a118bfce8..f14149df5a65 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -265,11 +265,18 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { + int mtu; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); + if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) { + mtu = READ_ONCE(skb_dst(skb)->dev->mtu); + mtu -= lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + } else + mtu = dst_mtu(skb_dst(skb)); + + return mtu; } static inline bool ip6_sk_accept_pmtu(const struct sock *sk) @@ -317,7 +324,7 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) if (dst_metric_locked(dst, RTAX_MTU)) { mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; } mtu = IPV6_MIN_MTU; @@ -327,7 +334,8 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) mtu = idev->cnf.mtu6; rcu_read_unlock(); - return mtu; +out: + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } u32 ip6_mtu_from_fib6(const struct fib6_result *res, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a36ac98476f..78d1e5afc452 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1306,7 +1306,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = READ_ONCE(dst->dev->mtu); @@ -1315,6 +1315,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } +out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); -- cgit v1.2.3-58-ga151 From 7ad136fd288c0e0177eb29e04ec289e1b873b270 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Jun 2021 15:33:04 +0200 Subject: ipip: allow redirecting ipip and mplsip packets to eth devices Even though ipip transports IPv4 or MPLS packets, it needs to reset the mac_header pointer, so that other parts of the stack don't mistakenly access the outer header after the packet has been decapsulated. This allows to push an Ethernet header to ipip or mplsip packets and redirect them to an Ethernet device: $ tc filter add dev ipip0 ingress matchall \ action vlan push_eth dst_mac 00:00:5e:00:53:01 \ src_mac 00:00:5e:00:53:00 \ action mirred egress redirect dev eth0 Without this patch, push_eth refuses to add an ethernet header because the skb appears to already have a MAC header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index d5bfa087c23a..266c65577ba6 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -242,6 +242,8 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (!tun_dst) return 0; } + skb_reset_mac_header(skb); + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } -- cgit v1.2.3-58-ga151 From 730eed2772e740c30229d03e3d578cc00a5ae304 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Jun 2021 15:33:08 +0200 Subject: sit: allow redirecting ip6ip, ipip and mplsip packets to eth devices Even though sit transports L3 data (IPv6, IPv4 or MPLS) packets, it needs to reset the mac_header pointer, so that other parts of the stack don't mistakenly access the outer header after the packet has been decapsulated. There are two rx handlers to modify: ipip6_rcv() for the ip6ip mode and sit_tunnel_rcv() which is used to re-implement the ipip and mplsip modes of ipip.ko. This allows to push an Ethernet header to sit packets and redirect them to an Ethernet device: $ tc filter add dev sit0 ingress matchall \ action vlan push_eth dst_mac 00:00:5e:00:53:01 \ src_mac 00:00:5e:00:53:00 \ action mirred egress redirect dev eth0 Without this patch, push_eth refuses to add an ethernet header because the skb appears to already have a MAC header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv6/sit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e0a39b0bb4c1..df5bea818410 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -710,6 +710,8 @@ static int ipip6_rcv(struct sk_buff *skb) * old iph is no longer valid */ iph = (const struct iphdr *)skb_mac_header(skb); + skb_reset_mac_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -780,6 +782,8 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; + skb_reset_mac_header(skb); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } -- cgit v1.2.3-58-ga151 From aab1e898c26c3e4289c62b6d6482948672fab939 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Jun 2021 15:33:13 +0200 Subject: gre: let mac_header point to outer header only when necessary Commit e271c7b4420d ("gre: do not keep the GRE header around in collect medata mode") did reset the mac_header for the collect_md case. Let's extend this behaviour to classical gre devices as well. ipgre_header_parse() seems to be the only case that requires mac_header to point to the outer header. We can detect this case accurately by checking ->header_ops. For all other cases, we can reset mac_header. This allows to push an Ethernet header to ipgre packets and redirect them to an Ethernet device: $ tc filter add dev gre0 ingress matchall \ action vlan push_eth dst_mac 00:00:5e:00:53:01 \ src_mac 00:00:5e:00:53:00 \ action mirred egress redirect dev eth0 Before this patch, this worked only for collect_md gre devices. Now this works for regular gre devices as well. Only the special case of gre devices that use ipgre_header_ops isn't supported. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a68bf4c6fe9b..12dca0c85f3c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -107,6 +107,8 @@ module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; +static const struct header_ops ipgre_header_ops; + static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, @@ -364,7 +366,10 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, raw_proto, false) < 0) goto drop; - if (tunnel->dev->type != ARPHRD_NONE) + /* Special case for ipgre_header_parse(), which expects the + * mac_header to point to the outer IP header. + */ + if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); -- cgit v1.2.3-58-ga151 From da5a2e49f064a86a3b102b20c545f855a7298394 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Jun 2021 15:33:17 +0200 Subject: ip6_tunnel: allow redirecting ip6gre and ipxip6 packets to eth devices Reset the mac_header pointer even when the tunnel transports only L3 data (in the ARPHRD_ETHER case, this is already done by eth_type_trans). This prevents other parts of the stack from mistakenly accessing the outer header after the packet has been decapsulated. In practice, this allows to push an Ethernet header to ipip6, ip6ip6, mplsip6 or ip6gre packets and redirect them to an Ethernet device: $ tc filter add dev ip6tnl0 ingress matchall \ action vlan push_eth dst_mac 00:00:5e:00:53:01 \ src_mac 00:00:5e:00:53:00 \ action mirred egress redirect dev eth0 Without this patch, push_eth refuses to add an ethernet header because the skb appears to already have a MAC header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 288bafded998..0b8a38687ce4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -837,6 +837,7 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; + skb_reset_mac_header(skb); } skb_reset_network_header(skb); -- cgit v1.2.3-58-ga151 From 3f2db250099f46988088800052cdf2332c7aba61 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 25 Jun 2021 23:23:48 +0300 Subject: net: sched: fix warning in tcindex_alloc_perfect_hash Syzbot reported warning in tcindex_alloc_perfect_hash. The problem was in too big cp->hash, which triggers warning in kmalloc. Since cp->hash comes from userspace, there is no need to warn if value is not correct Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Reported-and-tested-by: syzbot+1071ad60cd7df39fdadb@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index c4007b9cd16d..5b274534264c 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -304,7 +304,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) int i, err = 0; cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!cp->perfect) return -ENOMEM; -- cgit v1.2.3-58-ga151 From c4512c63b1193c73b3f09c598a6d0a7f88da1dd8 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 25 Jun 2021 14:25:22 -0700 Subject: mptcp: fix 'masking a bool' warning Dan Carpenter reported an issue introduced in commit fde56eea01f9 ("mptcp: refine mptcp_cleanup_rbuf") where a new boolean (ack_pending) is masked with 0x9. This is not the intention to ignore values by using a boolean. This variable should not have a 'bool' type: we should keep the 'u8' to allow this comparison. Fixes: fde56eea01f9 ("mptcp: refine mptcp_cleanup_rbuf") Reported-by: Dan Carpenter Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ce0c45dfb79e..7bb82424e551 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -455,7 +455,7 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); - bool ack_pending = READ_ONCE(icsk->icsk_ack.pending); + u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && -- cgit v1.2.3-58-ga151 From 0c6de0c943dbb42831bf7502eb5c007f71e752d2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 27 Jun 2021 23:37:44 -0700 Subject: net: tipc: fix FB_MTU eat two pages FB_MTU is used in 'tipc_msg_build()' to alloc smaller skb when memory allocation fails, which can avoid unnecessary sending failures. The value of FB_MTU now is 3744, and the data size will be: (3744 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + \ SKB_DATA_ALIGN(BUF_HEADROOM + BUF_TAILROOM + 3)) which is larger than one page(4096), and two pages will be allocated. To avoid it, replace '3744' with a calculation: (PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) What's more, alloc_skb_fclone() will call SKB_DATA_ALIGN for data size, and it's not necessary to make alignment for buf_size in tipc_buf_acquire(). So, just remove it. Fixes: 4c94cc2d3d57 ("tipc: fall back to smaller MTU if allocation of local send skb fails") Signed-off-by: Menglong Dong Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 2 +- net/tipc/msg.c | 17 ++++++++--------- net/tipc/msg.h | 3 ++- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index d4beca895992..593846d25214 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -699,7 +699,7 @@ int tipc_bcast_init(struct net *net) spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, NULL, - FB_MTU, + one_page_mtu, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, 0, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ce6ab54822d8..7053c22e393e 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -44,12 +44,15 @@ #define MAX_FORWARD_SIZE 1024 #ifdef CONFIG_TIPC_CRYPTO #define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) -#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#define BUF_OVERHEAD (BUF_HEADROOM + TIPC_AES_GCM_TAG_SIZE) #else #define BUF_HEADROOM (LL_MAX_HEADER + 48) -#define BUF_TAILROOM 16 +#define BUF_OVERHEAD BUF_HEADROOM #endif +const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + static unsigned int align(unsigned int i) { return (i + 3) & ~3u; @@ -69,13 +72,8 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; -#ifdef CONFIG_TIPC_CRYPTO - unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; -#else - unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; -#endif - skb = alloc_skb_fclone(buf_size, gfp); + skb = alloc_skb_fclone(BUF_OVERHEAD + size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); @@ -395,7 +393,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, if (unlikely(!skb)) { if (pktmax != MAX_MSG_SIZE) return -ENOMEM; - rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list); + rc = tipc_msg_build(mhdr, m, offset, dsz, + one_page_mtu, list); if (rc != dsz) return rc; if (tipc_msg_assemble(list)) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 5d64596ba987..64ae4c4c44f8 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -99,9 +99,10 @@ struct plist; #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) -#define FB_MTU 3744 #define TIPC_MEDIA_INFO_OFFSET 5 +extern const int one_page_mtu; + struct tipc_skb_cb { union { struct { -- cgit v1.2.3-58-ga151 From d4cfb7fe5713521280925019e7a7857b373aa627 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 27 Jun 2021 23:37:45 -0700 Subject: net: tipc: replace align() with ALIGN in msg.c The function align() which is defined in msg.c is redundant, replace it with ALIGN() and introduce a BUF_ALIGN(). Signed-off-by: Menglong Dong Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 7053c22e393e..5c9fd4791c4b 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -41,6 +41,7 @@ #include "name_table.h" #include "crypto.h" +#define BUF_ALIGN(x) ALIGN(x, 4) #define MAX_FORWARD_SIZE 1024 #ifdef CONFIG_TIPC_CRYPTO #define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) @@ -53,11 +54,6 @@ const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); -static unsigned int align(unsigned int i) -{ - return (i + 3) & ~3u; -} - /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) @@ -489,7 +485,7 @@ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, msz = msg_size(msg); bsz = msg_size(bmsg); - offset = align(bsz); + offset = BUF_ALIGN(bsz); pad = offset - bsz; if (unlikely(skb_tailroom(bskb) < (pad + msz))) @@ -546,7 +542,7 @@ bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, /* Make a new bundle of the two messages if possible */ tsz = msg_size(buf_msg(tskb)); - if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + if (unlikely(mss < BUF_ALIGN(INT_H_SIZE + tsz) + msg_size(msg))) return true; if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, GFP_ATOMIC))) @@ -605,7 +601,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) if (unlikely(!tipc_msg_validate(iskb))) goto none; - *pos += align(imsz); + *pos += BUF_ALIGN(imsz); return true; none: kfree_skb(skb); -- cgit v1.2.3-58-ga151 From a78cae2476812cecaa4a33d0086bbb53986906bc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 Jun 2021 15:16:12 -0700 Subject: xdp: Move the rxq_info.mem clearing to unreg_mem_model() xdp_rxq_info_unreg() implicitly calls xdp_rxq_info_unreg_mem_model(). This may well be confusing to the driver authors, and lead to double free if they call xdp_rxq_info_unreg_mem_model() before xdp_rxq_info_unreg() (when mem model type == MEM_TYPE_PAGE_POOL). In fact error path of mvpp2_rxq_init() seems to currently do exactly that. The double free will result in refcount underflow in page_pool_destroy(). Make the interface a little more programmer friendly by clearing type and id so that xdp_rxq_info_unreg_mem_model() can be called multiple times. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210625221612.2637086-1-kuba@kernel.org --- net/core/xdp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/xdp.c b/net/core/xdp.c index 725d20f1b100..cc92ccb38432 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator) void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; + int type = xdp_rxq->mem.type; int id = xdp_rxq->mem.id; + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; @@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + if (type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); @@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; - - /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); -- cgit v1.2.3-58-ga151 From e887b2df62513505ac6f6db2cb59ee6234ab042b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:22 +0300 Subject: net: bridge: include the is_local bit in br_fdb_replay Since commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"), the bridge emits SWITCHDEV_FDB_ADD_TO_DEVICE events with the is_local flag populated (but we ignore it nonetheless). We would like DSA to start treating this bit, but it is still not populated by the replay helper, so add it there too. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 698b79747d32..b8d3ddfe5853 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -737,6 +737,7 @@ static int br_fdb_replay_one(struct notifier_block *nb, item.vid = fdb->key.vlan_id; item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); + item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item.info.dev = dev; err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); -- cgit v1.2.3-58-ga151 From 69bfac968a06aab5927160f8736485f85c3e8ee8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:24 +0300 Subject: net: switchdev: add a context void pointer to struct switchdev_notifier_info In the case where the driver asks for a replay of a certain type of event (port object or attribute) for a bridge port that is a LAG, it may do so because this port has just joined the LAG. But there might already be other switchdev ports in that LAG, and it is preferable that those preexisting switchdev ports do not act upon the replayed event. The solution is to add a context to switchdev events, which is NULL most of the time (when the bridge layer initiates the call) but which can be set to a value controlled by the switchdev driver when a replay is requested. The driver can then check the context to figure out if all ports within the LAG should act upon the switchdev event, or just the ones that match the context. We have to modify all switchdev_handle_* helper functions as well as the prototypes in the drivers that use these helpers too, because these helpers hide the underlying struct switchdev_notifier_info from us and there is no way to retrieve the context otherwise. The context structure will be populated and used in later patches. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-switch.c | 2 +- .../ethernet/marvell/prestera/prestera_switchdev.c | 6 +++--- .../ethernet/mellanox/mlx5/core/en/rep/bridge.c | 3 +++ .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 +++--- .../ethernet/microchip/sparx5/sparx5_switchdev.c | 2 +- drivers/net/ethernet/mscc/ocelot_net.c | 6 +++--- drivers/net/ethernet/ti/am65-cpsw-switchdev.c | 6 +++--- drivers/net/ethernet/ti/cpsw_switchdev.c | 6 +++--- include/net/switchdev.h | 13 +++++------ net/dsa/slave.c | 6 +++--- net/switchdev/switchdev.c | 25 ++++++++++++---------- 11 files changed, 44 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 05de37c3b64c..f3d12d0714fb 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -1625,7 +1625,7 @@ static int dpaa2_switch_port_bridge_flags(struct net_device *netdev, return 0; } -static int dpaa2_switch_port_attr_set(struct net_device *netdev, +static int dpaa2_switch_port_attr_set(struct net_device *netdev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 74b81b4fbb97..0b3e8f2db294 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -708,7 +708,7 @@ err_port_stp_set: return err; } -static int prestera_port_obj_attr_set(struct net_device *dev, +static int prestera_port_obj_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -1040,7 +1040,7 @@ static int prestera_port_vlans_add(struct prestera_port *port, flag_pvid, extack); } -static int prestera_port_obj_add(struct net_device *dev, +static int prestera_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -1078,7 +1078,7 @@ static int prestera_port_vlans_del(struct prestera_port *port, return 0; } -static int prestera_port_obj_del(struct net_device *dev, +static int prestera_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct prestera_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c index 7f5efc1b4392..3c0032c9647c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -76,6 +76,7 @@ static int mlx5_esw_bridge_switchdev_port_event(struct notifier_block *nb, } static int mlx5_esw_bridge_port_obj_add(struct net_device *dev, + const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -107,6 +108,7 @@ static int mlx5_esw_bridge_port_obj_add(struct net_device *dev, } static int mlx5_esw_bridge_port_obj_del(struct net_device *dev, + const void *ctx, const struct switchdev_obj *obj) { const struct switchdev_obj_port_vlan *vlan; @@ -136,6 +138,7 @@ static int mlx5_esw_bridge_port_obj_del(struct net_device *dev, } static int mlx5_esw_bridge_port_obj_attr_set(struct net_device *dev, + const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0cfba2986841..c5ef9aa64efe 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -898,7 +898,7 @@ mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, return 0; } -static int mlxsw_sp_port_attr_set(struct net_device *dev, +static int mlxsw_sp_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -1766,7 +1766,7 @@ mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, } } -static int mlxsw_sp_port_obj_add(struct net_device *dev, +static int mlxsw_sp_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -1916,7 +1916,7 @@ mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port, } } -static int mlxsw_sp_port_obj_del(struct net_device *dev, +static int mlxsw_sp_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c index 19c7cb795b4b..246eba711f15 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c @@ -65,7 +65,7 @@ static void sparx5_port_attr_ageing_set(struct sparx5_port *port, sparx5_set_ageing(port->sparx5, ageing_time); } -static int sparx5_port_attr_set(struct net_device *dev, +static int sparx5_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 4fc74ee4aaab..456541640feb 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -939,7 +939,7 @@ static void ocelot_port_attr_mc_set(struct ocelot *ocelot, int port, bool mc) ANA_PORT_CPU_FWD_CFG, port); } -static int ocelot_port_attr_set(struct net_device *dev, +static int ocelot_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -1058,7 +1058,7 @@ ocelot_port_obj_mrp_del_ring_role(struct net_device *dev, return ocelot_mrp_del_ring_role(ocelot, port, mrp); } -static int ocelot_port_obj_add(struct net_device *dev, +static int ocelot_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -1086,7 +1086,7 @@ static int ocelot_port_obj_add(struct net_device *dev, return ret; } -static int ocelot_port_obj_del(struct net_device *dev, +static int ocelot_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { int ret = 0; diff --git a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c index 23cfb91e9c4d..9c29b363e9ae 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c +++ b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c @@ -84,7 +84,7 @@ static int am65_cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, return 0; } -static int am65_cpsw_port_attr_set(struct net_device *ndev, +static int am65_cpsw_port_attr_set(struct net_device *ndev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -302,7 +302,7 @@ static int am65_cpsw_port_mdb_del(struct am65_cpsw_port *port, return 0; } -static int am65_cpsw_port_obj_add(struct net_device *ndev, +static int am65_cpsw_port_obj_add(struct net_device *ndev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -329,7 +329,7 @@ static int am65_cpsw_port_obj_add(struct net_device *ndev, return err; } -static int am65_cpsw_port_obj_del(struct net_device *ndev, +static int am65_cpsw_port_obj_del(struct net_device *ndev, const void *ctx, const struct switchdev_obj *obj) { struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 05a64fb7a04f..f7fb6e17dadd 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -86,7 +86,7 @@ static int cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, return 0; } -static int cpsw_port_attr_set(struct net_device *ndev, +static int cpsw_port_attr_set(struct net_device *ndev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -310,7 +310,7 @@ static int cpsw_port_mdb_del(struct cpsw_priv *priv, return err; } -static int cpsw_port_obj_add(struct net_device *ndev, +static int cpsw_port_obj_add(struct net_device *ndev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -338,7 +338,7 @@ static int cpsw_port_obj_add(struct net_device *ndev, return err; } -static int cpsw_port_obj_del(struct net_device *ndev, +static int cpsw_port_obj_del(struct net_device *ndev, const void *ctx, const struct switchdev_obj *obj) { struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f1a5a9a3634d..e4cac9218ce1 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -202,6 +202,7 @@ enum switchdev_notifier_type { struct switchdev_notifier_info { struct net_device *dev; struct netlink_ext_ack *extack; + const void *ctx; }; struct switchdev_notifier_fdb_info { @@ -268,19 +269,19 @@ void switchdev_port_fwd_mark_set(struct net_device *dev, int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)); int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)); int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)); #else @@ -352,7 +353,7 @@ static inline int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { @@ -363,7 +364,7 @@ static inline int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { return 0; @@ -373,7 +374,7 @@ static inline int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5e668e529575..3692259a025f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -271,7 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } -static int dsa_slave_port_attr_set(struct net_device *dev, +static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { @@ -394,7 +394,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid); } -static int dsa_slave_port_obj_add(struct net_device *dev, +static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { @@ -469,7 +469,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, return 0; } -static int dsa_slave_port_obj_del(struct net_device *dev, +static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_slave_to_port(dev); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 89a36db47ab4..070698dd19bc 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -381,19 +381,20 @@ EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { + struct switchdev_notifier_info *info = &port_obj_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - extack = switchdev_notifier_info_to_extack(&port_obj_info->info); + extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { - err = add_cb(dev, port_obj_info->obj, extack); + err = add_cb(dev, info->ctx, port_obj_info->obj, extack); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; @@ -422,7 +423,7 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { @@ -439,15 +440,16 @@ EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); static int __switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { + struct switchdev_notifier_info *info = &port_obj_info->info; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) { - err = del_cb(dev, port_obj_info->obj); + err = del_cb(dev, info->ctx, port_obj_info->obj); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; @@ -476,7 +478,7 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev, int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { int err; @@ -492,19 +494,20 @@ EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { + struct switchdev_notifier_info *info = &port_attr_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - extack = switchdev_notifier_info_to_extack(&port_attr_info->info); + extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { - err = set_cb(dev, port_attr_info->attr, extack); + err = set_cb(dev, info->ctx, port_attr_info->attr, extack); if (err != -EOPNOTSUPP) port_attr_info->handled = true; return err; @@ -533,7 +536,7 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { -- cgit v1.2.3-58-ga151 From 0d2cfbd41c4a5a0ca5598d1874b1081138cd64c6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:25 +0300 Subject: net: bridge: ignore switchdev events for LAG ports which didn't request replay There is a slight inconvenience in the switchdev replay helpers added recently, and this is when: ip link add br0 type bridge ip link add bond0 type bond ip link set bond0 master br0 bridge vlan add dev bond0 vid 100 ip link set swp0 master bond0 ip link set swp1 master bond0 Since the underlying driver (currently only DSA) asks for a replay of VLANs when swp0 and swp1 join the LAG because it is bridged, what will happen is that DSA will try to react twice on the VLAN event for swp0. This is not really a huge problem right now, because most drivers accept duplicates since the bridge itself does, but it will become a problem when we add support for replaying switchdev object deletions. Let's fix this by adding a blank void *ctx in the replay helpers, which will be passed on by the bridge in the switchdev notifications. If the context is NULL, everything is the same as before. But if the context is populated with a valid pointer, the underlying switchdev driver (currently DSA) can use the pointer to 'see through' the bridge port (which in the example above is bond0) and 'know' that the event is only for a particular physical port offloading that bridge port, and not for all of them. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_net.c | 19 +++++++++++++++++-- include/linux/if_bridge.h | 14 ++++++++------ net/bridge/br_fdb.c | 7 ++++--- net/bridge/br_mdb.c | 8 +++++--- net/bridge/br_vlan.c | 8 +++++--- net/dsa/port.c | 6 +++--- net/dsa/slave.c | 9 +++++++++ 7 files changed, 51 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 456541640feb..166d851962d2 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -948,6 +948,9 @@ static int ocelot_port_attr_set(struct net_device *dev, const void *ctx, int port = priv->chip_port; int err = 0; + if (ctx && ctx != priv) + return 0; + switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: ocelot_port_attr_stp_state_set(ocelot, port, attr->u.stp_state); @@ -1062,8 +1065,12 @@ static int ocelot_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { + struct ocelot_port_private *priv = netdev_priv(dev); int ret = 0; + if (ctx && ctx != priv) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: ret = ocelot_port_obj_add_vlan(dev, @@ -1089,8 +1096,12 @@ static int ocelot_port_obj_add(struct net_device *dev, const void *ctx, static int ocelot_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { + struct ocelot_port_private *priv = netdev_priv(dev); int ret = 0; + if (ctx && ctx != priv) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: ret = ocelot_vlan_vid_del(dev, @@ -1143,10 +1154,14 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port, struct net_device *bridge_dev, struct netlink_ext_ack *extack) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; clock_t ageing_time; u8 stp_state; int err; + priv = container_of(ocelot_port, struct ocelot_port_private, port); + ocelot_inherit_brport_flags(ocelot, port, brport_dev); stp_state = br_port_get_stp_state(brport_dev); @@ -1160,12 +1175,12 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port, ageing_time = br_get_ageing_time(bridge_dev); ocelot_port_attr_ageing_set(ocelot, port, ageing_time); - err = br_mdb_replay(bridge_dev, brport_dev, + err = br_mdb_replay(bridge_dev, brport_dev, priv, &ocelot_switchdev_blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; - err = br_vlan_replay(bridge_dev, brport_dev, + err = br_vlan_replay(bridge_dev, brport_dev, priv, &ocelot_switchdev_blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 12e9a32dbca0..57df761b6f4a 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -71,7 +71,8 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); bool br_multicast_router(const struct net_device *dev); int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack); + const void *ctx, struct notifier_block *nb, + struct netlink_ext_ack *extack); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -104,7 +105,7 @@ static inline bool br_multicast_router(const struct net_device *dev) return false; } static inline int br_mdb_replay(struct net_device *br_dev, - struct net_device *dev, + struct net_device *dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { @@ -120,7 +121,8 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto); int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack); + const void *ctx, struct notifier_block *nb, + struct netlink_ext_ack *extack); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -149,7 +151,7 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, } static inline int br_vlan_replay(struct net_device *br_dev, - struct net_device *dev, + struct net_device *dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { @@ -166,7 +168,7 @@ bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); clock_t br_get_ageing_time(struct net_device *br_dev); int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb); + const void *ctx, struct notifier_block *nb); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -197,7 +199,7 @@ static inline clock_t br_get_ageing_time(struct net_device *br_dev) } static inline int br_fdb_replay(struct net_device *br_dev, - struct net_device *dev, + struct net_device *dev, const void *ctx, struct notifier_block *nb) { return -EOPNOTSUPP; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b8d3ddfe5853..9d164a518e38 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -728,7 +728,7 @@ static inline size_t fdb_nlmsg_size(void) static int br_fdb_replay_one(struct notifier_block *nb, struct net_bridge_fdb_entry *fdb, - struct net_device *dev) + struct net_device *dev, const void *ctx) { struct switchdev_notifier_fdb_info item; int err; @@ -739,13 +739,14 @@ static int br_fdb_replay_one(struct notifier_block *nb, item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item.info.dev = dev; + item.info.ctx = ctx; err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); return notifier_to_errno(err); } int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb) + const void *ctx, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; @@ -766,7 +767,7 @@ int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, if (dst_dev != br_dev && dst_dev != dev) continue; - err = br_fdb_replay_one(nb, fdb, dst_dev); + err = br_fdb_replay_one(nb, fdb, dst_dev, ctx); if (err) break; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 3f839a8cc9fb..8bc6afca5e8c 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -568,12 +568,13 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_mdb *mdb, - struct netlink_ext_ack *extack) + const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, + .ctx = ctx, }, .obj = &mdb->obj, }; @@ -603,7 +604,8 @@ static int br_mdb_queue_one(struct list_head *mdb_list, } int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack) + const void *ctx, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; @@ -664,7 +666,7 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, list_for_each_entry(obj, &mdb_list, list) { err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), - extack); + ctx, extack); if (err) goto out_free_mdb; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 8789a57af543..2bfa2a00e193 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1807,12 +1807,13 @@ out_kfree: static int br_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, - struct netlink_ext_ack *extack) + const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, + .ctx = ctx, }, .obj = &vlan->obj, }; @@ -1823,7 +1824,8 @@ static int br_vlan_replay_one(struct notifier_block *nb, } int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack) + const void *ctx, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; @@ -1868,7 +1870,7 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, if (!br_vlan_should_use(v)) continue; - err = br_vlan_replay_one(nb, dev, &vlan, extack); + err = br_vlan_replay_one(nb, dev, &vlan, ctx, extack); if (err) return err; } diff --git a/net/dsa/port.c b/net/dsa/port.c index 5c93f1e1a03d..339781c98de1 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -194,17 +194,17 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; - err = br_mdb_replay(br, brport_dev, + err = br_mdb_replay(br, brport_dev, dp, &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) return err; - err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier); + err = br_fdb_replay(br, brport_dev, dp, &dsa_slave_switchdev_notifier); if (err && err != -EOPNOTSUPP) return err; - err = br_vlan_replay(br, brport_dev, + err = br_vlan_replay(br, brport_dev, dp, &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3692259a025f..2f0d0a6b1f9c 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -278,6 +278,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, struct dsa_port *dp = dsa_slave_to_port(dev); int ret; + if (ctx && ctx != dp) + return 0; + switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) @@ -401,6 +404,9 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, struct dsa_port *dp = dsa_slave_to_port(dev); int err; + if (ctx && ctx != dp) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -475,6 +481,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, struct dsa_port *dp = dsa_slave_to_port(dev); int err; + if (ctx && ctx != dp) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) -- cgit v1.2.3-58-ga151 From bdf123b455ce596aec6e410ec36fe3687b6a2140 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:26 +0300 Subject: net: bridge: constify variables in the replay helpers Some of the arguments and local variables for the newly added switchdev replay helpers can be const, so let's make them so. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 14 +++++++------- net/bridge/br_fdb.c | 6 +++--- net/bridge/br_mdb.c | 8 ++++---- net/bridge/br_stp.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 57df761b6f4a..6b54da2c65ba 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -104,8 +104,8 @@ static inline bool br_multicast_router(const struct net_device *dev) { return false; } -static inline int br_mdb_replay(struct net_device *br_dev, - struct net_device *dev, const void *ctx, +static inline int br_mdb_replay(const struct net_device *br_dev, + const struct net_device *dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { @@ -166,8 +166,8 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, void br_fdb_clear_offload(const struct net_device *dev, u16 vid); bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); -clock_t br_get_ageing_time(struct net_device *br_dev); -int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, +clock_t br_get_ageing_time(const struct net_device *br_dev); +int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, const void *ctx, struct notifier_block *nb); #else static inline struct net_device * @@ -193,13 +193,13 @@ static inline u8 br_port_get_stp_state(const struct net_device *dev) return BR_STATE_DISABLED; } -static inline clock_t br_get_ageing_time(struct net_device *br_dev) +static inline clock_t br_get_ageing_time(const struct net_device *br_dev) { return 0; } -static inline int br_fdb_replay(struct net_device *br_dev, - struct net_device *dev, const void *ctx, +static inline int br_fdb_replay(const struct net_device *br_dev, + const struct net_device *dev, const void *ctx, struct notifier_block *nb) { return -EOPNOTSUPP; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 9d164a518e38..2e777c8b0921 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -727,7 +727,7 @@ static inline size_t fdb_nlmsg_size(void) } static int br_fdb_replay_one(struct notifier_block *nb, - struct net_bridge_fdb_entry *fdb, + const struct net_bridge_fdb_entry *fdb, struct net_device *dev, const void *ctx) { struct switchdev_notifier_fdb_info item; @@ -745,7 +745,7 @@ static int br_fdb_replay_one(struct notifier_block *nb, return notifier_to_errno(err); } -int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, +int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, const void *ctx, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; @@ -760,7 +760,7 @@ int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { - struct net_bridge_port *dst = READ_ONCE(fdb->dst); + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); struct net_device *dst_dev; dst_dev = dst ? dst->dev : br->dev; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 8bc6afca5e8c..cebdbff17b54 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -567,7 +567,7 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, } static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, - struct switchdev_obj_port_mdb *mdb, + const struct switchdev_obj_port_mdb *mdb, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { @@ -607,7 +607,7 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp; + const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; LIST_HEAD(mdb_list); @@ -634,8 +634,8 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, rcu_read_lock(); hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { - struct net_bridge_port_group __rcu **pp; - struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu * const *pp; + const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_mdb_queue_one(&mdb_list, diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3dafb6143cff..1d80f34a139c 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -639,9 +639,9 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) return 0; } -clock_t br_get_ageing_time(struct net_device *br_dev) +clock_t br_get_ageing_time(const struct net_device *br_dev) { - struct net_bridge *br; + const struct net_bridge *br; if (!netif_is_bridge_master(br_dev)) return 0; -- cgit v1.2.3-58-ga151 From 7e8c18586daf7c1653c4b43a8119bc9662ed8fa6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:27 +0300 Subject: net: bridge: allow the switchdev replay functions to be called for deletion When a switchdev port leaves a LAG that is a bridge port, the switchdev objects and port attributes offloaded to that port are not removed: ip link add br0 type bridge ip link add bond0 type bond mode 802.3ad ip link set swp0 master bond0 ip link set bond0 master br0 bridge vlan add dev bond0 vid 100 ip link set swp0 nomaster VLAN 100 will remain installed on swp0 despite it going into standalone mode, because as far as the bridge is concerned, nothing ever happened to its bridge port. Let's extend the bridge vlan, fdb and mdb replay functions to take a 'bool adding' argument, and make DSA and ocelot call the replay functions with 'adding' as false from the switchdev unsync path, for the switch port that leaves the bridge. Note that this patch in itself does not salvage anything, because in the current pull mode of operation, DSA still needs to call the replay helpers with adding=false. This will be done in another patch. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_net.c | 4 ++-- include/linux/if_bridge.h | 12 ++++++------ net/bridge/br_fdb.c | 15 +++++++++++---- net/bridge/br_mdb.c | 15 +++++++++++---- net/bridge/br_vlan.c | 15 +++++++++++---- net/dsa/port.c | 13 ++++++------- 6 files changed, 47 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 166d851962d2..3e89e34f86d5 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1175,12 +1175,12 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port, ageing_time = br_get_ageing_time(bridge_dev); ocelot_port_attr_ageing_set(ocelot, port, ageing_time); - err = br_mdb_replay(bridge_dev, brport_dev, priv, + err = br_mdb_replay(bridge_dev, brport_dev, priv, true, &ocelot_switchdev_blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; - err = br_vlan_replay(bridge_dev, brport_dev, priv, + err = br_vlan_replay(bridge_dev, brport_dev, priv, true, &ocelot_switchdev_blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 6b54da2c65ba..b651c5e32a28 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -71,7 +71,7 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); bool br_multicast_router(const struct net_device *dev); int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, struct notifier_block *nb, + const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack); #else static inline int br_multicast_list_adjacent(struct net_device *dev, @@ -106,7 +106,7 @@ static inline bool br_multicast_router(const struct net_device *dev) } static inline int br_mdb_replay(const struct net_device *br_dev, const struct net_device *dev, const void *ctx, - struct notifier_block *nb, + bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; @@ -121,7 +121,7 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto); int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, struct notifier_block *nb, + const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack); #else static inline bool br_vlan_enabled(const struct net_device *dev) @@ -152,7 +152,7 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, static inline int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, - struct notifier_block *nb, + bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; @@ -168,7 +168,7 @@ bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); clock_t br_get_ageing_time(const struct net_device *br_dev); int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, - const void *ctx, struct notifier_block *nb); + const void *ctx, bool adding, struct notifier_block *nb); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -200,7 +200,7 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev) static inline int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, const void *ctx, - struct notifier_block *nb) + bool adding, struct notifier_block *nb) { return -EOPNOTSUPP; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 2e777c8b0921..16f9434fdb5d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -728,7 +728,8 @@ static inline size_t fdb_nlmsg_size(void) static int br_fdb_replay_one(struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, - struct net_device *dev, const void *ctx) + struct net_device *dev, unsigned long action, + const void *ctx) { struct switchdev_notifier_fdb_info item; int err; @@ -741,15 +742,16 @@ static int br_fdb_replay_one(struct notifier_block *nb, item.info.dev = dev; item.info.ctx = ctx; - err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); + err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, - const void *ctx, struct notifier_block *nb) + const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; + unsigned long action; int err = 0; if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) @@ -757,6 +759,11 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, br = netdev_priv(br_dev); + if (adding) + action = SWITCHDEV_FDB_ADD_TO_DEVICE; + else + action = SWITCHDEV_FDB_DEL_TO_DEVICE; + rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { @@ -767,7 +774,7 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, if (dst_dev != br_dev && dst_dev != dev) continue; - err = br_fdb_replay_one(nb, fdb, dst_dev, ctx); + err = br_fdb_replay_one(nb, fdb, dst_dev, action, ctx); if (err) break; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index cebdbff17b54..17a720b4473f 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -568,7 +568,8 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, const struct switchdev_obj_port_mdb *mdb, - const void *ctx, struct netlink_ext_ack *extack) + unsigned long action, const void *ctx, + struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { @@ -580,7 +581,7 @@ static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, }; int err; - err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } @@ -604,12 +605,13 @@ static int br_mdb_queue_one(struct list_head *mdb_list, } int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, struct notifier_block *nb, + const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; + unsigned long action; LIST_HEAD(mdb_list); int err = 0; @@ -664,9 +666,14 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, rcu_read_unlock(); + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + list_for_each_entry(obj, &mdb_list, list) { err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), - ctx, extack); + action, ctx, extack); if (err) goto out_free_mdb; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 2bfa2a00e193..a08e9f193009 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1807,7 +1807,8 @@ out_kfree: static int br_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, - const void *ctx, struct netlink_ext_ack *extack) + const void *ctx, unsigned long action, + struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { @@ -1819,18 +1820,19 @@ static int br_vlan_replay_one(struct notifier_block *nb, }; int err; - err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, struct notifier_block *nb, + const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; struct net_bridge *br; + unsigned long action; int err = 0; u16 pvid; @@ -1857,6 +1859,11 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, if (!vg) return 0; + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { @@ -1870,7 +1877,7 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, if (!br_vlan_should_use(v)) continue; - err = br_vlan_replay_one(nb, dev, &vlan, ctx, extack); + err = br_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); if (err) return err; } diff --git a/net/dsa/port.c b/net/dsa/port.c index 339781c98de1..4e58d07ececd 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -194,19 +194,18 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; - err = br_mdb_replay(br, brport_dev, dp, - &dsa_slave_switchdev_blocking_notifier, - extack); + err = br_mdb_replay(br, brport_dev, dp, true, + &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) return err; - err = br_fdb_replay(br, brport_dev, dp, &dsa_slave_switchdev_notifier); + err = br_fdb_replay(br, brport_dev, dp, true, + &dsa_slave_switchdev_notifier); if (err && err != -EOPNOTSUPP) return err; - err = br_vlan_replay(br, brport_dev, dp, - &dsa_slave_switchdev_blocking_notifier, - extack); + err = br_vlan_replay(br, brport_dev, dp, true, + &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) return err; -- cgit v1.2.3-58-ga151 From 4ede74e73b5b540b2a20bb6d5ad4d69348ba51fc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:28 +0300 Subject: net: dsa: refactor the prechangeupper sanity checks into a dedicated function We need to add more logic to the DSA NETDEV_PRECHANGEUPPER event handler, more exactly we need to request an unsync of switchdev objects. In order to fit more code, refactor the existing logic into a helper. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/slave.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2f0d0a6b1f9c..20d8466d78f2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2166,6 +2166,32 @@ dsa_slave_check_8021q_upper(struct net_device *dev, return NOTIFY_DONE; } +static int +dsa_slave_prechangeupper_sanity_check(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct dsa_switch *ds; + struct dsa_port *dp; + int err; + + if (!dsa_slave_dev_check(dev)) + return dsa_prevent_bridging_8021q_upper(dev, info); + + dp = dsa_slave_to_port(dev); + ds = dp->ds; + + if (ds->ops->port_prechangeupper) { + err = ds->ops->port_prechangeupper(ds, dp->index, info); + if (err) + return notifier_from_errno(err); + } + + if (is_vlan_dev(info->upper_dev)) + return dsa_slave_check_8021q_upper(dev, info); + + return NOTIFY_DONE; +} + static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -2174,24 +2200,12 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; - struct dsa_switch *ds; - struct dsa_port *dp; int err; - if (!dsa_slave_dev_check(dev)) - return dsa_prevent_bridging_8021q_upper(dev, ptr); - - dp = dsa_slave_to_port(dev); - ds = dp->ds; - - if (ds->ops->port_prechangeupper) { - err = ds->ops->port_prechangeupper(ds, dp->index, info); - if (err) - return notifier_from_errno(err); - } + err = dsa_slave_prechangeupper_sanity_check(dev, info); + if (err != NOTIFY_DONE) + return err; - if (is_vlan_dev(info->upper_dev)) - return dsa_slave_check_8021q_upper(dev, ptr); break; } case NETDEV_CHANGEUPPER: -- cgit v1.2.3-58-ga151 From 7491894532341cff11babd1fe3bd68537166bcc4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 27 Jun 2021 14:54:29 +0300 Subject: net: dsa: replay a deletion of switchdev objects for ports leaving a bridged LAG When a DSA switch port leaves a bonding interface that is under a bridge, there might be dangling switchdev objects on that port left behind, because the bridge is not aware that its lower interface (the bond) changed state in any way. Call the bridge replay helpers with adding=false before changing dp->bridge_dev to NULL, because we need to simulate to dsa_slave_port_obj_del() that these notifications were emitted by the bridge. We add this hook to the NETDEV_PRECHANGEUPPER event handler, because we are calling into switchdev (and the __switchdev_handle_port_obj_del fanout helpers expect the upper/lower adjacency lists to still be valid) and PRECHANGEUPPER is the last moment in time when they still are. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++++ net/dsa/port.c | 45 ++++++++++++++++++++++++++++++++++++++++++-- net/dsa/slave.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b0811253d101..c8712942002f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -188,12 +188,16 @@ void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack); +int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, + struct netlink_ext_ack *extack); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_lag_change(struct dsa_port *dp, struct netdev_lag_lower_state_info *linfo); int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack); +int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev, + struct netlink_ext_ack *extack); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack); diff --git a/net/dsa/port.c b/net/dsa/port.c index 4e58d07ececd..46089dd2b2ec 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -212,7 +212,33 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, return 0; } -static void dsa_port_switchdev_unsync(struct dsa_port *dp) +static int dsa_port_switchdev_unsync_objs(struct dsa_port *dp, + struct net_device *br, + struct netlink_ext_ack *extack) +{ + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + int err; + + /* Delete the switchdev objects left on this port */ + err = br_mdb_replay(br, brport_dev, dp, false, + &dsa_slave_switchdev_blocking_notifier, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_fdb_replay(br, brport_dev, dp, false, + &dsa_slave_switchdev_notifier); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_vlan_replay(br, brport_dev, dp, false, + &dsa_slave_switchdev_blocking_notifier, extack); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp) { /* Configure the port for standalone mode (no address learning, * flood everything). @@ -278,6 +304,12 @@ out_rollback: return err; } +int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, + struct netlink_ext_ack *extack) +{ + return dsa_port_switchdev_unsync_objs(dp, br, extack); +} + void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -297,7 +329,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); - dsa_port_switchdev_unsync(dp); + dsa_port_switchdev_unsync_attrs(dp); } int dsa_port_lag_change(struct dsa_port *dp, @@ -365,6 +397,15 @@ err_lag_join: return err; } +int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag, + struct netlink_ext_ack *extack) +{ + if (dp->bridge_dev) + return dsa_port_pre_bridge_leave(dp, dp->bridge_dev, extack); + + return 0; +} + void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) { struct dsa_notifier_lag_info info = { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 20d8466d78f2..898ed9cf756f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2077,6 +2077,26 @@ static int dsa_slave_changeupper(struct net_device *dev, return err; } +static int dsa_slave_prechangeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct netlink_ext_ack *extack; + int err = 0; + + extack = netdev_notifier_info_to_extack(&info->info); + + if (netif_is_bridge_master(info->upper_dev) && !info->linking) + err = dsa_port_pre_bridge_leave(dp, info->upper_dev, extack); + else if (netif_is_lag_master(info->upper_dev) && !info->linking) + err = dsa_port_pre_lag_leave(dp, info->upper_dev, extack); + /* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be + * meaningfully enslaved to a bridge yet + */ + + return notifier_from_errno(err); +} + static int dsa_slave_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) @@ -2103,6 +2123,35 @@ dsa_slave_lag_changeupper(struct net_device *dev, return err; } +/* Same as dsa_slave_lag_changeupper() except that it calls + * dsa_slave_prechangeupper() + */ +static int +dsa_slave_lag_prechangeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *lower; + struct list_head *iter; + int err = NOTIFY_DONE; + struct dsa_port *dp; + + netdev_for_each_lower_dev(dev, lower, iter) { + if (!dsa_slave_dev_check(lower)) + continue; + + dp = dsa_slave_to_port(lower); + if (!dp->lag_dev) + /* Software LAG */ + continue; + + err = dsa_slave_prechangeupper(lower, info); + if (notifier_to_errno(err)) + break; + } + + return err; +} + static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) @@ -2206,6 +2255,12 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, if (err != NOTIFY_DONE) return err; + if (dsa_slave_dev_check(dev)) + return dsa_slave_prechangeupper(dev, ptr); + + if (netif_is_lag_master(dev)) + return dsa_slave_lag_prechangeupper(dev, ptr); + break; } case NETDEV_CHANGEUPPER: -- cgit v1.2.3-58-ga151 From 1fd07f33c3ea2b4aa77426f13e8cb91d4f55af8f Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Sat, 26 Jun 2021 09:07:46 -0500 Subject: ipv6: ICMPV6: add response to ICMPV6 RFC 8335 PROBE messages This patch builds off of commit 2b246b2569cd2ac6ff700d0dce56b8bae29b1842 and adds functionality to respond to ICMPV6 PROBE requests. Add icmp_build_probe function to construct PROBE requests for both ICMPV4 and ICMPV6. Modify icmpv6_rcv to detect ICMPV6 PROBE messages and call the icmpv6_echo_reply handler. Modify icmpv6_echo_reply to build a PROBE response message based on the queried interface. This patch has been tested using a branch of the iputils git repo which can be found here: https://github.com/Juniper-Clinic-2020/iputils/tree/probe-request Signed-off-by: Andreas Roeseler Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/icmp.h | 1 + net/ipv4/icmp.c | 63 +++++++++++++++++++++++++++++++++++------------------- net/ipv6/icmp.c | 21 +++++++++++++++--- 3 files changed, 60 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/icmp.h b/include/net/icmp.h index fd84adc47963..caddf4a59ad1 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -57,5 +57,6 @@ int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); void icmp_out_count(struct net *net, unsigned char type); +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr); #endif /* _ICMP_H */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0a57f1892e7e..c695d294a5df 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -993,14 +993,8 @@ static bool icmp_redirect(struct sk_buff *skb) static bool icmp_echo(struct sk_buff *skb) { - struct icmp_ext_hdr *ext_hdr, _ext_hdr; - struct icmp_ext_echo_iio *iio, _iio; struct icmp_bxm icmp_param; - struct net_device *dev; - char buff[IFNAMSIZ]; struct net *net; - u16 ident_len; - u8 status; net = dev_net(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ @@ -1013,20 +1007,46 @@ static bool icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) { + if (icmp_param.data.icmph.type == ICMP_ECHO) icmp_param.data.icmph.type = ICMP_ECHOREPLY; - goto send_reply; - } - if (!net->ipv4.sysctl_icmp_echo_enable_probe) + else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) return true; + + icmp_reply(&icmp_param, skb); + return true; +} + +/* Helper for icmp_echo and icmpv6_echo_reply. + * Searches for net_device that matches PROBE interface identifier + * and builds PROBE reply message in icmphdr. + * + * Returns false if PROBE responses are disabled via sysctl + */ + +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) +{ + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct net *net = dev_net(skb->dev); + struct net_device *dev; + char buff[IFNAMSIZ]; + u16 ident_len; + u8 status; + + if (!net->ipv4.sysctl_icmp_echo_enable_probe) + return false; + /* We currently only support probing interfaces on the proxy node * Check to ensure L-bit is set */ - if (!(ntohs(icmp_param.data.icmph.un.echo.sequence) & 1)) - return true; + if (!(ntohs(icmphdr->un.echo.sequence) & 1)) + return false; /* Clear status bits in reply message */ - icmp_param.data.icmph.un.echo.sequence &= htons(0xFF00); - icmp_param.data.icmph.type = ICMP_EXT_ECHOREPLY; + icmphdr->un.echo.sequence &= htons(0xFF00); + if (icmphdr->type == ICMP_EXT_ECHO) + icmphdr->type = ICMP_EXT_ECHOREPLY; + else + icmphdr->type = ICMPV6_EXT_ECHO_REPLY; ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); /* Size of iio is class_type dependent. * Only check header here and assign length based on ctype in the switch statement @@ -1087,8 +1107,8 @@ static bool icmp_echo(struct sk_buff *skb) goto send_mal_query; } if (!dev) { - icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_NO_IF; + return true; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) @@ -1098,14 +1118,13 @@ static bool icmp_echo(struct sk_buff *skb) if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); - icmp_param.data.icmph.un.echo.sequence |= htons(status); -send_reply: - icmp_reply(&icmp_param, skb); - return true; + icmphdr->un.echo.sequence |= htons(status); + return true; send_mal_query: - icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_MAL_QUERY; + return true; } +EXPORT_SYMBOL_GPL(icmp_build_probe); /* * Handle ICMP Timestamp requests. diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index e8398ffb5e35..a7c31ab67c5d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -725,6 +725,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); bool acast; + u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) @@ -740,8 +741,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb) !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; + if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) + type = ICMPV6_EXT_ECHO_REPLY; + else + type = ICMPV6_ECHO_REPLY; + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); - tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) @@ -752,7 +758,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (saddr) fl6.saddr = *saddr; fl6.flowi6_oif = icmp6_iif(skb); - fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); @@ -783,13 +789,17 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.skb = skb; msg.offset = 0; - msg.type = ICMPV6_ECHO_REPLY; + msg.type = type; ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; + if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) + if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr)) + goto out_dst_release; + if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, @@ -911,6 +921,11 @@ static int icmpv6_rcv(struct sk_buff *skb) if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) icmpv6_echo_reply(skb); break; + case ICMPV6_EXT_ECHO_REQUEST: + if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && + net->ipv4.sysctl_icmp_echo_enable_probe) + icmpv6_echo_reply(skb); + break; case ICMPV6_ECHO_REPLY: success = ping_rcv(skb); -- cgit v1.2.3-58-ga151 From 0c5dc070ff3d6246d22ddd931f23a6266249e3db Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:41 -0300 Subject: sctp: validate from_addr_param return Ilja reported that, simply putting it, nothing was validating that from_addr_param functions were operating on initialized memory. That is, the parameter itself was being validated by sctp_walk_params, but it doesn't check for types and their specific sizes and it could be a 0-length one, causing from_addr_param to potentially work over the next parameter or even uninitialized memory. The fix here is to, in all calls to from_addr_param, check if enough space is there for the wanted IP address type. Reported-by: Ilja Van Sprundel Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/bind_addr.c | 19 +++++++++++-------- net/sctp/input.c | 6 ++++-- net/sctp/ipv6.c | 7 ++++++- net/sctp/protocol.c | 7 ++++++- net/sctp/sm_make_chunk.c | 29 ++++++++++++++++------------- 6 files changed, 44 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 1aa585216f34..d49593c72a55 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -461,7 +461,7 @@ struct sctp_af { int saddr); void (*from_sk) (union sctp_addr *, struct sock *sk); - void (*from_addr_param) (union sctp_addr *, + bool (*from_addr_param) (union sctp_addr *, union sctp_addr_param *, __be16 port, int iif); int (*to_addr_param) (const union sctp_addr *, diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 53e5ed79f63f..59e653b528b1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -270,22 +270,19 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); - if (unlikely(!af)) { + if (unlikely(!af) || + !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; - sctp_bind_addr_clean(bp); - break; + goto out_err; } - af->from_addr_param(&addr, rawaddr, htons(port), 0); if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); - if (retval) { + if (retval) /* Can't finish building the list, clean up. */ - sctp_bind_addr_clean(bp); - break; - } + goto out_err; next: len = ntohs(param->length); @@ -294,6 +291,12 @@ next: } return retval; + +out_err: + if (retval) + sctp_bind_addr_clean(bp); + + return retval; } /******************************************************************** diff --git a/net/sctp/input.c b/net/sctp/input.c index d508f6f3dd08..8924e2e142c8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1131,7 +1131,8 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af) continue; - af->from_addr_param(paddr, params.addr, sh->source, 0); + if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) + continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp); if (asoc) @@ -1174,7 +1175,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (unlikely(!af)) return NULL; - af->from_addr_param(&paddr, param, peer_port, 0); + if (af->from_addr_param(&paddr, param, peer_port, 0)) + return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index bd08807c9e44..5c6f5ced9cfa 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -551,15 +551,20 @@ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v6_from_addr_param(union sctp_addr *addr, +static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) + return false; + addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6f2bbfeec3a4..25192b378e2e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -254,14 +254,19 @@ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v4_from_addr_param(union sctp_addr *addr, +static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) + return false; + addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5b44d228b6ca..f33a870b483d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2346,11 +2346,13 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* Process the initialization parameters. */ sctp_walk_params(param, peer_init, init_hdr.params) { - if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || - param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { + if (!src_match && + (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, - chunk->sctp_hdr->source, 0); + if (!af->from_addr_param(&addr, param.addr, + chunk->sctp_hdr->source, 0)) + continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } @@ -2531,7 +2533,8 @@ static int sctp_process_param(struct sctp_association *asoc, break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) + break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) @@ -2632,15 +2635,13 @@ do_addr_param: addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - if (af == NULL) + if (!af) break; - af->from_addr_param(&addr, addr_param, - htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0)) + break; - /* if the address is invalid, we can't process it. - * XXX: see spec for what to do. - */ if (!af->addr_valid(&addr, NULL, NULL)) break; @@ -3054,7 +3055,8 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; - af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) + return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. @@ -3331,7 +3333,8 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - af->from_addr_param(&addr, addr_param, htons(bp->port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) + return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: -- cgit v1.2.3-58-ga151 From 50619dbf8db77e98d821d615af4f634d08e22698 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:42 -0300 Subject: sctp: add size validation when walking chunks The first chunk in a packet is ensured to be present at the beginning of sctp_rcv(), as a packet needs to have at least 1 chunk. But the second one, may not be completely available and ch->length can be over uninitialized memory. Fix here is by only trying to walk on the next chunk if there is enough to hold at least the header, and then proceed with the ch->length validation that is already there. Reported-by: Ilja Van Sprundel Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 8924e2e142c8..f72bff93745c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1247,7 +1247,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; - } while (ch_end < skb_tail_pointer(skb)); + } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } -- cgit v1.2.3-58-ga151 From b6ffe7671b24689c09faa5675dd58f93758a97ae Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:43 -0300 Subject: sctp: validate chunk size in __rcv_asconf_lookup In one of the fallbacks that SCTP has for identifying an association for an incoming packet, it looks for AddIp chunk (from ASCONF) and take a peek. Thing is, at this stage nothing was validating that the chunk actually had enough content for that, allowing the peek to happen over uninitialized memory. Similar check already exists in actual asconf handling in sctp_verify_asconf(). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index f72bff93745c..96dea8097dbe 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1168,6 +1168,9 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( union sctp_addr_param *param; union sctp_addr paddr; + if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) + return NULL; + /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); -- cgit v1.2.3-58-ga151 From ef6c8d6ccf0c1dccdda092ebe8782777cd7803c9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:44 -0300 Subject: sctp: add param size validation for SCTP_PARAM_SET_PRIMARY When SCTP handles an INIT chunk, it calls for example: sctp_sf_do_5_1B_init sctp_verify_init sctp_verify_param sctp_process_init sctp_process_param handling of SCTP_PARAM_SET_PRIMARY sctp_verify_init() wasn't doing proper size validation and neither the later handling, allowing it to work over the chunk itself, possibly being uninitialized memory. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f33a870b483d..587fb3cb88e2 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2166,9 +2166,16 @@ static enum sctp_ierror sctp_verify_param(struct net *net, break; case SCTP_PARAM_SET_PRIMARY: - if (ep->asconf_enable) - break; - goto unhandled; + if (!ep->asconf_enable) + goto unhandled; + + if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + + sizeof(struct sctp_paramhdr)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ -- cgit v1.2.3-58-ga151 From f7458934b0791c39a001e4d902fc3bf697b439b5 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 26 Jun 2021 22:18:04 +0200 Subject: net: bridge: mrp: Update the Test frames for MRA According to the standard IEC 62439-2, in case the node behaves as MRA and needs to send Test frames on ring ports, then these Test frames need to have an Option TLV and a Sub-Option TLV which has the type AUTO_MGR. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 27 +++++++++++++++++++++++++++ net/bridge/br_private_mrp.h | 11 +++++++++++ 2 files changed, 38 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index f7012b7d7ce4..fd2de35ffb3c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -204,6 +204,33 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); br_mrp_skb_common(skb, mrp); + + /* In case the node behaves as MRA then the Test frame needs to have + * an Option TLV which includes eventually a sub-option TLV that has + * the type AUTO_MGR + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + struct br_mrp_sub_option1_hdr *sub_opt = NULL; + struct br_mrp_tlv_hdr *sub_tlv = NULL; + struct br_mrp_oui_hdr *oui = NULL; + u8 length; + + length = sizeof(*sub_opt) + sizeof(*sub_tlv) + sizeof(oui) + + MRP_OPT_PADDING; + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_OPTION, length); + + oui = skb_put(skb, sizeof(*oui)); + memset(oui, 0x0, sizeof(*oui)); + sub_opt = skb_put(skb, sizeof(*sub_opt)); + memset(sub_opt, 0x0, sizeof(*sub_opt)); + + sub_tlv = skb_put(skb, sizeof(*sub_tlv)); + sub_tlv->type = BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR; + + /* 32 bit alligment shall be ensured therefore add 2 bytes */ + skb_put(skb, MRP_OPT_PADDING); + } + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); return skb; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 9559aa2750fb..bda8e1896712 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -6,6 +6,8 @@ #include "br_private.h" #include +#define MRP_OPT_PADDING 0x2 + struct br_mrp { /* list of mrp instances */ struct hlist_node list; @@ -134,4 +136,13 @@ struct br_mrp_in_test_hdr { __be32 timestamp; } __attribute__((__packed__)); +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + +struct br_mrp_sub_option1_hdr { + __u8 type; + __u8 data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + #endif /* _BR_PRIVATE_MRP_H */ -- cgit v1.2.3-58-ga151 From 127d7355abb355b05ff4b42d6e18cc97aa9d1d11 Mon Sep 17 00:00:00 2001 From: Tanner Love Date: Mon, 28 Jun 2021 09:50:07 -0400 Subject: net: update netdev_rx_csum_fault() print dump only once Printing this stack dump multiple times does not provide additional useful information, and consumes time in the data path. Printing once is sufficient. Changes v2: Format indentation properly Signed-off-by: Tanner Love Acked-by: Eric Dumazet Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 991d09b67bd9..d609366da95c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -148,6 +148,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -3487,13 +3488,16 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG +static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) +{ + pr_err("%s: hw csum failure\n", dev ? dev->name : ""); + skb_dump(KERN_ERR, skb, true); + dump_stack(); +} + void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (net_ratelimit()) { - pr_err("%s: hw csum failure\n", dev ? dev->name : ""); - skb_dump(KERN_ERR, skb, true); - dump_stack(); - } + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif -- cgit v1.2.3-58-ga151 From 3e19ae7c6fd62978ae518b17ae0e30ab8d17ed07 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:44 +0300 Subject: net: bridge: use READ_ONCE() and WRITE_ONCE() compiler barriers for fdb->dst Annotate the writer side of fdb->dst: - fdb_create() - br_fdb_update() - fdb_add_entry() - br_fdb_external_learn_add() with WRITE_ONCE() and the reader side: - br_fdb_test_addr() - br_fdb_update() - fdb_fill_info() - fdb_add_entry() - fdb_delete_by_addr_and_port() - br_fdb_external_learn_add() - br_switchdev_fdb_notify() with compiler barriers such that the readers do not attempt to reload fdb->dst multiple times, leading to potentially different destination ports when the fdb entry is updated concurrently. This is especially important in read-side sections where fdb->dst is used more than once, but let's convert all accesses for the sake of uniformity. Suggested-by: Nikolay Aleksandrov Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 28 +++++++++++++++++----------- net/bridge/br_switchdev.c | 7 ++++--- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 16f9434fdb5d..dc3ecf2d5637 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -440,9 +440,14 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) if (!port) ret = 0; else { + const struct net_bridge_port *dst = NULL; + fdb = br_fdb_find_rcu(port->br, addr, 0); - ret = fdb && fdb->dst && fdb->dst->dev != dev && - fdb->dst->state == BR_STATE_FORWARDING; + if (fdb) + dst = READ_ONCE(fdb->dst); + + ret = dst && dst->dev != dev && + dst->state == BR_STATE_FORWARDING; } rcu_read_unlock(); @@ -509,7 +514,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (fdb) { memcpy(fdb->key.addr.addr, addr, ETH_ALEN); - fdb->dst = source; + WRITE_ONCE(fdb->dst, source); fdb->key.vlan_id = vid; fdb->flags = flags; fdb->updated = fdb->used = jiffies; @@ -600,10 +605,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && + if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { br_switchdev_fdb_notify(fdb, RTM_DELNEIGH); - fdb->dst = source; + WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, @@ -650,6 +655,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, u32 portid, u32 seq, int type, unsigned int flags) { + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -665,7 +671,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_pad2 = 0; ndm->ndm_flags = 0; ndm->ndm_type = 0; - ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; + ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) @@ -964,8 +970,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (flags & NLM_F_EXCL) return -EEXIST; - if (fdb->dst != source) { - fdb->dst = source; + if (READ_ONCE(fdb->dst) != source) { + WRITE_ONCE(fdb->dst, source); modified = true; } } @@ -1132,7 +1138,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br, struct net_bridge_fdb_entry *fdb; fdb = br_fdb_find(br, addr, vlan); - if (!fdb || fdb->dst != p) + if (!fdb || READ_ONCE(fdb->dst) != p) return -ENOENT; fdb_delete(br, fdb, true); @@ -1281,8 +1287,8 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, } else { fdb->updated = jiffies; - if (fdb->dst != p) { - fdb->dst = p; + if (READ_ONCE(fdb->dst) != p) { + WRITE_ONCE(fdb->dst, p); modified = true; } diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index a5e601e41cb9..192293fe37fd 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -110,6 +110,7 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); struct switchdev_notifier_fdb_info info = { .addr = fdb->key.addr.addr, .vid = fdb->key.vlan_id, @@ -118,17 +119,17 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), }; - if (!fdb->dst) + if (!dst) return; switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, - fdb->dst->dev, &info.info, NULL); + dst->dev, &info.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, - fdb->dst->dev, &info.info, NULL); + dst->dev, &info.info, NULL); break; } } -- cgit v1.2.3-58-ga151 From 6eb38bf8eb90748dbf4191f6c4940ae76223b0a4 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 29 Jun 2021 17:06:45 +0300 Subject: net: bridge: switchdev: send FDB notifications for host addresses Treat addresses added to the bridge itself in the same way as regular ports and send out a notification so that drivers may sync it down to the hardware FDB. Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 4 ++-- net/bridge/br_private.h | 7 ++++--- net/bridge/br_switchdev.c | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index dc3ecf2d5637..bad7e84d76af 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -607,7 +607,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, /* fastpath: update of existing entry */ if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { - br_switchdev_fdb_notify(fdb, RTM_DELNEIGH); + br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH); WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ @@ -800,7 +800,7 @@ static void fdb_notify(struct net_bridge *br, int err = -ENOBUFS; if (swdev_notify) - br_switchdev_fdb_notify(fdb, type); + br_switchdev_fdb_notify(br, fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a684d0cfc58c..2b48b204205e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1654,8 +1654,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack); -void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, - int type); +void br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); @@ -1702,7 +1702,8 @@ static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) } static inline void -br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type) { } diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 192293fe37fd..d3adee0f91f9 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -108,9 +108,11 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, } void -br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type) { const struct net_bridge_port *dst = READ_ONCE(fdb->dst); + struct net_device *dev = dst ? dst->dev : br->dev; struct switchdev_notifier_fdb_info info = { .addr = fdb->key.addr.addr, .vid = fdb->key.vlan_id, @@ -119,17 +121,14 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), }; - if (!dst) - return; - switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, - dst->dev, &info.info, NULL); + dev, &info.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, - dst->dev, &info.info, NULL); + dev, &info.info, NULL); break; } } -- cgit v1.2.3-58-ga151 From f851a721a638316a8257459db8359f2930d4b473 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:46 +0300 Subject: net: bridge: allow br_fdb_replay to be called for the bridge device When a port joins a bridge which already has local FDB entries pointing to the bridge device itself, we would like to offload those, so allow the "dev" argument to be equal to the bridge too. The code already does what we need in that case. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index bad7e84d76af..2b862cffc03a 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -760,7 +760,10 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, unsigned long action; int err = 0; - if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + if (!netif_is_bridge_master(br_dev)) + return -EINVAL; + + if (!netif_is_bridge_port(dev) && !netif_is_bridge_master(dev)) return -EINVAL; br = netdev_priv(br_dev); -- cgit v1.2.3-58-ga151 From b117e1e8a86d363fc1ad53df8d2c47884d2c0048 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:47 +0300 Subject: net: dsa: delete dsa_legacy_fdb_add and dsa_legacy_fdb_del We want to add reference counting for FDB entries in cross-chip topologies, and in order for that to have any chance of working and not be unbalanced (leading to entries which are never deleted), we need to ensure that higher layers are sane, because if they aren't, it's garbage in, garbage out. For example, if we add a bridge FDB entry twice, the bridge properly errors out: $ bridge fdb add dev swp0 00:01:02:03:04:07 master static $ bridge fdb add dev swp0 00:01:02:03:04:07 master static RTNETLINK answers: File exists However, the same thing cannot be said about the bridge bypass operations: $ bridge fdb add dev swp0 00:01:02:03:04:07 $ bridge fdb add dev swp0 00:01:02:03:04:07 $ bridge fdb add dev swp0 00:01:02:03:04:07 $ bridge fdb add dev swp0 00:01:02:03:04:07 $ echo $? 0 But one 'bridge fdb del' is enough to remove the entry, no matter how many times it was added. The bridge bypass operations are impossible to maintain in these circumstances and lack of support for reference counting the cross-chip notifiers is holding us back from making further progress, so just drop support for them. The only way left for users to install static bridge FDB entries is the proper one, using the "master static" flags. With this change, rtnl_fdb_add() falls back to calling ndo_dflt_fdb_add() which uses the duplicate-exclusive variant of dev_uc_add(): dev_uc_add_excl(). Because DSA does not (yet) declare IFF_UNICAST_FLT, this results in us going to promiscuous mode: $ bridge fdb add dev swp0 00:01:02:03:04:05 [ 28.206743] device swp0 entered promiscuous mode $ bridge fdb add dev swp0 00:01:02:03:04:05 RTNETLINK answers: File exists So even if it does not completely fail, there is at least some indication that it is behaving differently from before, and closer to user space expectations, I would argue (the lack of a "local|static" specifier defaults to "local", or "host-only", so dev_uc_add() is a reasonable default implementation). If the generic implementation of .ndo_fdb_add provided by Vlad Yasevich is a proof of anything, it only proves that the implementation provided by DSA was always wrong, by not looking at "ndm->ndm_state & NUD_NOARP" (the "static" flag which means that the FDB entry points outwards) and "ndm->ndm_state & NUD_PERMANENT" (the "local" flag which means that the FDB entry points towards the host). It all used to mean the same thing to DSA. Update the documentation so that the users are not confused about what's going on. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/dsa/configuration.rst | 68 ++++++++++++++++++++++++++ net/dsa/slave.c | 23 --------- 2 files changed, 68 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst index 774f0e76c746..2b08f1a772d3 100644 --- a/Documentation/networking/dsa/configuration.rst +++ b/Documentation/networking/dsa/configuration.rst @@ -292,3 +292,71 @@ configuration. # bring up the bridge devices ip link set br0 up + +Forwarding database (FDB) management +------------------------------------ + +The existing DSA switches do not have the necessary hardware support to keep +the software FDB of the bridge in sync with the hardware tables, so the two +tables are managed separately (``bridge fdb show`` queries both, and depending +on whether the ``self`` or ``master`` flags are being used, a ``bridge fdb +add`` or ``bridge fdb del`` command acts upon entries from one or both tables). + +Up until kernel v4.14, DSA only supported user space management of bridge FDB +entries using the bridge bypass operations (which do not update the software +FDB, just the hardware one) using the ``self`` flag (which is optional and can +be omitted). + + .. code-block:: sh + + bridge fdb add dev swp0 00:01:02:03:04:05 self static + # or shorthand + bridge fdb add dev swp0 00:01:02:03:04:05 static + +Due to a bug, the bridge bypass FDB implementation provided by DSA did not +distinguish between ``static`` and ``local`` FDB entries (``static`` are meant +to be forwarded, while ``local`` are meant to be locally terminated, i.e. sent +to the host port). Instead, all FDB entries with the ``self`` flag (implicit or +explicit) are treated by DSA as ``static`` even if they are ``local``. + + .. code-block:: sh + + # This command: + bridge fdb add dev swp0 00:01:02:03:04:05 static + # behaves the same for DSA as this command: + bridge fdb add dev swp0 00:01:02:03:04:05 local + # or shorthand, because the 'local' flag is implicit if 'static' is not + # specified, it also behaves the same as: + bridge fdb add dev swp0 00:01:02:03:04:05 + +The last command is an incorrect way of adding a static bridge FDB entry to a +DSA switch using the bridge bypass operations, and works by mistake. Other +drivers will treat an FDB entry added by the same command as ``local`` and as +such, will not forward it, as opposed to DSA. + +Between kernel v4.14 and v5.14, DSA has supported in parallel two modes of +adding a bridge FDB entry to the switch: the bridge bypass discussed above, as +well as a new mode using the ``master`` flag which installs FDB entries in the +software bridge too. + + .. code-block:: sh + + bridge fdb add dev swp0 00:01:02:03:04:05 master static + +Since kernel v5.14, DSA has gained stronger integration with the bridge's +software FDB, and the support for its bridge bypass FDB implementation (using +the ``self`` flag) has been removed. This results in the following changes: + + .. code-block:: sh + + # This is the only valid way of adding an FDB entry that is supported, + # compatible with v4.14 kernels and later: + bridge fdb add dev swp0 00:01:02:03:04:05 master static + # This command is no longer buggy and the entry is properly treated as + # 'local' instead of being forwarded: + bridge fdb add dev swp0 00:01:02:03:04:05 + # This command no longer installs a static FDB entry to hardware: + bridge fdb add dev swp0 00:01:02:03:04:05 static + +Script writers are therefore encouraged to use the ``master static`` set of +flags when working with bridge FDB entries on DSA switch interfaces. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 898ed9cf756f..64acb1e11cd7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1651,27 +1651,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .self_test = dsa_slave_net_selftest, }; -/* legacy way, bypassing the bridge *****************************************/ -static int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags, - struct netlink_ext_ack *extack) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_add(dp, addr, vid); -} - -static int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_del(dp, addr, vid); -} - static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1713,8 +1692,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = dsa_legacy_fdb_add, - .ndo_fdb_del = dsa_legacy_fdb_del, .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, -- cgit v1.2.3-58-ga151 From b8e997c490036f38d48687415fd1367e00e98fb9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:49 +0300 Subject: net: dsa: introduce a separate cross-chip notifier type for host MDBs Commit abd49535c380 ("net: dsa: execute dsa_switch_mdb_add only for routing port in cross-chip topologies") does a surprisingly good job even for the SWITCHDEV_OBJ_ID_HOST_MDB use case, where DSA simply translates a switchdev object received on dp into a cross-chip notifier for dp->cpu_dp. To visualize how that works, imagine the daisy chain topology below and consider a SWITCHDEV_OBJ_ID_HOST_MDB object emitted on sw2p0. How does the cross-chip notifier know to match on all the right ports (sw0p4, the dedicated CPU port, sw1p4, an upstream DSA link, and sw2p4, another upstream DSA link)? | sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] [ ] [ ] [ ] [ ] [ x ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] The answer is simple: the dedicated CPU port of sw2p0 is sw0p4, and dsa_routing_port returns the upstream port for all switches. That is fine, but there are other topologies where this does not work as well. There are trees with "H" topologies in the wild, where there are 2 or more switches with DSA links between them, but every switch has its dedicated CPU port. For these topologies, it seems stupid for the neighbor switches to install an MDB entry on the routing port, since these multicast addresses are fundamentally different than the usual ones we support (and that is the justification for this patch, to introduce the concept of a termination plane multicast MAC address, as opposed to a forwarding plane multicast MAC address). For example, when a SWITCHDEV_OBJ_ID_HOST_MDB would get added to sw0p0, without this patch, it would get treated as a regular port MDB on sw0p2 and it would match on the ports below (including the sw1p3 routing port). | | sw0p0 sw0p1 sw0p2 sw0p3 sw1p3 sw1p2 sw1p1 sw1p0 [ user ] [ user ] [ cpu ] [ dsa ] [ dsa ] [ cpu ] [ user ] [ user ] [ ] [ ] [ x ] [ ] ---- [ x ] [ ] [ ] [ ] With the patch, the host MDB notifier on sw0p0 matches only on the local switch, which is what we want for a termination plane address. | | sw0p0 sw0p1 sw0p2 sw0p3 sw1p3 sw1p2 sw1p1 sw1p0 [ user ] [ user ] [ cpu ] [ dsa ] [ dsa ] [ cpu ] [ user ] [ user ] [ ] [ ] [ x ] [ ] ---- [ ] [ ] [ ] [ ] Name this new matching function "dsa_switch_host_address_match" since we will be reusing it soon for host FDB entries as well. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 6 ++++++ net/dsa/port.c | 24 ++++++++++++++++++++++ net/dsa/slave.c | 10 ++------- net/dsa/switch.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c8712942002f..cd65933d269b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -27,6 +27,8 @@ enum { DSA_NOTIFIER_LAG_LEAVE, DSA_NOTIFIER_MDB_ADD, DSA_NOTIFIER_MDB_DEL, + DSA_NOTIFIER_HOST_MDB_ADD, + DSA_NOTIFIER_HOST_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, @@ -214,6 +216,10 @@ int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); +int dsa_port_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); int dsa_port_pre_bridge_flags(const struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); diff --git a/net/dsa/port.c b/net/dsa/port.c index 46089dd2b2ec..47f45f795f44 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -681,6 +681,30 @@ int dsa_port_mdb_del(const struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); } +int dsa_port_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info); +} + +int dsa_port_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info); +} + int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 64acb1e11cd7..4b1d738bc3bc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -418,10 +418,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; - /* DSA can directly translate this to a normal MDB add, - * but on the CPU port. - */ - err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -495,10 +492,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; - /* DSA can directly translate this to a normal MDB add, - * but on the CPU port. - */ - err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) diff --git a/net/dsa/switch.c b/net/dsa/switch.c index c1e5afafe633..c40afd622331 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -154,6 +154,27 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, return 0; } +/* Matches for all upstream-facing ports (the CPU port and all upstream-facing + * DSA links) that sit between the targeted port on which the notifier was + * emitted and its dedicated CPU port. + */ +static bool dsa_switch_host_address_match(struct dsa_switch *ds, int port, + int info_sw_index, int info_port) +{ + struct dsa_port *targeted_dp, *cpu_dp; + struct dsa_switch *targeted_ds; + + targeted_ds = dsa_switch_find(ds->dst->index, info_sw_index); + targeted_dp = dsa_to_port(targeted_ds, info_port); + cpu_dp = targeted_dp->cpu_dp; + + if (dsa_switch_is_upstream_of(ds, targeted_ds)) + return port == dsa_towards_port(ds, cpu_dp->ds->index, + cpu_dp->index); + + return false; +} + static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -258,6 +279,39 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, return 0; } +static int dsa_switch_host_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = ds->ops->port_mdb_add(ds, port, info->mdb); + if (err) + break; + } + } + + return err; +} + +static int dsa_switch_host_mdb_del(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + if (!ds->ops->port_mdb_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mdb_del(ds, info->port, info->mdb); + + return 0; +} + static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, struct dsa_notifier_vlan_info *info) { @@ -441,6 +495,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MDB_DEL: err = dsa_switch_mdb_del(ds, info); break; + case DSA_NOTIFIER_HOST_MDB_ADD: + err = dsa_switch_host_mdb_add(ds, info); + break; + case DSA_NOTIFIER_HOST_MDB_DEL: + err = dsa_switch_host_mdb_del(ds, info); + break; case DSA_NOTIFIER_VLAN_ADD: err = dsa_switch_vlan_add(ds, info); break; -- cgit v1.2.3-58-ga151 From 161ca59d39e909d37eeeaf14bc1165b114790d00 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:50 +0300 Subject: net: dsa: reference count the MDB entries at the cross-chip notifier level Ever since the cross-chip notifiers were introduced, the design was meant to be simplistic and just get the job done without worrying too much about dangling resources left behind. For example, somebody installs an MDB entry on sw0p0 in this daisy chain topology. It gets installed using ds->ops->port_mdb_add() on sw0p0, sw1p4 and sw2p4. | sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] [ x ] [ ] [ ] [ ] [ ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] [ ] [ ] [ ] [ ] [ x ] Then the same person deletes that MDB entry. The cross-chip notifier for deletion only matches sw0p0: | sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] [ x ] [ ] [ ] [ ] [ ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] [ ] [ ] [ ] [ ] [ ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] [ ] [ ] [ ] [ ] [ ] Why? Because the DSA links are 'trunk' ports, if we just go ahead and delete the MDB from sw1p4 and sw2p4 directly, we might delete those multicast entries when they are still needed. Just consider the fact that somebody does: - add a multicast MAC address towards sw0p0 [ via the cross-chip notifiers it gets installed on the DSA links too ] - add the same multicast MAC address towards sw0p1 (another port of that same switch) - delete the same multicast MAC address from sw0p0. At this point, if we deleted the MAC address from the DSA links, it would be flooded, even though there is still an entry on switch 0 which needs it not to. So that is why deletions only match the targeted source port and nothing on DSA links. Of course, dangling resources means that the hardware tables will eventually run out given enough additions/removals, but hey, at least it's simple. But there is a bigger concern which needs to be addressed, and that is our support for SWITCHDEV_OBJ_ID_HOST_MDB. DSA simply translates such an object into a dsa_port_host_mdb_add() which ends up as ds->ops->port_mdb_add() on the upstream port, and a similar thing happens on deletion: dsa_port_host_mdb_del() will trigger ds->ops->port_mdb_del() on the upstream port. When there are 2 VLAN-unaware bridges spanning the same switch (which is a use case DSA proudly supports), each bridge will install its own SWITCHDEV_OBJ_ID_HOST_MDB entries. But upon deletion, DSA goes ahead and emits a DSA_NOTIFIER_MDB_DEL for dp->cpu_dp, which is shared between the user ports enslaved to br0 and the user ports enslaved to br1. Not good. The host-trapped multicast addresses installed by br1 will be deleted when any state changes in br0 (IGMP timers expire, or ports leave, etc). To avoid this, we could of course go the route of the zero-sum game and delete the DSA_NOTIFIER_MDB_DEL call for dp->cpu_dp. But the better design is to just admit that on shared ports like DSA links and CPU ports, we should be reference counting calls, even if this consumes some dynamic memory which DSA has traditionally avoided. On the flip side, the hardware tables of switches are limited in size, so it would be good if the OS managed them properly instead of having them eventually overflow. To address the memory usage concern, we only apply the refcounting of MDB entries on ports that are really shared (CPU ports and DSA links) and not on user ports. In a typical single-switch setup, this means only the CPU port (and the host MDB entries are not that many, really). The name of the newly introduced data structures (dsa_mac_addr) is chosen in such a way that will be reusable for host FDB entries (next patch). With this change, we can finally have the same matching logic for the MDB additions and deletions, as well as for their host-trapped variants. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 12 +++++++ net/dsa/dsa2.c | 8 +++++ net/dsa/switch.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 115 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 5f632cfd33c7..2c50546f9667 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -285,6 +285,11 @@ struct dsa_port { */ const struct dsa_netdevice_ops *netdev_ops; + /* List of MAC addresses that must be forwarded on this port. + * These are only valid on CPU ports and DSA links. + */ + struct list_head mdbs; + bool setup; }; @@ -299,6 +304,13 @@ struct dsa_link { struct list_head list; }; +struct dsa_mac_addr { + unsigned char addr[ETH_ALEN]; + u16 vid; + refcount_t refcount; + struct list_head list; +}; + struct dsa_switch { bool setup; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9000a8c84baf..2035d132682f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -348,6 +348,8 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; + INIT_LIST_HEAD(&dp->mdbs); + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -443,6 +445,7 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + struct dsa_mac_addr *a, *tmp; if (!dp->setup) return; @@ -468,6 +471,11 @@ static void dsa_port_teardown(struct dsa_port *dp) break; } + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { + list_del(&a->list); + kfree(a); + } + dp->setup = false; } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index c40afd622331..5439de029485 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -175,6 +175,84 @@ static bool dsa_switch_host_address_match(struct dsa_switch *ds, int port, return false; } +static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list, + const unsigned char *addr, + u16 vid) +{ + struct dsa_mac_addr *a; + + list_for_each_entry(a, addr_list, list) + if (ether_addr_equal(a->addr, addr) && a->vid == vid) + return a; + + return NULL; +} + +static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_mdb_add(ds, port, mdb); + + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + if (a) { + refcount_inc(&a->refcount); + return 0; + } + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = ds->ops->port_mdb_add(ds, port, mdb); + if (err) { + kfree(a); + return err; + } + + ether_addr_copy(a->addr, mdb->addr); + a->vid = mdb->vid; + refcount_set(&a->refcount, 1); + list_add_tail(&a->list, &dp->mdbs); + + return 0; +} + +static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_mdb_del(ds, port, mdb); + + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + if (!a) + return -ENOENT; + + if (!refcount_dec_and_test(&a->refcount)) + return 0; + + err = ds->ops->port_mdb_del(ds, port, mdb); + if (err) { + refcount_inc(&a->refcount); + return err; + } + + list_del(&a->list); + kfree(a); + + return 0; +} + static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -264,19 +342,18 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - return ds->ops->port_mdb_add(ds, port, info->mdb); + return dsa_switch_do_mdb_add(ds, port, info->mdb); } static int dsa_switch_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + int port = dsa_towards_port(ds, info->sw_index, info->port); + if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - if (ds->index == info->sw_index) - return ds->ops->port_mdb_del(ds, info->port, info->mdb); - - return 0; + return dsa_switch_do_mdb_del(ds, port, info->mdb); } static int dsa_switch_host_mdb_add(struct dsa_switch *ds, @@ -291,7 +368,7 @@ static int dsa_switch_host_mdb_add(struct dsa_switch *ds, for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_host_address_match(ds, port, info->sw_index, info->port)) { - err = ds->ops->port_mdb_add(ds, port, info->mdb); + err = dsa_switch_do_mdb_add(ds, port, info->mdb); if (err) break; } @@ -303,13 +380,22 @@ static int dsa_switch_host_mdb_add(struct dsa_switch *ds, static int dsa_switch_host_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + int err = 0; + int port; + if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - if (ds->index == info->sw_index) - return ds->ops->port_mdb_del(ds, info->port, info->mdb); + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_mdb_del(ds, port, info->mdb); + if (err) + break; + } + } - return 0; + return err; } static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, -- cgit v1.2.3-58-ga151 From 3dc80afc509831ec436e14d8ae74de330b37636d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:51 +0300 Subject: net: dsa: introduce a separate cross-chip notifier type for host FDBs DSA treats some bridge FDB entries by trapping them to the CPU port. Currently, the only class of such entries are FDB addresses learnt by the software bridge on a foreign interface. However there are many more to be added: - FDB entries with the is_local flag (for termination) added by the bridge on the user ports (typically containing the MAC address of the bridge port) - FDB entries pointing towards the bridge net device (for termination). Typically these contain the MAC address of the bridge net device. - Static FDB entries installed on a foreign interface that is in the same bridge with a DSA user port. The reason why a separate cross-chip notifier for host FDBs is justified compared to normal FDBs is the same as in the case of host MDBs: the cross-chip notifier matching function in switch.c should avoid installing these entries on routing ports that route towards the targeted switch, but not towards the CPU. This is required in order to have proper support for H-like multi-chip topologies. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 7 +++++++ net/dsa/port.c | 26 ++++++++++++++++++++++++++ net/dsa/slave.c | 21 ++++++++++++++++----- net/dsa/switch.c | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index cd65933d269b..36e667ea94db 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -20,6 +20,8 @@ enum { DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_HOST_FDB_ADD, + DSA_NOTIFIER_HOST_FDB_DEL, DSA_NOTIFIER_HSR_JOIN, DSA_NOTIFIER_HSR_LEAVE, DSA_NOTIFIER_LAG_CHANGE, @@ -121,6 +123,7 @@ struct dsa_switchdev_event_work { */ unsigned char addr[ETH_ALEN]; u16 vid; + bool host_addr; }; /* DSA_NOTIFIER_HSR_* */ @@ -211,6 +214,10 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); +int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); diff --git a/net/dsa/port.c b/net/dsa/port.c index 47f45f795f44..1b80e0fbdfaa 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -646,6 +646,32 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } +int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .addr = addr, + .vid = vid, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); +} + +int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .addr = addr, + .vid = vid, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); +} + int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) { struct dsa_switch *ds = dp->ds; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4b1d738bc3bc..ac7f4f200ab1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2315,8 +2315,12 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: - err = dsa_port_fdb_add(dp, switchdev_work->addr, - switchdev_work->vid); + if (switchdev_work->host_addr) + err = dsa_port_host_fdb_add(dp, switchdev_work->addr, + switchdev_work->vid); + else + err = dsa_port_fdb_add(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", @@ -2328,8 +2332,12 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) break; case SWITCHDEV_FDB_DEL_TO_DEVICE: - err = dsa_port_fdb_del(dp, switchdev_work->addr, - switchdev_work->vid); + if (switchdev_work->host_addr) + err = dsa_port_host_fdb_del(dp, switchdev_work->addr, + switchdev_work->vid); + else + err = dsa_port_fdb_del(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", @@ -2375,6 +2383,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, struct net_device *dev = switchdev_notifier_info_to_dev(ptr); const struct switchdev_notifier_fdb_info *fdb_info; struct dsa_switchdev_event_work *switchdev_work; + bool host_addr = false; struct dsa_port *dp; int err; @@ -2412,7 +2421,8 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, if (!p) return NOTIFY_DONE; - dp = p->dp->cpu_dp; + dp = p->dp; + host_addr = true; if (!dp->ds->assisted_learning_on_cpu_port) return NOTIFY_DONE; @@ -2442,6 +2452,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; + switchdev_work->host_addr = host_addr; /* Hold a reference on the slave for dsa_fdb_offload_notify */ if (dsa_is_user_port(dp->ds, dp->index)) diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5439de029485..219fc9baaa1c 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -253,6 +253,41 @@ static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, return 0; } +static int dsa_switch_host_fdb_add(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_fdb_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = ds->ops->port_fdb_add(ds, port, info->addr, + info->vid); + if (err) + break; + } + } + + return err; +} + +static int dsa_switch_host_fdb_del(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + if (!ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_fdb_del(ds, info->port, info->addr, + info->vid); + + return 0; +} + static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -560,6 +595,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_HOST_FDB_ADD: + err = dsa_switch_host_fdb_add(ds, info); + break; + case DSA_NOTIFIER_HOST_FDB_DEL: + err = dsa_switch_host_fdb_del(ds, info); + break; case DSA_NOTIFIER_HSR_JOIN: err = dsa_switch_hsr_join(ds, info); break; -- cgit v1.2.3-58-ga151 From 3f6e32f92a027e91f001070ec324dd3b534d948c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:52 +0300 Subject: net: dsa: reference count the FDB addresses at the cross-chip notifier level The same concerns expressed for host MDB entries are valid for host FDBs just as well: - in the case of multiple bridges spanning the same switch chip, deleting a host FDB entry that belongs to one bridge will result in breakage to the other bridge - not deleting FDB entries across DSA links means that the switch's hardware tables will eventually run out, given enough wear&tear So do the same thing and introduce reference counting for CPU ports and DSA links using the same data structures as we have for MDB entries. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa2.c | 6 ++++ net/dsa/switch.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 88 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2c50546f9667..33f40c1ec379 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -288,6 +288,7 @@ struct dsa_port { /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ + struct list_head fdbs; struct list_head mdbs; bool setup; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 2035d132682f..185629f27f80 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -348,6 +348,7 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; + INIT_LIST_HEAD(&dp->fdbs); INIT_LIST_HEAD(&dp->mdbs); switch (dp->type) { @@ -471,6 +472,11 @@ static void dsa_port_teardown(struct dsa_port *dp) break; } + list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { + list_del(&a->list); + kfree(a); + } + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { list_del(&a->list); kfree(a); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 219fc9baaa1c..af71b8638098 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -253,6 +253,71 @@ static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, return 0; } +static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_fdb_add(ds, port, addr, vid); + + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); + if (a) { + refcount_inc(&a->refcount); + return 0; + } + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = ds->ops->port_fdb_add(ds, port, addr, vid); + if (err) { + kfree(a); + return err; + } + + ether_addr_copy(a->addr, addr); + a->vid = vid; + refcount_set(&a->refcount, 1); + list_add_tail(&a->list, &dp->fdbs); + + return 0; +} + +static int dsa_switch_do_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_fdb_del(ds, port, addr, vid); + + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); + if (!a) + return -ENOENT; + + if (!refcount_dec_and_test(&a->refcount)) + return 0; + + err = ds->ops->port_fdb_del(ds, port, addr, vid); + if (err) { + refcount_inc(&a->refcount); + return err; + } + + list_del(&a->list); + kfree(a); + + return 0; +} + static int dsa_switch_host_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -265,7 +330,7 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_host_address_match(ds, port, info->sw_index, info->port)) { - err = ds->ops->port_fdb_add(ds, port, info->addr, + err = dsa_switch_do_fdb_add(ds, port, info->addr, info->vid); if (err) break; @@ -278,14 +343,23 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, static int dsa_switch_host_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { + int err = 0; + int port; + if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - if (ds->index == info->sw_index) - return ds->ops->port_fdb_del(ds, info->port, info->addr, - info->vid); + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_fdb_del(ds, port, info->addr, + info->vid); + if (err) + break; + } + } - return 0; + return err; } static int dsa_switch_fdb_add(struct dsa_switch *ds, @@ -296,7 +370,7 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_add(ds, port, info->addr, info->vid); + return dsa_switch_do_fdb_add(ds, port, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, @@ -307,7 +381,7 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); + return dsa_switch_do_fdb_del(ds, port, info->addr, info->vid); } static int dsa_switch_hsr_join(struct dsa_switch *ds, -- cgit v1.2.3-58-ga151 From 26ee7b06a4d3086a3751b69c14663ba6c6bbfe7f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:53 +0300 Subject: net: dsa: install the host MDB and FDB entries in the master's RX filter If the DSA master implements strict address filtering, then the unicast and multicast addresses kept by the DSA CPU ports should be synchronized with the address lists of the DSA master. Note that we want the synchronization of the master's address lists even if the DSA switch doesn't support unicast/multicast database operations, on the premises that the packets will be flooded to the CPU in that case, and we should still instruct the master to receive them. This is why we do the dev_uc_add() etc first, even if dsa_port_notify() returns -EOPNOTSUPP. In turn, dev_uc_add() and friends return error only if memory allocation fails, so it is probably ok to check and propagate that error code and not just ignore it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/port.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index 1b80e0fbdfaa..778b0dc2bb39 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -655,6 +655,12 @@ int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, .addr = addr, .vid = vid, }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_uc_add(cpu_dp->master, addr); + if (err) + return err; return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); } @@ -668,6 +674,12 @@ int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, .addr = addr, .vid = vid, }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_uc_del(cpu_dp->master, addr); + if (err) + return err; return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); } @@ -715,6 +727,12 @@ int dsa_port_host_mdb_add(const struct dsa_port *dp, .port = dp->index, .mdb = mdb, }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_mc_add(cpu_dp->master, mdb->addr); + if (err) + return err; return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info); } @@ -727,6 +745,12 @@ int dsa_port_host_mdb_del(const struct dsa_port *dp, .port = dp->index, .mdb = mdb, }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_mc_del(cpu_dp->master, mdb->addr); + if (err) + return err; return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info); } -- cgit v1.2.3-58-ga151 From 3068d466a67ec96a6972f248f5c7a7b6763dbeb1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:54 +0300 Subject: net: dsa: sync static FDB entries on foreign interfaces to hardware DSA is able to install FDB entries towards the CPU port for addresses which were dynamically learnt by the software bridge on foreign interfaces that are in the same bridge with a DSA switch interface. Since this behavior is opportunistic, it is guarded by the "assisted_learning_on_cpu_port" property which can be enabled by drivers and is not done automatically (since certain switches may support address learning of packets coming from the CPU port). But if those FDB entries added on the foreign interfaces are static (added by the user) instead of dynamically learnt, currently DSA does not do anything (and arguably it should). Because static FDB entries are not supposed to move on their own, there is no downside in reusing the "assisted_learning_on_cpu_port" logic to sync static FDB entries to the DSA CPU port unconditionally, even if assisted_learning_on_cpu_port is not requested by the driver. For example, this situation: br0 / \ swp0 dummy0 $ bridge fdb add 02:00:de:ad:00:01 dev dummy0 vlan 1 master static Results in DSA adding an entry in the hardware FDB, pointing this address towards the CPU port. The same is true for entries added to the bridge itself, e.g: $ bridge fdb add 02:00:de:ad:00:01 dev br0 vlan 1 self local (except that right now, DSA still ignores 'local' FDB entries, this will be changed in a later patch) Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/slave.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ac7f4f200ab1..ea9a7c1ce83e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2403,9 +2403,12 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, dp = dsa_slave_to_port(dev); } else { - /* Snoop addresses learnt on foreign interfaces - * bridged with us, for switches that don't - * automatically learn SA from CPU-injected traffic + /* Snoop addresses added to foreign interfaces + * bridged with us, or the bridge + * itself. Dynamically learned addresses can + * also be added for switches that don't + * automatically learn SA from CPU-injected + * traffic. */ struct net_device *br_dev; struct dsa_slave_priv *p; @@ -2424,7 +2427,8 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, dp = p->dp; host_addr = true; - if (!dp->ds->assisted_learning_on_cpu_port) + if (!fdb_info->added_by_user && + !dp->ds->assisted_learning_on_cpu_port) return NOTIFY_DONE; /* When the bridge learns an address on an offloaded -- cgit v1.2.3-58-ga151 From 10fae4ac89ce5c2ead6c6c35fd09651b5f97ae05 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 29 Jun 2021 17:06:55 +0300 Subject: net: dsa: include bridge addresses which are local in the host fdb list The bridge automatically creates local (not forwarded) fdb entries pointing towards physical ports with their interface MAC addresses. For switchdev, the significance of these fdb entries is the exact opposite of that of non-local entries: instead of sending these frame outwards, we must send them inwards (towards the host). NOTE: The bridge's own MAC address is also "local". If that address is not shared with any port, the bridge's MAC is not be added by this functionality - but the following commit takes care of that case. NOTE 2: We mark these addresses as host-filtered regardless of the value of ds->assisted_learning_on_cpu_port. This is because, as opposed to the speculative logic done for dynamic address learning on foreign interfaces, the local FDB entries are rather fixed, so there isn't any risk of them migrating from one bridge port to another. Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/slave.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ea9a7c1ce83e..d006bd04f84a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2398,10 +2398,12 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, fdb_info = ptr; if (dsa_slave_dev_check(dev)) { - if (!fdb_info->added_by_user || fdb_info->is_local) - return NOTIFY_OK; - dp = dsa_slave_to_port(dev); + + if (fdb_info->is_local) + host_addr = true; + else if (!fdb_info->added_by_user) + return NOTIFY_OK; } else { /* Snoop addresses added to foreign interfaces * bridged with us, or the bridge @@ -2425,9 +2427,15 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, return NOTIFY_DONE; dp = p->dp; - host_addr = true; + host_addr = fdb_info->is_local; - if (!fdb_info->added_by_user && + /* FDB entries learned by the software bridge should + * be installed as host addresses only if the driver + * requests assisted learning. + * On the other hand, FDB entries for local termination + * should always be installed. + */ + if (!fdb_info->added_by_user && !fdb_info->is_local && !dp->ds->assisted_learning_on_cpu_port) return NOTIFY_DONE; -- cgit v1.2.3-58-ga151 From 81a619f787593daf6224068c6dc8022ece591844 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:56 +0300 Subject: net: dsa: include fdb entries pointing to bridge in the host fdb list The bridge supports a legacy way of adding local (non-forwarded) FDB entries, which works on an individual port basis: bridge fdb add dev swp0 00:01:02:03:04:05 master local As well as a new way, added by Roopa Prabhu in commit 3741873b4f73 ("bridge: allow adding of fdb entries pointing to the bridge device"): bridge fdb add dev br0 00:01:02:03:04:05 self local The two commands are functionally equivalent, except that the first one produces an entry with fdb->dst == swp0, and the other an entry with fdb->dst == NULL. The confusing part, though, is that even if fdb->dst is swp0 for the 'local on port' entry, that destination is not used. Nonetheless, the idea is that the bridge has reference counting for local entries, and local entries pointing towards the bridge are still 'as local' as local entries for a port. The bridge adds the MAC addresses of the interfaces automatically as FDB entries with is_local=1. For the MAC address of the ports, fdb->dst will be equal to the port, and for the MAC address of the bridge, fdb->dst will point towards the bridge (i.e. be NULL). Therefore, if the MAC address of the bridge is not inherited from either of the physical ports, then we must explicitly catch local FDB entries emitted towards the br0, otherwise we'll miss the MAC address of the bridge (and, of course, any entry with 'bridge add dev br0 ... self local'). Co-developed-by: Tobias Waldekranz Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d006bd04f84a..a7b5d2a41472 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2415,7 +2415,11 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, struct net_device *br_dev; struct dsa_slave_priv *p; - br_dev = netdev_master_upper_dev_get_rcu(dev); + if (netif_is_bridge_master(dev)) + br_dev = dev; + else + br_dev = netdev_master_upper_dev_get_rcu(dev); + if (!br_dev) return NOTIFY_DONE; @@ -2443,8 +2447,13 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, * LAG we don't want to send traffic to the CPU, the * other ports bridged with the LAG should be able to * autonomously forward towards it. + * On the other hand, if the address is local + * (therefore not learned) then we want to trap it to + * the CPU regardless of whether the interface it + * belongs to is offloaded or not. */ - if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev)) + if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev) && + !fdb_info->is_local) return NOTIFY_DONE; } -- cgit v1.2.3-58-ga151 From 4bed397c3e65638e9118956bda85d2a9bcac3668 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:57 +0300 Subject: net: dsa: ensure during dsa_fdb_offload_notify that dev_hold and dev_put are on the same dev When (a) "dev" is a bridge port which the DSA switch tree offloads, but is otherwise not a dsa slave (such as a LAG netdev), or (b) "dev" is the bridge net device itself then strange things happen to the dev_hold/dev_put pair: dsa_schedule_work() will still be called with a DSA port that offloads that netdev, but dev_hold() will be called on the non-DSA netdev. Then the "if" condition in dsa_slave_switchdev_event_work() does not pass, because "dev" is not a DSA netdev, so dev_put() is not called. This results in the simple fact that we have a reference counting mismatch on the "dev" net device. This can be seen when we add support for host addresses installed on the bridge net device. ip link add br1 type bridge ip link set br1 address 00:01:02:03:04:05 ip link set swp0 master br1 ip link del br1 [ 968.512278] unregister_netdevice: waiting for br1 to become free. Usage count = 5 It seems foolish to do penny pinching and not add the net_device pointer in the dsa_switchdev_event_work structure, so let's finally do that. As an added bonus, when we start offloading local entries pointing towards the bridge, these will now properly appear as 'offloaded' in 'bridge fdb' (this was not possible before, because 'dev' was assumed to only be a DSA net device): 00:01:02:03:04:05 dev br0 vlan 1 offload master br0 permanent 00:01:02:03:04:05 dev br0 offload master br0 permanent Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 + net/dsa/slave.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 36e667ea94db..f201c33980bf 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -116,6 +116,7 @@ struct dsa_notifier_mrp_ring_role_info { struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; + struct net_device *dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a7b5d2a41472..ffbba1e71551 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2349,9 +2349,8 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) } rtnl_unlock(); + dev_put(switchdev_work->dev); kfree(switchdev_work); - if (dsa_is_user_port(ds, dp->index)) - dev_put(dp->slave); } static int dsa_lower_dev_walk(struct net_device *lower_dev, @@ -2469,15 +2468,15 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, switchdev_work->ds = dp->ds; switchdev_work->port = dp->index; switchdev_work->event = event; + switchdev_work->dev = dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; - /* Hold a reference on the slave for dsa_fdb_offload_notify */ - if (dsa_is_user_port(dp->ds, dp->index)) - dev_hold(dev); + /* Hold a reference for dsa_fdb_offload_notify */ + dev_hold(dev); dsa_schedule_work(&switchdev_work->work); break; default: -- cgit v1.2.3-58-ga151 From 63c51453c82cddc27556233ff41041ea9fc49fe0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 17:06:58 +0300 Subject: net: dsa: replay the local bridge FDB entries pointing to the bridge dev too When we join a bridge that already has some local addresses pointing to itself, we do not get those notifications. Similarly, when we leave that bridge, we do not get notifications for the deletion of those entries. The only switchdev notifications we get are those of entries added while the DSA port is enslaved to the bridge. This makes use cases such as the following work properly (with the number of additions and removals properly balanced): ip link add br0 type bridge ip link add br1 type bridge ip link set br0 address 00:01:02:03:04:05 ip link set br1 address 00:01:02:03:04:05 ip link set swp0 up ip link set swp1 up ip link set swp0 master br0 ip link set swp1 master br1 ip link set br0 up ip link set br1 up ip link del br1 # 00:01:02:03:04:05 still installed on the CPU port ip link del br0 # 00:01:02:03:04:05 finally removed from the CPU port Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/port.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index 778b0dc2bb39..28b45b7e66df 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -199,11 +199,17 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; + /* Forwarding and termination FDB entries on the port */ err = br_fdb_replay(br, brport_dev, dp, true, &dsa_slave_switchdev_notifier); if (err && err != -EOPNOTSUPP) return err; + /* Termination FDB entries on the bridge itself */ + err = br_fdb_replay(br, br, dp, true, &dsa_slave_switchdev_notifier); + if (err && err != -EOPNOTSUPP) + return err; + err = br_vlan_replay(br, brport_dev, dp, true, &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) @@ -225,11 +231,17 @@ static int dsa_port_switchdev_unsync_objs(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; + /* Forwarding and termination FDB entries on the port */ err = br_fdb_replay(br, brport_dev, dp, false, &dsa_slave_switchdev_notifier); if (err && err != -EOPNOTSUPP) return err; + /* Termination FDB entries on the bridge itself */ + err = br_fdb_replay(br, br, dp, false, &dsa_slave_switchdev_notifier); + if (err && err != -EOPNOTSUPP) + return err; + err = br_vlan_replay(br, brport_dev, dp, false, &dsa_slave_switchdev_blocking_notifier, extack); if (err && err != -EOPNOTSUPP) -- cgit v1.2.3-58-ga151 From e3ae2365efc14269170a6326477e669332271ab3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 27 Jun 2021 18:48:21 -0400 Subject: net: sock: introduce sk_error_report This patch introduces a function wrapper to call the sk_error_report callback. That will prepare to add additional handling whenever sk_error_report is called, for example to trace socket errors. Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- .../ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- drivers/vhost/vsock.c | 2 +- include/linux/skmsg.h | 2 +- include/net/sock.h | 2 ++ include/net/tls.h | 2 +- net/caif/caif_socket.c | 2 +- net/can/bcm.c | 4 ++-- net/can/isotp.c | 20 ++++++++++---------- net/can/j1939/socket.c | 4 ++-- net/can/raw.c | 6 +++--- net/core/skbuff.c | 6 +++--- net/core/sock.c | 6 ++++++ net/dccp/ipv4.c | 4 ++-- net/dccp/ipv6.c | 4 ++-- net/dccp/proto.c | 2 +- net/dccp/timer.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 4 ++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_timer.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/raw.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- net/ipv6/udp.c | 2 +- net/kcm/kcmsock.c | 2 +- net/mptcp/subflow.c | 2 +- net/netlink/af_netlink.c | 8 ++++---- net/nfc/rawsock.c | 2 +- net/packet/af_packet.c | 4 ++-- net/qrtr/qrtr.c | 2 +- net/sctp/input.c | 2 +- net/sctp/ipv6.c | 2 +- net/smc/af_smc.c | 2 +- net/strparser/strparser.c | 2 +- net/unix/af_unix.c | 2 +- net/vmw_vsock/af_vsock.c | 2 +- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 2 +- net/vmw_vsock/vmci_transport.c | 4 ++-- net/xdp/xsk.c | 2 +- 42 files changed, 75 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 19dc7dc054a2..bcad69c48074 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -2134,7 +2134,7 @@ static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb) sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb)) return; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 119f08491d3c..d38c996b4f46 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -734,7 +734,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk) vsk->peer_shutdown = SHUTDOWN_MASK; sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; - sk->sk_error_report(sk); + sk_error_report(sk); } static int vhost_vsock_dev_release(struct inode *inode, struct file *file) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fcaa9a7996c8..31866031e370 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -347,7 +347,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) struct sock *sk = psock->sk; sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); diff --git a/include/net/sock.h b/include/net/sock.h index ced2fc965ec7..8bdd80027ffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2281,6 +2281,8 @@ static inline int sock_error(struct sock *sk) return -err; } +void sk_error_report(struct sock *sk); + static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; diff --git a/include/net/tls.h b/include/net/tls.h index 8d398a5de3ee..be4b3e1cac46 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -469,7 +469,7 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) static inline void tls_err_abort(struct sock *sk, int err) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } static inline bool tls_bigint_increment(unsigned char *seq, int len) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 3ad0a1df6712..647554c9813b 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -243,7 +243,7 @@ static void caif_ctrl_cb(struct cflayer *layr, cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); - cf_sk->sk.sk_error_report(&cf_sk->sk); + sk_error_report(&cf_sk->sk); break; default: diff --git a/net/can/bcm.c b/net/can/bcm.c index f3e4d9528fa3..e15a7dbe5f6c 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1417,7 +1417,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } break; @@ -1425,7 +1425,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } } } diff --git a/net/can/isotp.c b/net/can/isotp.c index bd49299319a1..9fd274cf166b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -168,7 +168,7 @@ static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; @@ -339,7 +339,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); @@ -392,7 +392,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); fallthrough; default: @@ -420,7 +420,7 @@ static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 1; } @@ -535,7 +535,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; @@ -559,7 +559,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 1; } @@ -758,7 +758,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; @@ -1157,7 +1157,7 @@ out: if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } return err; @@ -1356,13 +1356,13 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; } } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 56aa66147d5a..bf18a32dc6ae 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1009,7 +1009,7 @@ void j1939_sk_send_loop_abort(struct sock *sk, int err) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, @@ -1189,7 +1189,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) - jsk->sk.sk_error_report(&jsk->sk); + sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } diff --git a/net/can/raw.c b/net/can/raw.c index ac96fc210025..ed4fcb7ab0c3 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -295,13 +295,13 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg, sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; } } @@ -488,7 +488,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } return err; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2531ac4ffa69..12aabcda6db2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1294,7 +1294,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) } spin_unlock_irqrestore(&q->lock, flags); - sk->sk_error_report(sk); + sk_error_report(sk); release: consume_skb(skb); @@ -4685,7 +4685,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -4716,7 +4716,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) sk->sk_err = 0; if (skb_next) - sk->sk_error_report(sk); + sk_error_report(sk); return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index a2337b37eba6..c30f8f4cbb22 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -331,6 +331,12 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(__sk_backlog_rcv); +void sk_error_report(struct sock *sk) +{ + sk->sk_error_report(sk); +} +EXPORT_SYMBOL(sk_error_report); + static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f81c1df761d3..0ea29270d7e5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -329,7 +329,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); dccp_done(sk); } else @@ -356,7 +356,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else /* Only an error on timeout */ sk->sk_err_soft = err; out: diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6f5304db5a67..fa663518fa0e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -172,7 +172,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, * Wake people up to see the error * (see connect in sock.c) */ - sk->sk_error_report(sk); + sk_error_report(sk); dccp_done(sk); } else sk->sk_err_soft = err; @@ -181,7 +181,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else sk->sk_err_soft = err; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 6d705d90c614..7eb0fb231940 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -302,7 +302,7 @@ int dccp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index db768f223ef7..27a3b37acd2e 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -20,7 +20,7 @@ int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; static void dccp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 95a718397fd1..1e44a43acfe2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -573,7 +573,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) } } sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: sock_put(sk); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 50a73178d63a..bb446e60cf58 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -280,7 +280,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) if (inet->recverr || harderr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } } @@ -929,7 +929,7 @@ int raw_abort(struct sock *sk, int err) lock_sock(sk); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0e3f0e0e5b51..a0a96eb826c4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3059,7 +3059,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.offset = 0; } - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); @@ -4448,7 +4448,7 @@ int tcp_abort(struct sock *sk, int err) sk->sk_err = err; /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7d5e59f688de..e6ca5a1f3b59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4270,7 +4270,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6cb8e269f1ab..e66ad6bfe808 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -585,7 +585,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) if (!sock_owned_by_user(sk)) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_done(sk); } else { @@ -613,7 +613,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 56b9d648f054..20cf4a98c69d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -68,7 +68,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_write_queue_purge(sk); tcp_done(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307ad0d3b9e..f86ccbf7c135 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -776,7 +776,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: return 0; } @@ -2867,7 +2867,7 @@ int udp_abort(struct sock *sk, int err) goto out; sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); out: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index bf3646b57c68..60f1e4f5be5a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -354,7 +354,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, if (np->recverr || harderr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4d71464094b3..578ab6305c3f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -467,7 +467,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk)) { sk->sk_err = err; - sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ tcp_done(sk); } else @@ -486,7 +486,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else sk->sk_err_soft = err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3fcd86f4dfdc..368972dbd919 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -610,7 +610,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: return 0; } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 6201965bd822..11a715d76a4f 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -47,7 +47,7 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; - csk->sk_error_report(csk); + sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d55f4ef736a5..706a26a1b0fe 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1240,7 +1240,7 @@ void __mptcp_error_report(struct sock *sk) /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); break; } } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6133e412b948..d233ac4a91b6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -351,7 +351,7 @@ static void netlink_overrun(struct sock *sk) if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + sk_error_report(sk); } } atomic_inc(&sk->sk_drops); @@ -1576,7 +1576,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) } sk->sk_err = p->code; - sk->sk_error_report(sk); + sk_error_report(sk); out: return ret; } @@ -2012,7 +2012,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ret = netlink_dump(sk); if (ret) { sk->sk_err = -ret; - sk->sk_error_report(sk); + sk_error_report(sk); } } @@ -2439,7 +2439,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk); + sk_error_report(NETLINK_CB(in_skb).sk); return; } diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5f1d438a0a23..5e39640becdb 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -49,7 +49,7 @@ static void rawsock_report_error(struct sock *sk, int err) sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_err = -err; - sk->sk_error_report(sk); + sk_error_report(sk); rawsock_write_queue_purge(sk); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 77b0cdab3810..77476184741d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3206,7 +3206,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } out_unlock: @@ -4103,7 +4103,7 @@ static int packet_notifier(struct notifier_block *this, __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index f2efaa4225f9..e6f4a6202f82 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -751,7 +751,7 @@ static void qrtr_reset_ports(void) xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; - ipc->sk.sk_error_report(&ipc->sk); + sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); diff --git a/net/sctp/input.c b/net/sctp/input.c index fe6429cc012f..76dcc137f761 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -593,7 +593,7 @@ static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, } if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 05f81a4d0ee7..d041bed86322 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -152,7 +152,7 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else { sk->sk_err_soft = err; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e41fdac606d4..898389611ae8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2218,7 +2218,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; - sk->sk_error_report(sk); + sk_error_report(sk); } if (optlen < sizeof(int)) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index b3815c1e8f2e..9c0343568d2a 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -58,7 +58,7 @@ static void strp_abort_strp(struct strparser *strp, int err) /* Report an error on the lower socket */ sk->sk_err = -err; - sk->sk_error_report(sk); + sk_error_report(sk); } } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 58c2f318b0a8..23c92ad15c61 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -491,7 +491,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { other->sk_err = ECONNRESET; - other->sk_error_report(other); + sk_error_report(other); } } } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 21ccf450e249..9f12da1ff406 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1281,7 +1281,7 @@ static void vsock_connect_timeout(struct work_struct *work) (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index ed1664e7bd88..e0c2c992ad9c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -360,7 +360,7 @@ static void virtio_vsock_reset_sock(struct sock *sk) lock_sock(sk); sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; - sk->sk_error_report(sk); + sk_error_report(sk); release_sock(sk); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f014ccfdd9c2..169ba8b72a63 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1007,7 +1007,7 @@ destroy: virtio_transport_reset(vsk, pkt); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; - sk->sk_error_report(sk); + sk_error_report(sk); return err; } diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index e617ed93f06b..7aef34e32bdf 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -831,7 +831,7 @@ static void vmci_transport_handle_detach(struct sock *sk) sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; - sk->sk_error_report(sk); + sk_error_report(sk); return; } sk->sk_state = TCP_CLOSE; @@ -1365,7 +1365,7 @@ destroy: sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; - sk->sk_error_report(sk); + sk_error_report(sk); return err; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 996da915f520..d6b500dc4208 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1313,7 +1313,7 @@ static int xsk_notifier(struct notifier_block *this, if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); xsk_unbind_dev(xs); -- cgit v1.2.3-58-ga151 From e6a3e4434000de5c36d606e5b5da5f7ba49444bd Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 27 Jun 2021 18:48:22 -0400 Subject: net: sock: add trace for socket errors This patch will add tracers to trace inet socket errors only. A user space monitor application can track connection errors indepedent from socket lifetime and do additional handling. For example a cluster manager can fence a node if errors occurs in a specific heuristic. Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/trace/events/sock.h | 60 +++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 10 ++++++++ 2 files changed, 70 insertions(+) (limited to 'net') diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index a966d4b5ab37..12c315782766 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -201,6 +201,66 @@ TRACE_EVENT(inet_sock_set_state, show_tcp_state_name(__entry->newstate)) ); +TRACE_EVENT(inet_sk_error_report, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field(int, error) + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + __field(__u16, protocol) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->error = sk->sk_err; + __entry->family = sk->sk_family; + __entry->protocol = sk->sk_protocol; + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", + show_family_name(__entry->family), + show_inet_protocol_name(__entry->protocol), + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + __entry->error) +); + #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ diff --git a/net/core/sock.c b/net/core/sock.c index c30f8f4cbb22..ba1c0f75cd45 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -334,6 +334,16 @@ EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); + + switch (sk->sk_family) { + case AF_INET: + fallthrough; + case AF_INET6: + trace_inet_sk_error_report(sk); + break; + default: + break; + } } EXPORT_SYMBOL(sk_error_report); -- cgit v1.2.3-58-ga151 From 23ac0b421674fba943dd131e66b81ed7f3fb3d1d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 03:29:25 +0300 Subject: net: use netdev_info in ndo_dflt_fdb_{add,del} Use the more modern printk helper for network interfaces, which also contains information about the associated struct device, and results in overall shorter line lengths compared to printing an open-coded dev->name. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 745965e49f78..ab11c9d5002b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3947,12 +3947,12 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { - pr_info("%s: FDB only supports static addresses\n", dev->name); + netdev_info(dev, "FDB only supports static addresses\n"); return err; } if (vid) { - pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name); + netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } @@ -4086,7 +4086,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { - pr_info("%s: FDB only supports static addresses\n", dev->name); + netdev_info(dev, "FDB only supports static addresses\n"); return err; } -- cgit v1.2.3-58-ga151 From 78ecc8903de2adf0387cbf06e5befe29c23f2739 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 29 Jun 2021 03:29:26 +0300 Subject: net: say "local" instead of "static" addresses in ndo_dflt_fdb_{add,del} "Static" is a loaded word, and probably not what the author meant when the code was written. In particular, this looks weird: $ bridge fdb add dev swp0 00:01:02:03:04:05 local # totally fine, but $ bridge fdb add dev swp0 00:01:02:03:04:05 static [ 2020.708298] swp0: FDB only supports static addresses # hmm what? By looking at the implementation which uses dev_uc_add/dev_uc_del it is absolutely clear that only local addresses are supported, and the proper Network Unreachability Detection state is being used for this purpose (user space indeed sets NUD_PERMANENT when local addresses are meant). So it is just the message that is wrong, fix it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab11c9d5002b..f6af3e74fc44 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3947,7 +3947,7 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { - netdev_info(dev, "FDB only supports static addresses\n"); + netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } @@ -4086,7 +4086,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { - netdev_info(dev, "FDB only supports static addresses\n"); + netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } -- cgit v1.2.3-58-ga151 From 6706721d82f86e9360c3ad5339fe3da5e0988a51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Jun 2021 06:52:13 -0700 Subject: tcp_yeah: check struct yeah size at compile time Compiler can perform the sanity check instead of waiting to load the module and crash the host. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_yeah.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3bb448761ca3..07c4c93b9fdb 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -221,7 +221,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = { static int __init tcp_yeah_register(void) { - BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_yeah); return 0; } -- cgit v1.2.3-58-ga151