From 540b1c48c37ac0ad66212004db21e1ff7e2d78be Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Feb 2017 15:43:06 +0000 Subject: rxrpc: Fix deadlock between call creation and sendmsg/recvmsg All the routines by which rxrpc is accessed from the outside are serialised by means of the socket lock (sendmsg, recvmsg, bind, rxrpc_kernel_begin_call(), ...) and this presents a problem: (1) If a number of calls on the same socket are in the process of connection to the same peer, a maximum of four concurrent live calls are permitted before further calls need to wait for a slot. (2) If a call is waiting for a slot, it is deep inside sendmsg() or rxrpc_kernel_begin_call() and the entry function is holding the socket lock. (3) sendmsg() and recvmsg() or the in-kernel equivalents are prevented from servicing the other calls as they need to take the socket lock to do so. (4) The socket is stuck until a call is aborted and makes its slot available to the waiter. Fix this by: (1) Provide each call with a mutex ('user_mutex') that arbitrates access by the users of rxrpc separately for each specific call. (2) Make rxrpc_sendmsg() and rxrpc_recvmsg() unlock the socket as soon as they've got a call and taken its mutex. Note that I'm returning EWOULDBLOCK from recvmsg() if MSG_DONTWAIT is set but someone else has the lock. Should I instead only return EWOULDBLOCK if there's nothing currently to be done on a socket, and sleep in this particular instance because there is something to be done, but we appear to be blocked by the interrupt handler doing its ping? (3) Make rxrpc_new_client_call() unlock the socket after allocating a new call, locking its user mutex and adding it to the socket's call tree. The call is returned locked so that sendmsg() can add data to it immediately. From the moment the call is in the socket tree, it is subject to access by sendmsg() and recvmsg() - even if it isn't connected yet. (4) Lock new service calls in the UDP data_ready handler (in rxrpc_new_incoming_call()) because they may already be in the socket's tree and the data_ready handler makes them live immediately if a user ID has already been preassigned. Note that the new call is locked before any notifications are sent that it is live, so doing mutex_trylock() *ought* to always succeed. Userspace is prevented from doing sendmsg() on calls that are in a too-early state in rxrpc_do_sendmsg(). (5) Make rxrpc_new_incoming_call() return the call with the user mutex held so that a ping can be scheduled immediately under it. Note that it might be worth moving the ping call into rxrpc_new_incoming_call() and then we can drop the mutex there. (6) Make rxrpc_accept_call() take the lock on the call it is accepting and release the socket after adding the call to the socket's tree. This is slightly tricky as we've dequeued the call by that point and have to requeue it. Note that requeuing emits a trace event. (7) Make rxrpc_kernel_send_data() and rxrpc_kernel_recv_data() take the new mutex immediately and don't bother with the socket mutex at all. This patch has the nice bonus that calls on the same socket are now to some extent parallelisable. Note that we might want to move rxrpc_service_prealloc() calls out from the socket lock and give it its own lock, so that we don't hang progress in other calls because we're waiting for the allocator. We probably also want to avoid calling rxrpc_notify_socket() from within the socket lock (rxrpc_accept_call()). Signed-off-by: David Howells Tested-by: Marc Dionne Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 593f586545eb..39123c06a566 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -119,6 +119,7 @@ enum rxrpc_recvmsg_trace { rxrpc_recvmsg_full, rxrpc_recvmsg_hole, rxrpc_recvmsg_next, + rxrpc_recvmsg_requeue, rxrpc_recvmsg_return, rxrpc_recvmsg_terminal, rxrpc_recvmsg_to_be_accepted, @@ -277,6 +278,7 @@ enum rxrpc_congest_change { EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \ -- cgit v1.2.3-58-ga151 From b2d0fe35471d1a71471f99147ffb5986bd60e744 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 28 Feb 2017 15:02:15 +0300 Subject: net/mlx4: && vs & typo Bitwise & was obviously intended here. Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist") Signed-off-by: Dan Carpenter Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index e965e5090d96..a858bcb6220b 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac) int i; for (i = ETH_ALEN; i > 0; i--) { - addr[i - 1] = mac && 0xFF; + addr[i - 1] = mac & 0xFF; mac >>= 8; } } -- cgit v1.2.3-58-ga151 From 39e6c8208d7b6fb9d2047850fb3327db567b564b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Feb 2017 10:34:50 -0800 Subject: net: solve a NAPI race While playing with mlx4 hardware timestamping of RX packets, I found that some packets were received by TCP stack with a ~200 ms delay... Since the timestamp was provided by the NIC, and my probe was added in tcp_v4_rcv() while in BH handler, I was confident it was not a sender issue, or a drop in the network. This would happen with a very low probability, but hurting RPC workloads. A NAPI driver normally arms the IRQ after the napi_complete_done(), after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab it. Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit while IRQ are not disabled, we might have later an IRQ firing and finding this bit set, right before napi_complete_done() clears it. This can happen with busy polling users, or if gro_flush_timeout is used. But some other uses of napi_schedule() in drivers can cause this as well. thread 1 thread 2 (could be on same cpu, or not) // busy polling or napi_watchdog() napi_schedule(); ... napi->poll() device polling: read 2 packets from ring buffer Additional 3rd packet is available. device hard irq // does nothing because NAPI_STATE_SCHED bit is owned by thread 1 napi_schedule(); napi_complete_done(napi, 2); rearm_irq(); Note that rearm_irq() will not force the device to send an additional IRQ for the packet it already signaled (3rd packet in my example) This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep() can set if it could not grab NAPI_STATE_SCHED Then napi_complete_done() properly reschedules the napi to make sure we do not miss something. Since we manipulate multiple bits at once, use cmpxchg() like in sk_busy_loop() to provide proper transactions. In v2, I changed napi_watchdog() to use a relaxed variant of napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point. In v3, I added more details in the changelog and clears NAPI_STATE_MISSED in busy_poll_stop() In v4, I added the ideas given by Alexander Duyck in v3 review Signed-off-by: Eric Dumazet Cc: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 29 ++++++------------ net/core/dev.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f40f0ab3847a..97456b2539e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -330,6 +330,7 @@ struct napi_struct { enum { NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ @@ -338,12 +339,13 @@ enum { }; enum { - NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), - NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), - NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), - NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), }; enum gro_result { @@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } -/** - * napi_schedule_prep - check if NAPI can be scheduled - * @n: NAPI context - * - * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable to - * insure only one NAPI poll instance runs. We also make - * sure there is no pending NAPI disable. - */ -static inline bool napi_schedule_prep(struct napi_struct *n) -{ - return !napi_disable_pending(n) && - !test_and_set_bit(NAPI_STATE_SCHED, &n->state); -} +bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll diff --git a/net/core/dev.c b/net/core/dev.c index 304f2deae5f9..e63bf61b19be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4883,6 +4883,39 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule @@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags; + unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list @@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); local_irq_restore(flags); } - WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); @@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); - if (napi->gro_list) - napi_schedule_irqoff(napi); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (napi->gro_list && !napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } -- cgit v1.2.3-58-ga151 From eb1e011a14748a1d9df9a7d7df9a5711721a1bdb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Feb 2017 09:49:26 +0100 Subject: average: change to declare precision, not factor Declaring the factor is counter-intuitive, and people are prone to using small(-ish) values even when that makes no sense. Change the DECLARE_EWMA() macro to take the fractional precision, in bits, rather than a factor, and update all users. While at it, add some more documentation. Acked-by: David S. Miller Signed-off-by: Johannes Berg --- drivers/net/virtio_net.c | 2 +- drivers/net/wireless/ath/ath5k/ath5k.h | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 2 +- include/linux/average.h | 61 +++++++++++++++++++---------- net/batman-adv/types.h | 2 +- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/sta_info.h | 2 +- 7 files changed, 47 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index bf95016f442a..e9d7e2b70085 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -51,7 +51,7 @@ module_param(gso, bool, 0444); * at once, the weight is chosen so that the EWMA will be insensitive to short- * term, transient changes in packet size. */ -DECLARE_EWMA(pkt_len, 1, 64) +DECLARE_EWMA(pkt_len, 0, 64) /* With mergeable buffers we align buffer address and use the low bits to * encode its true size. Buffer size is up to 1 page so we need to align to diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h index 67fedb61fcc0..979800c6f57f 100644 --- a/drivers/net/wireless/ath/ath5k/ath5k.h +++ b/drivers/net/wireless/ath/ath5k/ath5k.h @@ -1252,7 +1252,7 @@ struct ath5k_statistics { #define ATH5K_TXQ_LEN_MAX (ATH_TXBUF / 4) /* bufs per queue */ #define ATH5K_TXQ_LEN_LOW (ATH5K_TXQ_LEN_MAX / 2) /* low mark */ -DECLARE_EWMA(beacon_rssi, 1024, 8) +DECLARE_EWMA(beacon_rssi, 10, 8) /* Driver state associated with an instance of a device */ struct ath5k_hw { diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index 26869b3bef45..340787894c69 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -257,7 +257,7 @@ struct link_qual { int tx_failed; }; -DECLARE_EWMA(rssi, 1024, 8) +DECLARE_EWMA(rssi, 10, 8) /* * Antenna settings about the currently active link. diff --git a/include/linux/average.h b/include/linux/average.h index d04aa58280de..7ddaf340d2ac 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -1,45 +1,66 @@ #ifndef _LINUX_AVERAGE_H #define _LINUX_AVERAGE_H -/* Exponentially weighted moving average (EWMA) */ +/* + * Exponentially weighted moving average (EWMA) + * + * This implements a fixed-precision EWMA algorithm, with both the + * precision and fall-off coefficient determined at compile-time + * and built into the generated helper funtions. + * + * The first argument to the macro is the name that will be used + * for the struct and helper functions. + * + * The second argument, the precision, expresses how many bits are + * used for the fractional part of the fixed-precision values. + * + * The third argument, the weight reciprocal, determines how the + * new values will be weighed vs. the old state, new values will + * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note + * that this parameter must be a power of two for efficiency. + */ -#define DECLARE_EWMA(name, _factor, _weight) \ +#define DECLARE_EWMA(name, _precision, _weight_rcp) \ struct ewma_##name { \ unsigned long internal; \ }; \ static inline void ewma_##name##_init(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + /* \ + * Even if you want to feed it just 0/1 you should have \ + * some bits for the non-fractional part... \ + */ \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ e->internal = 0; \ } \ static inline unsigned long \ ewma_##name##_read(struct ewma_##name *e) \ { \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ - return e->internal >> ilog2(_factor); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + return e->internal >> (_precision); \ } \ static inline void ewma_##name##_add(struct ewma_##name *e, \ unsigned long val) \ { \ unsigned long internal = ACCESS_ONCE(e->internal); \ - unsigned long weight = ilog2(_weight); \ - unsigned long factor = ilog2(_factor); \ + unsigned long weight_rcp = ilog2(_weight_rcp); \ + unsigned long precision = _precision; \ \ - BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ - BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ - BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ \ ACCESS_ONCE(e->internal) = internal ? \ - (((internal << weight) - internal) + \ - (val << factor)) >> weight : \ - (val << factor); \ + (((internal << weight_rcp) - internal) + \ + (val << precision)) >> weight_rcp : \ + (val << precision); \ } #endif /* _LINUX_AVERAGE_H */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8f64a5c01345..66b25e410a41 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -402,7 +402,7 @@ struct batadv_gw_node { struct rcu_head rcu; }; -DECLARE_EWMA(throughput, 1024, 8) +DECLARE_EWMA(throughput, 10, 8) /** * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 159a1a733725..0e718437d080 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -428,7 +428,7 @@ struct ieee80211_sta_tx_tspec { bool downgraded; }; -DECLARE_EWMA(beacon_signal, 16, 4) +DECLARE_EWMA(beacon_signal, 4, 4) struct ieee80211_if_managed { struct timer_list timer; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 15599c70a38f..e65cda34d2bc 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -372,7 +372,7 @@ struct mesh_sta { unsigned int fail_avg; }; -DECLARE_EWMA(signal, 1024, 8) +DECLARE_EWMA(signal, 10, 8) struct ieee80211_sta_rx_stats { unsigned long packets; -- cgit v1.2.3-58-ga151 From 94352d45092c23874532221b4d1e4721df9d63df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 1 Mar 2017 16:35:08 -0300 Subject: net: Introduce sk_clone_lock() error path routine When handling problems in cloning a socket with the sk_clone_locked() function we need to perform several steps that were open coded in it and its callers, so introduce a routine to avoid this duplication: sk_free_unlock_clone(). Cc: Cong Wang Cc: Dmitry Vyukov Cc: Eric Dumazet Cc: Gerrit Renker Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/net-ui6laqkotycunhtmqryl9bfx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 16 +++++++++++----- net/dccp/minisocks.c | 6 +----- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 9ccefa5c5487..5e5997654db6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1526,6 +1526,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); +void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index e7d74940e863..f6fd79f33097 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1539,11 +1539,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); newsk = NULL; goto out; } @@ -1592,6 +1588,16 @@ out: } EXPORT_SYMBOL_GPL(sk_clone_lock); +void sk_free_unlock_clone(struct sock *sk) +{ + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sk->sk_destruct = NULL; + bh_unlock_sock(sk); + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index d20d948a98ed..e267e6f4c9a5 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -119,11 +119,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, * Activate features: initialise CCIDs, sequence windows etc. */ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + sk_free_unlock_clone(newsk); return NULL; } dccp_init_xmit_timers(newsk); -- cgit v1.2.3-58-ga151 From 25e94a997b324b5f167f56d56d7106d38b78c9de Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Mar 2017 12:52:31 +0100 Subject: netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() fails The underlying nlmsg_multicast() already sets sk->sk_err for us to notify socket overruns, so we should not do anything with this return value. So we just call nfnetlink_set_err() if: 1) We fail to allocate the netlink message. or 2) We don't have enough space in the netlink message to place attributes, which means that we likely need to allocate a larger message. Before this patch, the internal ESRCH netlink error code was propagated to userspace, which is quite misleading. Netlink semantics mandate that listeners just hit ENOBUFS if the socket buffer overruns. Reported-by: Alexander Alemayhu Tested-by: Alexander Alemayhu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +- net/netfilter/nf_tables_api.c | 133 ++++++++++++++++---------------------- 2 files changed, 58 insertions(+), 81 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ac84686aaafb..2aa8a9d80fbe 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -988,9 +988,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, - int event, int family, int report, gfp_t gfp); +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, + int event, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ff7304ae58ac..5e0ccfd5bb37 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -461,16 +461,15 @@ nla_put_failure: return -1; } -static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -482,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_tables(struct sk_buff *skb, @@ -1050,16 +1046,15 @@ nla_put_failure: return -1; } -static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) +static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1072,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_chains(struct sk_buff *skb, @@ -1934,18 +1926,16 @@ nla_put_failure: return -1; } -static int nf_tables_rule_notify(const struct nft_ctx *ctx, - const struct nft_rule *rule, - int event) +static void nf_tables_rule_notify(const struct nft_ctx *ctx, + const struct nft_rule *rule, int event) { struct sk_buff *skb; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -1958,14 +1948,11 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } struct nft_rule_dump_ctx { @@ -2696,9 +2683,9 @@ nla_put_failure: return -1; } -static int nf_tables_set_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - int event, gfp_t gfp_flags) +static void nf_tables_set_notify(const struct nft_ctx *ctx, + const struct nft_set *set, int event, + gfp_t gfp_flags) { struct sk_buff *skb; u32 portid = ctx->portid; @@ -2706,9 +2693,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; @@ -2719,12 +2705,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, - ctx->report, gfp_flags); + nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, + gfp_flags); + return; err: - if (err < 0) - nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) @@ -3504,10 +3489,10 @@ nla_put_failure: return -1; } -static int nf_tables_setelem_notify(const struct nft_ctx *ctx, - const struct nft_set *set, - const struct nft_set_elem *elem, - int event, u16 flags) +static void nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) { struct net *net = ctx->net; u32 portid = ctx->portid; @@ -3515,9 +3500,8 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; @@ -3529,12 +3513,11 @@ static int nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, - GFP_KERNEL); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); + return; err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, @@ -4476,18 +4459,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return nft_delobj(&ctx, obj); } -int nft_obj_notify(struct net *net, struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) +void nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + int family, int report, gfp_t gfp) { struct sk_buff *skb; int err; if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb = nlmsg_new(NLMSG_GOODSIZE, gfp); if (skb == NULL) goto err; @@ -4499,21 +4481,18 @@ int nft_obj_notify(struct net *net, struct nft_table *table, goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + return; err: - if (err < 0) { - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); - } - return err; + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } EXPORT_SYMBOL_GPL(nft_obj_notify); -static int nf_tables_obj_notify(const struct nft_ctx *ctx, - struct nft_object *obj, int event) +static void nf_tables_obj_notify(const struct nft_ctx *ctx, + struct nft_object *obj, int event) { - return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, - ctx->seq, event, ctx->afi->family, ctx->report, - GFP_KERNEL); + nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, + ctx->afi->family, ctx->report, GFP_KERNEL); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, @@ -4543,7 +4522,8 @@ nla_put_failure: return -EMSGSIZE; } -static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, + int event) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; @@ -4551,9 +4531,8 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) if (nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) - return 0; + return; - err = -ENOBUFS; skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) goto err; @@ -4565,14 +4544,12 @@ static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) goto err; } - err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, - NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); + nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + nlmsg_report(nlh), GFP_KERNEL); + return; err: - if (err < 0) { - nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, - err); - } - return err; + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + -ENOBUFS); } static int nf_tables_getgen(struct net *net, struct sock *nlsk, -- cgit v1.2.3-58-ga151