diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
commit | b2fe5fa68642860e7de76167c3111623aa0d5de1 (patch) | |
tree | b7f9b89b7039ecefbc35fe3c8e73a6ff972641dd /net/rds | |
parent | a103950e0dd2058df5e8a8d4a915707bdcf205f0 (diff) | |
parent | a54667f6728c2714a400f3c884727da74b6d1717 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Significantly shrink the core networking routing structures. Result
of http://vger.kernel.org/~davem/seoul2017_netdev_keynote.pdf
2) Add netdevsim driver for testing various offloads, from Jakub
Kicinski.
3) Support cross-chip FDB operations in DSA, from Vivien Didelot.
4) Add a 2nd listener hash table for TCP, similar to what was done for
UDP. From Martin KaFai Lau.
5) Add eBPF based queue selection to tun, from Jason Wang.
6) Lockless qdisc support, from John Fastabend.
7) SCTP stream interleave support, from Xin Long.
8) Smoother TCP receive autotuning, from Eric Dumazet.
9) Lots of erspan tunneling enhancements, from William Tu.
10) Add true function call support to BPF, from Alexei Starovoitov.
11) Add explicit support for GRO HW offloading, from Michael Chan.
12) Support extack generation in more netlink subsystems. From Alexander
Aring, Quentin Monnet, and Jakub Kicinski.
13) Add 1000BaseX, flow control, and EEE support to mvneta driver. From
Russell King.
14) Add flow table abstraction to netfilter, from Pablo Neira Ayuso.
15) Many improvements and simplifications to the NFP driver bpf JIT,
from Jakub Kicinski.
16) Support for ipv6 non-equal cost multipath routing, from Ido
Schimmel.
17) Add resource abstration to devlink, from Arkadi Sharshevsky.
18) Packet scheduler classifier shared filter block support, from Jiri
Pirko.
19) Avoid locking in act_csum, from Davide Caratti.
20) devinet_ioctl() simplifications from Al viro.
21) More TCP bpf improvements from Lawrence Brakmo.
22) Add support for onlink ipv6 route flag, similar to ipv4, from David
Ahern.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1925 commits)
tls: Add support for encryption using async offload accelerator
ip6mr: fix stale iterator
net/sched: kconfig: Remove blank help texts
openvswitch: meter: Use 64-bit arithmetic instead of 32-bit
tcp_nv: fix potential integer overflow in tcpnv_acked
r8169: fix RTL8168EP take too long to complete driver initialization.
qmi_wwan: Add support for Quectel EP06
rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK
ipmr: Fix ptrdiff_t print formatting
ibmvnic: Wait for device response when changing MAC
qlcnic: fix deadlock bug
tcp: release sk_frag.page in tcp_disconnect
ipv4: Get the address of interface correctly.
net_sched: gen_estimator: fix lockdep splat
net: macb: Handle HRESP error
net/mlx5e: IPoIB, Fix copy-paste bug in flow steering refactoring
ipv6: addrconf: break critical section in addrconf_verify_rtnl()
ipv6: change route cache aging logic
i40e/i40evf: Update DESC_NEEDED value to reflect larger value
bnxt_en: cleanup DIM work on device shutdown
...
Diffstat (limited to 'net/rds')
-rw-r--r-- | net/rds/bind.c | 1 | ||||
-rw-r--r-- | net/rds/cong.c | 10 | ||||
-rw-r--r-- | net/rds/connection.c | 27 | ||||
-rw-r--r-- | net/rds/rds.h | 10 | ||||
-rw-r--r-- | net/rds/send.c | 37 | ||||
-rw-r--r-- | net/rds/tcp.c | 81 | ||||
-rw-r--r-- | net/rds/tcp.h | 1 | ||||
-rw-r--r-- | net/rds/tcp_connect.c | 2 | ||||
-rw-r--r-- | net/rds/tcp_recv.c | 8 | ||||
-rw-r--r-- | net/rds/tcp_send.c | 5 | ||||
-rw-r--r-- | net/rds/threads.c | 20 |
11 files changed, 128 insertions, 74 deletions
diff --git a/net/rds/bind.c b/net/rds/bind.c index 75d43dc8e96b..5aa3a64aa4f0 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -114,6 +114,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rs, &addr, (int)ntohs(*port)); break; } else { + rs->rs_bound_addr = 0; rds_sock_put(rs); ret = -ENOMEM; break; diff --git a/net/rds/cong.c b/net/rds/cong.c index 8398fee7c866..8d19fd25dce3 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -219,7 +219,11 @@ void rds_cong_queue_updates(struct rds_cong_map *map) spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { - if (!test_and_set_bit(0, &conn->c_map_queued)) { + struct rds_conn_path *cp = &conn->c_path[0]; + + rcu_read_lock(); + if (!test_and_set_bit(0, &conn->c_map_queued) && + !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): @@ -235,9 +239,9 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, - &conn->c_path[0].cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } + rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ee2d5d68b78..b10c0ef36d8d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -230,8 +230,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", conn, &laddr, &faddr, - trans->t_name ? trans->t_name : "[unknown]", - is_outgoing ? "(outgoing)" : ""); + strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : + "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -382,10 +382,13 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; + set_bit(RDS_DESTROY_PENDING, &cp->cp_flags); + if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ + synchronize_rcu(); cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); @@ -403,6 +406,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); + WARN_ON(delayed_work_pending(&cp->cp_send_w)); + WARN_ON(delayed_work_pending(&cp->cp_recv_w)); + WARN_ON(delayed_work_pending(&cp->cp_conn_w)); + WARN_ON(work_pending(&cp->cp_down_w)); + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } @@ -424,7 +432,6 @@ void rds_conn_destroy(struct rds_connection *conn) "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); - conn->c_destroy_in_prog = 1; /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); @@ -445,7 +452,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - put_net(conn->c_net); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); @@ -684,10 +690,13 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); - if (!destroy && cp->cp_conn->c_destroy_in_prog) + rcu_read_lock(); + if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + rcu_read_unlock(); return; - + } queue_work(rds_wq, &cp->cp_down_w); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); @@ -704,9 +713,15 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + rcu_read_unlock(); + return; + } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); diff --git a/net/rds/rds.h b/net/rds/rds.h index c349c71babff..374ae83b60d4 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -88,6 +88,7 @@ enum { #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 +#define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 @@ -139,8 +140,7 @@ struct rds_connection { __be32 c_faddr; unsigned int c_loopback:1, c_ping_triggered:1, - c_destroy_in_prog:1, - c_pad_to_32:29; + c_pad_to_32:30; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -150,7 +150,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; - struct net *c_net; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; @@ -165,13 +165,13 @@ struct rds_connection { static inline struct net *rds_conn_net(struct rds_connection *conn) { - return conn->c_net; + return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { - conn->c_net = get_net(net); + write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 diff --git a/net/rds/send.c b/net/rds/send.c index f72466c63f0c..d3e32d1f3c7d 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -162,6 +162,12 @@ restart: goto out; } + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + release_in_xmit(cp); + ret = -ENETUNREACH; /* dont requeue send work */ + goto out; + } + /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use @@ -437,7 +443,12 @@ over_batch: !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); } @@ -1151,6 +1162,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) else cpath = &conn->c_path[0]; + if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) { + ret = -EAGAIN; + goto out; + } + rds_conn_path_connect_if_down(cpath); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); @@ -1190,9 +1206,17 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rds_stats_inc(s_send_queued); ret = rds_send_xmit(cpath); - if (ret == -ENOMEM || ret == -EAGAIN) - queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); - + if (ret == -ENOMEM || ret == -EAGAIN) { + ret = 0; + rcu_read_lock(); + if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); + rcu_read_unlock(); + } + if (ret) + goto out; rds_message_put(rm); return payload_len; @@ -1270,7 +1294,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); rds_message_put(rm); return 0; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index ab7356e0ba83..9920d2f84eff 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -271,16 +271,33 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) return -EADDRNOTAVAIL; } +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + if (!tc->t_tcp_node_detached) + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_tcp_connection *tc; - int i; + int i, j; + int ret = 0; for (i = 0; i < RDS_MPATH_WORKERS; i++) { tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (!tc) - return -ENOMEM; - + if (!tc) { + ret = -ENOMEM; + break; + } mutex_init(&tc->t_conn_path_lock); tc->t_sock = NULL; tc->t_tinc = NULL; @@ -291,26 +308,17 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) tc->t_cpath = &conn->c_path[i]; spin_lock_irq(&rds_tcp_conn_lock); + tc->t_tcp_node_detached = false; list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); spin_unlock_irq(&rds_tcp_conn_lock); rdsdebug("rds_conn_path [%d] tc %p\n", i, conn->c_path[i].cp_transport_data); } - - return 0; -} - -static void rds_tcp_conn_free(void *arg) -{ - struct rds_tcp_connection *tc = arg; - unsigned long flags; - rdsdebug("freeing tc %p\n", tc); - - spin_lock_irqsave(&rds_tcp_conn_lock, flags); - list_del(&tc->t_tcp_node); - spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); - - kmem_cache_free(rds_tcp_conn_slab, tc); + if (ret) { + for (j = 0; j < i; j++) + rds_tcp_conn_free(conn->c_path[j].cp_transport_data); + } + return ret; } static bool list_has_conn(struct list_head *list, struct rds_connection *conn) @@ -496,27 +504,6 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; -/* explicitly send a RST on each socket, thereby releasing any socket refcnts - * that may otherwise hold up netns deletion. - */ -static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) -{ - struct rds_conn_path *cp; - struct rds_tcp_connection *tc; - int i; - struct sock *sk; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - cp = &conn->c_path[i]; - tc = cp->cp_transport_data; - if (!tc->t_sock) - continue; - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); - } -} - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -528,18 +515,20 @@ static void rds_tcp_kill_sock(struct net *net) rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { list_move_tail(&tc->t_tcp_node, &tmp_list); + } else { + list_del(&tc->t_tcp_node); + tc->t_tcp_node_detached = true; + } } spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); - } } void *rds_tcp_listen_sock_def_readable(struct net *net) @@ -587,7 +576,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = tc->t_cpath->cp_conn->c_net; + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 864ca7d8f019..c6fa080e9b6d 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,7 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; + bool t_tcp_node_detached; struct rds_conn_path *t_cpath; /* t_conn_path_lock synchronizes the connection establishment between * rds_tcp_accept_one and rds_tcp_conn_path_connect diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 46f74dad0e16..534c67aeb20f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) cp->cp_conn, tc, sock); if (sock) { - if (cp->cp_conn->c_destroy_in_prog) + if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) rds_tcp_set_linger(sock); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index e006ef8e6d40..dd707b9e73e5 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -321,8 +321,12 @@ void rds_tcp_data_ready(struct sock *sk) ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + rcu_read_unlock(); + } out: read_unlock_bh(&sk->sk_callback_lock); ready(sk); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 9b76e0fa1722..16f65744d984 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -202,8 +202,11 @@ void rds_tcp_write_space(struct sock *sk) tc->t_last_seen_una = rds_tcp_snd_una(tc); rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + rcu_read_lock(); + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && + !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + rcu_read_unlock(); out: read_unlock_bh(&sk->sk_callback_lock); diff --git a/net/rds/threads.c b/net/rds/threads.c index f121daa402c8..eb76db1360b0 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -87,8 +87,12 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_connect_path_complete); @@ -133,7 +137,10 @@ void rds_queue_reconnect(struct rds_conn_path *cp) set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); return; } @@ -141,8 +148,11 @@ void rds_queue_reconnect(struct rds_conn_path *cp) rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); - queue_delayed_work(rds_wq, &cp->cp_conn_w, - rand % cp->cp_reconnect_jiffies); + rcu_read_lock(); + if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, + rand % cp->cp_reconnect_jiffies); + rcu_read_unlock(); cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); |