summaryrefslogtreecommitdiff
path: root/net/sunrpc/xprtrdma/verbs.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-10 13:03:38 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-10 13:03:38 -0700
commit73ccb023a2f25b72c4b95499ca24760588014614 (patch)
treeb0fd9968af3e929ac496f159420a25dc3e1dcafb /net/sunrpc/xprtrdma/verbs.c
parentf94c128eefcce2e3448d543f13cd7d7b8aa660a5 (diff)
parent76b2a303384e1d6299c3a0249f0f0ce2f8f96017 (diff)
Merge tag 'nfs-for-4.12-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust: "Highlights include: Stable bugfixes: - Fix use after free in write error path - Use GFP_NOIO for two allocations in writeback - Fix a hang in OPEN related to server reboot - Check the result of nfs4_pnfs_ds_connect - Fix an rcu lock leak Features: - Removal of the unmaintained and unused OSD pNFS layout - Cleanup and removal of lots of unnecessary dprintk()s - Cleanup and removal of some memory failure paths now that GFP_NOFS is guaranteed to never fail. - Remove the v3-only data server limitation on pNFS/flexfiles Bugfixes: - RPC/RDMA connection handling bugfixes - Copy offload: fixes to ensure the copied data is COMMITed to disk. - Readdir: switch back to using the ->iterate VFS interface - File locking fixes from Ben Coddington - Various use-after-free and deadlock issues in pNFS - Write path bugfixes" * tag 'nfs-for-4.12-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (89 commits) pNFS/flexfiles: Always attempt to call layoutstats when flexfiles is enabled NFSv4.1: Work around a Linux server bug... NFS append COMMIT after synchronous COPY NFSv4: Fix exclusive create attributes encoding NFSv4: Fix an rcu lock leak nfs: use kmap/kunmap directly NFS: always treat the invocation of nfs_getattr as cache hit when noac is on Fix nfs_client refcounting if kmalloc fails in nfs4_proc_exchange_id and nfs4_proc_async_renew NFSv4.1: RECLAIM_COMPLETE must handle NFS4ERR_CONN_NOT_BOUND_TO_SESSION pNFS: Fix NULL dereference in pnfs_generic_alloc_ds_commits pNFS: Fix a typo in pnfs_generic_alloc_ds_commits pNFS: Fix a deadlock when coalescing writes and returning the layout pNFS: Don't clear the layout return info if there are segments to return pNFS: Ensure we commit the layout if it has been invalidated pNFS: Don't send COMMITs to the DSes if the server invalidated our layout pNFS/flexfiles: Fix up the ff_layout_write_pagelist failure path pNFS: Ensure we check layout validity before marking it for return NFS4.1 handle interrupted slot reuse from ERR_DELAY NFSv4: check return value of xdr_inline_decode nfs/filelayout: fix NULL pointer dereference in fl_pnfs_update_layout() ...
Diffstat (limited to 'net/sunrpc/xprtrdma/verbs.c')
-rw-r--r--net/sunrpc/xprtrdma/verbs.c323
1 files changed, 218 insertions, 105 deletions
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b332b395045..3dbce9ac4327 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,7 +53,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
-#include <linux/module.h> /* try_module_get()/module_put() */
+
#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -69,8 +69,11 @@
/*
* internal functions
*/
+static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
+static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
-static struct workqueue_struct *rpcrdma_receive_wq;
+static struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
int
rpcrdma_alloc_wq(void)
@@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
- ib_dma_sync_single_for_cpu(rep->rr_device,
+ ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
__func__, ep);
complete(&ia->ri_done);
break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: removing device for %pIS:%u\n",
+ sap, rpc_get_port(sap));
+#endif
+ set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
+ ep->rep_connected = -ENODEV;
+ xprt_force_disconnect(&xprt->rx_xprt);
+ wait_for_completion(&ia->ri_remove_done);
+
+ ia->ri_id = NULL;
+ ia->ri_pd = NULL;
+ ia->ri_device = NULL;
+ /* Return 1 to ensure the core destroys the id. */
+ return 1;
case RDMA_CM_EVENT_ESTABLISHED:
connstate = 1;
ib_query_qp(ia->ri_id->qp, attr,
@@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
- goto connected;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- connstate = -ENODEV;
connected:
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
@@ -329,14 +344,6 @@ connected:
return 0;
}
-static void rpcrdma_destroy_id(struct rdma_cm_id *id)
-{
- if (id) {
- module_put(id->device->owner);
- rdma_destroy_id(id);
- }
-}
-
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
int rc;
init_completion(&ia->ri_done);
+ init_completion(&ia->ri_remove_done);
id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
IB_QPT_RC);
@@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
goto out;
}
- /* FIXME:
- * Until xprtrdma supports DEVICE_REMOVAL, the provider must
- * be pinned while there are active NFS/RDMA mounts to prevent
- * hangs and crashes at umount time.
- */
- if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
- dprintk("RPC: %s: Failed to get device module\n",
- __func__);
- ia->ri_async_rc = -ENODEV;
- }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
dprintk("RPC: %s: wait() exited: %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = ia->ri_async_rc;
if (rc)
- goto put;
+ goto out;
return id;
-put:
- module_put(id->device->owner);
+
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -413,13 +410,16 @@ out:
* Exported functions.
*/
-/*
- * Open and initialize an Interface Adapter.
- * o initializes fields of struct rpcrdma_ia, including
- * interface and provider attributes and protection zone.
+/**
+ * rpcrdma_ia_open - Open and initialize an Interface Adapter.
+ * @xprt: controlling transport
+ * @addr: IP address of remote peer
+ *
+ * Returns 0 on success, negative errno if an appropriate
+ * Interface Adapter could not be found and opened.
*/
int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
@@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
- goto out1;
+ goto out_err;
}
ia->ri_device = ia->ri_id->device;
@@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out2;
+ goto out_err;
}
- switch (memreg) {
+ switch (xprt_rdma_memreg_strategy) {
case RPCRDMA_FRMR:
if (frwr_is_supported(ia)) {
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
@@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*FALLTHROUGH*/
default:
- pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
- memreg);
+ pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
+ ia->ri_device->name, xprt_rdma_memreg_strategy);
rc = -EINVAL;
- goto out3;
+ goto out_err;
}
return 0;
-out3:
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-out2:
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
-out1:
+out_err:
+ rpcrdma_ia_close(ia);
return rc;
}
-/*
- * Clean up/close an IA.
- * o if event handles and PD have been initialized, free them.
- * o close the IA
+/**
+ * rpcrdma_ia_remove - Handle device driver unload
+ * @ia: interface adapter being removed
+ *
+ * Divest transport H/W resources associated with this adapter,
+ * but allow it to be restored later.
+ */
+void
+rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
+
+ /* This is similar to rpcrdma_ep_destroy, but:
+ * - Don't cancel the connect worker.
+ * - Don't call rpcrdma_ep_disconnect, which waits
+ * for another conn upcall, which will deadlock.
+ * - rdma_disconnect is unneeded, the underlying
+ * connection is already gone.
+ */
+ if (ia->ri_id->qp) {
+ ib_drain_qp(ia->ri_id->qp);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
+ }
+ ib_free_cq(ep->rep_attr.recv_cq);
+ ib_free_cq(ep->rep_attr.send_cq);
+
+ /* The ULP is responsible for ensuring all DMA
+ * mappings and MRs are gone.
+ */
+ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
+ rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
+ }
+ rpcrdma_destroy_mrs(buf);
+
+ /* Allow waiters to continue */
+ complete(&ia->ri_remove_done);
+}
+
+/**
+ * rpcrdma_ia_close - Clean up/close an IA.
+ * @ia: interface adapter to close
+ *
*/
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
@@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
+ rdma_destroy_id(ia->ri_id);
}
+ ia->ri_id = NULL;
+ ia->ri_device = NULL;
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
}
/*
@@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.send_cq);
}
+/* Re-establish a connection after a device removal event.
+ * Unlike a normal reconnection, a fresh PD and a new set
+ * of MRs and buffers is needed.
+ */
+static int
+rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ int rc, err;
+
+ pr_info("%s: r_xprt = %p\n", __func__, r_xprt);
+
+ rc = -EHOSTUNREACH;
+ if (rpcrdma_ia_open(r_xprt, sap))
+ goto out1;
+
+ rc = -ENOMEM;
+ err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
+ if (err) {
+ pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
+ goto out2;
+ }
+
+ rc = -ENETUNREACH;
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
+ goto out3;
+ }
+
+ rpcrdma_create_mrs(r_xprt);
+ return 0;
+
+out3:
+ rpcrdma_ep_destroy(ep, ia);
+out2:
+ rpcrdma_ia_close(ia);
+out1:
+ return rc;
+}
+
+static int
+rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
+ struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ struct rdma_cm_id *id, *old;
+ int err, rc;
+
+ dprintk("RPC: %s: reconnecting...\n", __func__);
+
+ rpcrdma_ep_disconnect(ep, ia);
+
+ rc = -EHOSTUNREACH;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
+ if (IS_ERR(id))
+ goto out;
+
+ /* As long as the new ID points to the same device as the
+ * old ID, we can reuse the transport's existing PD and all
+ * previously allocated MRs. Also, the same device means
+ * the transport's previous DMA mappings are still valid.
+ *
+ * This is a sanity check only. There should be no way these
+ * point to two different devices here.
+ */
+ old = id;
+ rc = -ENETUNREACH;
+ if (ia->ri_device != id->device) {
+ pr_err("rpcrdma: can't reconnect on different device!\n");
+ goto out_destroy;
+ }
+
+ err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ dprintk("RPC: %s: rdma_create_qp returned %d\n",
+ __func__, err);
+ goto out_destroy;
+ }
+
+ /* Atomically replace the transport's ID and QP. */
+ rc = 0;
+ old = ia->ri_id;
+ ia->ri_id = id;
+ rdma_destroy_qp(old);
+
+out_destroy:
+ rdma_destroy_id(old);
+out:
+ return rc;
+}
+
/*
* Connect unconnected endpoint.
*/
@@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- struct rdma_cm_id *id, *old;
- struct sockaddr *sap;
unsigned int extras;
- int rc = 0;
+ int rc;
- if (ep->rep_connected != 0) {
retry:
- dprintk("RPC: %s: reconnecting...\n", __func__);
-
- rpcrdma_ep_disconnect(ep, ia);
-
- sap = (struct sockaddr *)&r_xprt->rx_data.addr;
- id = rpcrdma_create_id(r_xprt, ia, sap);
- if (IS_ERR(id)) {
- rc = -EHOSTUNREACH;
- goto out;
- }
- /* TEMP TEMP TEMP - fail if new device:
- * Deregister/remarshal *all* requests!
- * Close and recreate adapter, pd, etc!
- * Re-determine all attributes still sane!
- * More stuff I haven't thought of!
- * Rrrgh!
- */
- if (ia->ri_device != id->device) {
- printk("RPC: %s: can't reconnect on "
- "different device!\n", __func__);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
- /* END TEMP */
- rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
-
- old = ia->ri_id;
- ia->ri_id = id;
-
- rdma_destroy_qp(old);
- rpcrdma_destroy_id(old);
- } else {
+ switch (ep->rep_connected) {
+ case 0:
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- /* do not update ep->rep_connected */
- return -ENETUNREACH;
+ rc = -ENETUNREACH;
+ goto out_noupdate;
}
+ break;
+ case -ENODEV:
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ if (rc)
+ goto out_noupdate;
+ break;
+ default:
+ rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ if (rc)
+ goto out;
}
ep->rep_connected = 0;
@@ -736,6 +845,8 @@ retry:
out:
if (rc)
ep->rep_connected = rc;
+
+out_noupdate:
return rc;
}
@@ -878,7 +989,6 @@ struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_rep *rep;
int rc;
@@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_device = ia->ri_device;
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
@@ -1037,6 +1146,7 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
cancel_delayed_work_sync(&buf->rb_recovery_worker);
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
out_nomws:
dprintk("RPC: %s: no MWs available\n", __func__);
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
+ if (r_xprt->rx_ep.rep_connected != -ENODEV)
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
/* Allow the reply handler and refresh worker to run */
cond_resched();
@@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
+ struct ib_device *device = ia->ri_device;
+
if (rb->rg_direction == DMA_NONE)
return false;
- rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ rb->rg_iov.addr = ib_dma_map_single(device,
(void *)rb->rg_base,
rdmab_length(rb),
rb->rg_direction);
- if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ if (ib_dma_mapping_error(device, rdmab_addr(rb)))
return false;
- rb->rg_device = ia->ri_device;
+ rb->rg_device = device;
rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
return true;
}