Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--  drivers/nvme/host/core.c       |  37
-rw-r--r--  drivers/nvme/host/fabrics.c    |   1
-rw-r--r--  drivers/nvme/host/fabrics.h    |  30
-rw-r--r--  drivers/nvme/host/fc.c         |  22
-rw-r--r--  drivers/nvme/host/multipath.c  |   2
-rw-r--r--  drivers/nvme/host/nvme.h       |  14
-rw-r--r--  drivers/nvme/host/pci.c        |  83
-rw-r--r--  drivers/nvme/host/rdma.c       | 280
8 files changed, 267 insertions(+), 202 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 25da74d310d1..839650e0926a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1287,7 +1287,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- queue->limits.discard_alignment = size;
+ queue->limits.discard_alignment = 0;
queue->limits.discard_granularity = size;
blk_queue_max_discard_sectors(queue, UINT_MAX);
@@ -1335,6 +1335,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ unsigned short bs = 1 << ns->lba_shift;
unsigned stream_alignment = 0;
if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
@@ -1343,7 +1344,10 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
- blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+ blk_queue_logical_block_size(disk->queue, bs);
+ blk_queue_physical_block_size(disk->queue, bs);
+ blk_queue_io_min(disk->queue, bs);
+
if (ns->ms && !ns->ext &&
(ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns->ms, ns->pi_type);
@@ -1449,19 +1453,19 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
int srcu_idx, ret;
u8 data[16] = { 0, };
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ return -EWOULDBLOCK;
+
put_unaligned_le64(key, &data[0]);
put_unaligned_le64(sa_key, &data[8]);
memset(&c, 0, sizeof(c));
c.common.opcode = op;
- c.common.nsid = cpu_to_le32(head->ns_id);
+ c.common.nsid = cpu_to_le32(ns->head->ns_id);
c.common.cdw10[0] = cpu_to_le32(cdw10);
- ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
- if (unlikely(!ns))
- ret = -EWOULDBLOCK;
- else
- ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+ ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
@@ -1705,7 +1709,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+ is_power_of_2(ctrl->max_hw_sectors))
blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
blk_queue_virt_boundary(q, ctrl->page_size - 1);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
@@ -2869,7 +2874,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- nvme_setup_streams_ns(ctrl, ns);
id = nvme_identify_ns(ctrl, nsid);
if (!id)
@@ -2880,6 +2884,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (nvme_init_ns_head(ns, nsid, id, &new))
goto out_free_id;
+ nvme_setup_streams_ns(ctrl, ns);
#ifdef CONFIG_NVME_MULTIPATH
/*
@@ -2961,14 +2966,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
static void nvme_ns_remove(struct nvme_ns *ns)
{
- struct nvme_ns_head *head = ns->head;
-
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
- if (blk_get_integrity(ns->disk))
- blk_integrity_unregister(ns->disk);
nvme_mpath_remove_disk_links(ns);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_id_attr_group);
@@ -2976,19 +2977,21 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
+ if (blk_get_integrity(ns->disk))
+ blk_integrity_unregister(ns->disk);
}
mutex_lock(&ns->ctrl->subsys->lock);
nvme_mpath_clear_current_path(ns);
- if (head)
- list_del_rcu(&ns->siblings);
+ list_del_rcu(&ns->siblings);
mutex_unlock(&ns->ctrl->subsys->lock);
mutex_lock(&ns->ctrl->namespaces_mutex);
list_del_init(&ns->list);
mutex_unlock(&ns->ctrl->namespaces_mutex);
- synchronize_srcu(&head->srcu);
+ synchronize_srcu(&ns->head->srcu);
+ nvme_mpath_check_last_path(ns);
nvme_put_ns(ns);
}
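
For reference, the block layer treats discard_alignment as an offset (how far the first discard granule sits from the start of the device), not as a granule size, which is why it is now zeroed while discard_granularity keeps carrying the allocation unit; the same set of hunks also derives the physical block size and minimum I/O size from the LBA format. A minimal illustrative sketch of the resulting limit setup, with hypothetical names and values, could look like this:

#include <linux/blkdev.h>

/* Sketch only: mirrors the queue-limit setup from the core.c hunks above. */
static void example_nvme_set_ns_limits(struct request_queue *q,
		unsigned int lba_shift, unsigned int discard_granule_bytes)
{
	unsigned int bs = 1 << lba_shift;	/* e.g. 4096 for lba_shift == 12 */

	/* Logical, physical and minimum I/O size all follow the LBA format. */
	blk_queue_logical_block_size(q, bs);
	blk_queue_physical_block_size(q, bs);
	blk_queue_io_min(q, bs);

	/* Granularity is the discard unit; alignment is an offset, hence 0. */
	q->limits.discard_granularity = discard_granule_bytes;
	q->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(q, UINT_MAX);
}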
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 76b4fe6816a0..894c2ccb3891 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -74,6 +74,7 @@ static struct nvmf_host *nvmf_host_default(void)
return NULL;
kref_init(&host->ref);
+ uuid_gen(&host->id);
snprintf(host->nqn, NVMF_NQN_SIZE,
"nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 42232e731f19..9ba614953607 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -156,4 +156,34 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
+static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl,
+ struct request *rq)
+{
+ struct nvme_command *cmd = nvme_req(rq)->cmd;
+
+ /*
+ * We cannot accept any other command until the connect command has
+ * completed, so only allow connect to pass.
+ */
+ if (!blk_rq_is_passthrough(rq) ||
+ cmd->common.opcode != nvme_fabrics_command ||
+ cmd->fabrics.fctype != nvme_fabrics_type_connect) {
+ /*
+ * Reconnecting state means transport disruption, which can take
+ * a long time and might even fail permanently; fail fast to
+ * give upper layers a chance to fail over.
+ * Deleting state means that the ctrl will never accept commands
+ * again; fail it permanently.
+ */
+ if (ctrl->state == NVME_CTRL_RECONNECTING ||
+ ctrl->state == NVME_CTRL_DELETING) {
+ nvme_req(rq)->status = NVME_SC_ABORT_REQ;
+ return BLK_STS_IOERR;
+ }
+ return BLK_STS_RESOURCE; /* try again later */
+ }
+
+ return BLK_STS_OK;
+}
+
#endif /* _NVME_FABRICS_H */
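
nvmf_check_init_req() is meant to be called from a transport's ->queue_rq() path while the target queue is not yet live, as the fc.c and rdma.c hunks below do. A minimal sketch of that calling pattern, using hypothetical queue and flag names, might look like:

#include <linux/blk-mq.h>

/* Illustrative per-queue state; real transports keep their own flavour. */
struct example_queue {
	struct nvme_ctrl	*ctrl;
	unsigned long		flags;
};
#define EXAMPLE_Q_LIVE	0

static inline blk_status_t example_queue_is_ready(struct example_queue *queue,
		struct request *rq)
{
	/* Until the Connect command completes, defer to the shared helper. */
	if (unlikely(!test_bit(EXAMPLE_Q_LIVE, &queue->flags)))
		return nvmf_check_init_req(queue->ctrl, rq);
	return BLK_STS_OK;
}

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct example_queue *queue = hctx->driver_data;
	blk_status_t ret;

	ret = example_queue_is_ready(queue, bd->rq);
	if (unlikely(ret))
		return ret;	/* BLK_STS_RESOURCE or BLK_STS_IOERR */

	/* ... normal command setup and submission ... */
	return BLK_STS_OK;
}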
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 7ab0be55c7d0..794e66e4aa20 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -31,7 +31,8 @@
enum nvme_fc_queue_flags {
- NVME_FC_Q_CONNECTED = (1 << 0),
+ NVME_FC_Q_CONNECTED = 0,
+ NVME_FC_Q_LIVE,
};
#define NVMEFC_QUEUE_DELAY 3 /* ms units */
@@ -1927,6 +1928,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue)
if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags))
return;
+ clear_bit(NVME_FC_Q_LIVE, &queue->flags);
/*
* Current implementation never disconnects a single queue.
* It always terminates a whole association. So there is never
@@ -1934,7 +1936,6 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue)
*/
queue->connection_id = 0;
- clear_bit(NVME_FC_Q_CONNECTED, &queue->flags);
}
static void
@@ -2013,6 +2014,8 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
if (ret)
break;
+
+ set_bit(NVME_FC_Q_LIVE, &ctrl->queues[i].flags);
}
return ret;
@@ -2320,6 +2323,14 @@ busy:
return BLK_STS_RESOURCE;
}
+static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,
+ struct request *rq)
+{
+ if (unlikely(!test_bit(NVME_FC_Q_LIVE, &queue->flags)))
+ return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
+ return BLK_STS_OK;
+}
+
static blk_status_t
nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
@@ -2335,6 +2346,10 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
u32 data_len;
blk_status_t ret;
+ ret = nvme_fc_is_ready(queue, rq);
+ if (unlikely(ret))
+ return ret;
+
ret = nvme_setup_cmd(ns, rq, sqe);
if (ret)
return ret;
@@ -2727,6 +2742,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_disconnect_admin_queue;
+ set_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags);
+
/*
* Check controller capabilities
*
@@ -3204,7 +3221,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
/* Remove core ctrl ref. */
nvme_put_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 78d92151a904..1218a9fca846 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -131,7 +131,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
bio->bi_opf |= REQ_NVME_MPATH;
ret = direct_make_request(bio);
} else if (!list_empty_careful(&head->list)) {
- dev_warn_ratelimited(dev, "no path available - requeing I/O\n");
+ dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
spin_lock_irq(&head->requeue_lock);
bio_list_add(&head->requeue_list, bio);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c0873a68872f..a00eabd06427 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -114,7 +114,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
* NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
* found empirically.
*/
-#define NVME_QUIRK_DELAY_AMOUNT 2000
+#define NVME_QUIRK_DELAY_AMOUNT 2300
enum nvme_ctrl_state {
NVME_CTRL_NEW,
@@ -417,6 +417,15 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
rcu_assign_pointer(head->current_path, NULL);
}
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+ struct nvme_ns_head *head = ns->head;
+
+ if (head->disk && list_empty(&head->list))
+ kblockd_schedule_work(&head->requeue_work);
+}
+
#else
static inline void nvme_failover_req(struct request *req)
{
@@ -448,6 +457,9 @@ static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
}
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a11cfd470089..4276ebfff22b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -448,12 +448,34 @@ static void **nvme_pci_iod_list(struct request *req)
return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ int nseg = blk_rq_nr_phys_segments(req);
+ unsigned int avg_seg_size;
+
+ if (nseg == 0)
+ return false;
+
+ avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+
+ if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+ return false;
+ if (!iod->nvmeq->qid)
+ return false;
+ if (!sgl_threshold || avg_seg_size < sgl_threshold)
+ return false;
+ return true;
+}
+
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
int nseg = blk_rq_nr_phys_segments(rq);
unsigned int size = blk_rq_payload_bytes(rq);
+ iod->use_sgl = nvme_pci_use_sgls(dev, rq);
+
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
iod->use_sgl);
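
nvme_pci_use_sgls() now makes the SGL-versus-PRP choice once, up front: SGLs are used only if the controller advertises SGL support, the queue is an I/O queue, and the average segment size reaches sgl_threshold. For example, assuming a threshold of 32 KiB, a 256 KiB request mapped into 4 segments averages 64 KiB per segment and takes the SGL path, while the same payload split across 64 segments averages 4 KiB and stays on PRPs. A standalone sketch of that decision, with the inputs made explicit and illustrative names, is:

#include <linux/kernel.h>	/* DIV_ROUND_UP() */
#include <linux/sizes.h>

/* Sketch only: the same decision as nvme_pci_use_sgls(), state passed in. */
static bool example_use_sgls(u32 ctrl_sgls, int qid, unsigned int sgl_threshold,
		unsigned int payload_bytes, int nseg)
{
	unsigned int avg_seg_size;

	if (nseg == 0)
		return false;
	avg_seg_size = DIV_ROUND_UP(payload_bytes, nseg);

	if (!(ctrl_sgls & ((1 << 0) | (1 << 1))))	/* no SGL support */
		return false;
	if (!qid)					/* never on the admin queue */
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

/*
 * example_use_sgls(sgls, 1, SZ_32K, SZ_256K, 4)  -> true  (64 KiB per segment)
 * example_use_sgls(sgls, 1, SZ_32K, SZ_256K, 64) -> false ( 4 KiB per segment)
 */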
@@ -604,8 +626,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
dma_addr_t prp_dma;
int nprps, i;
- iod->use_sgl = false;
-
length -= (page_size - offset);
if (length <= 0) {
iod->first_dma = 0;
@@ -705,22 +725,19 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
}
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmd)
+ struct request *req, struct nvme_rw_command *cmd, int entries)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int length = blk_rq_payload_bytes(req);
struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sg;
- int entries = iod->nents, i = 0;
dma_addr_t sgl_dma;
-
- iod->use_sgl = true;
+ int i = 0;
/* setting the transfer type as SGL */
cmd->flags = NVME_CMD_SGL_METABUF;
- if (length == sg_dma_len(sg)) {
+ if (entries == 1) {
nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
return BLK_STS_OK;
}
@@ -760,33 +777,12 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
}
nvme_pci_sgl_set_data(&sg_list[i++], sg);
-
- length -= sg_dma_len(sg);
sg = sg_next(sg);
- entries--;
- } while (length > 0);
+ } while (--entries > 0);
- WARN_ON(entries > 0);
return BLK_STS_OK;
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int avg_seg_size;
-
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
- blk_rq_nr_phys_segments(req));
-
- if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
- return false;
- if (!iod->nvmeq->qid)
- return false;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
- return true;
-}
-
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
@@ -795,6 +791,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
enum dma_data_direction dma_dir = rq_data_dir(req) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE;
blk_status_t ret = BLK_STS_IOERR;
+ int nr_mapped;
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
iod->nents = blk_rq_map_sg(q, req, iod->sg);
@@ -802,12 +799,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out;
ret = BLK_STS_RESOURCE;
- if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
- DMA_ATTR_NO_WARN))
+ nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
+ DMA_ATTR_NO_WARN);
+ if (!nr_mapped)
goto out;
- if (nvme_pci_use_sgls(dev, req))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ if (iod->use_sgl)
+ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
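
dma_map_sg_attrs() returns the number of DMA-mapped entries, which can be smaller than iod->nents when an IOMMU merges adjacent segments, and that mapped count is what the SGL builder has to walk — hence it is now passed to nvme_pci_setup_sgls() as 'entries'. A minimal sketch of iterating the DMA view of a scatterlist after mapping, with illustrative names:

#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>

/* Sketch only: walk the nr_mapped entries produced by dma_map_sg(). */
static void example_walk_mapped_sgl(struct scatterlist *sgl, int nr_mapped)
{
	struct scatterlist *sg = sgl;
	int i;

	for (i = 0; i < nr_mapped; i++, sg = sg_next(sg)) {
		dma_addr_t addr = sg_dma_address(sg);	/* bus address of the entry */
		unsigned int len = sg_dma_len(sg);	/* possibly merged length */

		/* ... emit one SGL data descriptor per mapped entry ... */
		(void)addr;
		(void)len;
	}
}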
@@ -1759,6 +1757,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
dev->host_mem_descs, dev->host_mem_descs_dma);
dev->host_mem_descs = NULL;
+ dev->nr_host_mem_descs = 0;
}
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
@@ -1787,7 +1786,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
if (!bufs)
goto out_free_descs;
- for (size = 0; size < preferred; size += len) {
+ for (size = 0; size < preferred && i < max_entries; size += len) {
dma_addr_t dma_addr;
len = min_t(u64, chunk_size, preferred - size);
@@ -2428,7 +2427,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
return -ENODEV;
}
-static unsigned long check_dell_samsung_bug(struct pci_dev *pdev)
+static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
/*
@@ -2443,6 +2442,14 @@ static unsigned long check_dell_samsung_bug(struct pci_dev *pdev)
(dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
return NVME_QUIRK_NO_DEEPEST_PS;
+ } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
+ /*
+ * Samsung SSD 960 EVO drops off the PCIe bus after system
+ * suspend on a Ryzen board, ASUS PRIME B350M-A.
+ */
+ if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
+ dmi_match(DMI_BOARD_NAME, "PRIME B350M-A"))
+ return NVME_QUIRK_NO_APST;
}
return 0;
@@ -2482,7 +2489,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto unmap;
- quirks |= check_dell_samsung_bug(pdev);
+ quirks |= check_vendor_combination_bug(pdev);
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
@@ -2665,6 +2672,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+ { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */
+ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4f9bf2f815c3..2a0bba7f50cf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <rdma/mr_pool.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/atomic.h>
@@ -59,6 +60,9 @@ struct nvme_rdma_request {
struct nvme_request req;
struct ib_mr *mr;
struct nvme_rdma_qe sqe;
+ union nvme_result result;
+ __le16 status;
+ refcount_t ref;
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
u32 num_sge;
int nents;
@@ -73,11 +77,11 @@ struct nvme_rdma_request {
enum nvme_rdma_queue_flags {
NVME_RDMA_Q_ALLOCATED = 0,
NVME_RDMA_Q_LIVE = 1,
+ NVME_RDMA_Q_TR_READY = 2,
};
struct nvme_rdma_queue {
struct nvme_rdma_qe *rsp_ring;
- atomic_t sig_count;
int queue_size;
size_t cmnd_capsule_len;
struct nvme_rdma_ctrl *ctrl;
@@ -258,32 +262,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
return ret;
}
-static int nvme_rdma_reinit_request(void *data, struct request *rq)
-{
- struct nvme_rdma_ctrl *ctrl = data;
- struct nvme_rdma_device *dev = ctrl->device;
- struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int ret = 0;
-
- if (WARN_ON_ONCE(!req->mr))
- return 0;
-
- ib_dereg_mr(req->mr);
-
- req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
- ctrl->max_fr_pages);
- if (IS_ERR(req->mr)) {
- ret = PTR_ERR(req->mr);
- req->mr = NULL;
- goto out;
- }
-
- req->mr->need_inval = false;
-
-out:
- return ret;
-}
-
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
@@ -293,9 +271,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
struct nvme_rdma_device *dev = queue->device;
- if (req->mr)
- ib_dereg_mr(req->mr);
-
nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
DMA_TO_DEVICE);
}
@@ -317,21 +292,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
if (ret)
return ret;
- req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
- ctrl->max_fr_pages);
- if (IS_ERR(req->mr)) {
- ret = PTR_ERR(req->mr);
- goto out_free_qe;
- }
-
req->queue = queue;
return 0;
-
-out_free_qe:
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- return -ENOMEM;
}
static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -428,10 +391,23 @@ out_err:
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
+ struct nvme_rdma_device *dev;
+ struct ib_device *ibdev;
- rdma_destroy_qp(queue->cm_id);
+ if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
+ return;
+
+ dev = queue->device;
+ ibdev = dev->dev;
+
+ ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
+
+ /*
+ * The cm_id object might already have been destroyed during the RDMA
+ * connection establishment error flow (to avoid receiving further cma
+ * events), so the QP must not be destroyed through the rdma_cm API.
+ */
+ ib_destroy_qp(queue->qp);
ib_free_cq(queue->ib_cq);
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
@@ -440,6 +416,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
nvme_rdma_dev_put(dev);
}
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+{
+ return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
+ ibdev->attrs.max_fast_reg_page_list_len);
+}
+
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
{
struct ib_device *ibdev;
@@ -482,8 +464,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
goto out_destroy_qp;
}
+ ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
+ queue->queue_size,
+ IB_MR_TYPE_MEM_REG,
+ nvme_rdma_get_max_fr_pages(ibdev));
+ if (ret) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to initialize MR pool sized %d for QID %d\n",
+ queue->queue_size, idx);
+ goto out_destroy_ring;
+ }
+
+ set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
+
return 0;
+out_destroy_ring:
+ nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
+ sizeof(struct nvme_completion), DMA_FROM_DEVICE);
out_destroy_qp:
rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
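
With the per-request ib_alloc_mr()/ib_dereg_mr() calls removed elsewhere in this patch, memory registrations now come from the generic <rdma/mr_pool.h> helpers: the pool is sized to the queue depth when the queue's IB resources are created (above), an MR is taken in the mapping path and handed back when the request is unmapped or mapping fails. A rough sketch of that data-path usage, assuming a pool initialised as in the hunk above, could be:

#include <rdma/ib_verbs.h>
#include <rdma/mr_pool.h>
#include <linux/sizes.h>

/* Sketch only: take an MR from the QP's pool, map it, return it on failure. */
static int example_map_with_pooled_mr(struct ib_qp *qp, struct scatterlist *sgl,
		int count, struct ib_mr **mrp)
{
	struct ib_mr *mr;
	int nr;

	mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
	if (!mr)
		return -EAGAIN;	/* pool is sized to the queue depth, so unexpected */

	nr = ib_map_mr_sg(mr, sgl, count, NULL, SZ_4K);
	if (nr < count) {
		ib_mr_pool_put(qp, &qp->rdma_mrs, mr);
		return nr < 0 ? nr : -EINVAL;
	}

	*mrp = mr;	/* returned with ib_mr_pool_put() when the request is unmapped */
	return 0;
}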
@@ -510,7 +508,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue->cmnd_capsule_len = sizeof(struct nvme_command);
queue->queue_size = queue_size;
- atomic_set(&queue->sig_count, 0);
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
RDMA_PS_TCP, IB_QPT_RC);
@@ -546,6 +543,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
out_destroy_cm_id:
rdma_destroy_id(queue->cm_id);
+ nvme_rdma_destroy_queue_ib(queue);
return ret;
}
@@ -756,8 +754,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
ctrl->device = ctrl->queues[0].device;
- ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ctrl->device->dev->attrs.max_fast_reg_page_list_len);
+ ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
if (new) {
ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
@@ -771,10 +768,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_free_tagset;
}
- } else {
- error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
- if (error)
- goto out_free_queue;
}
error = nvme_rdma_start_queue(ctrl, 0);
@@ -854,10 +847,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
goto out_free_tag_set;
}
} else {
- ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
- if (ret)
- goto out_free_io_queues;
-
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
}
@@ -985,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_start_queues(&ctrl->ctrl);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ /* state change failure should never happen */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
queue_work(nvme_wq, &ctrl->err_work);
@@ -1018,8 +1013,18 @@ static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
{
- if (unlikely(wc->status != IB_WC_SUCCESS))
+ struct nvme_rdma_request *req =
+ container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
+ struct request *rq = blk_mq_rq_from_pdu(req);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
+ return;
+ }
+
+ if (refcount_dec_and_test(&req->ref))
+ nvme_end_request(rq, req->status, req->result);
+
}
static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
@@ -1030,7 +1035,7 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
.opcode = IB_WR_LOCAL_INV,
.next = NULL,
.num_sge = 0,
- .send_flags = 0,
+ .send_flags = IB_SEND_SIGNALED,
.ex.invalidate_rkey = req->mr->rkey,
};
@@ -1044,22 +1049,15 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_rdma_ctrl *ctrl = queue->ctrl;
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
- int res;
if (!blk_rq_bytes(rq))
return;
- if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
- res = nvme_rdma_inv_rkey(queue, req);
- if (unlikely(res < 0)) {
- dev_err(ctrl->ctrl.device,
- "Queueing INV WR for rkey %#x failed (%d)\n",
- req->mr->rkey, res);
- nvme_rdma_error_recovery(queue->ctrl);
- }
+ if (req->mr) {
+ ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+ req->mr = NULL;
}
ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
@@ -1118,12 +1116,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
int nr;
+ req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
+ if (WARN_ON_ONCE(!req->mr))
+ return -EAGAIN;
+
/*
* Align the MR to a 4K page size to match the ctrl page size and
* the block virtual boundary.
*/
nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
if (unlikely(nr < count)) {
+ ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+ req->mr = NULL;
if (nr < 0)
return nr;
return -EINVAL;
@@ -1142,8 +1146,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
- req->mr->need_inval = true;
-
sg->addr = cpu_to_le64(req->mr->iova);
put_unaligned_le24(req->mr->length, sg->length);
put_unaligned_le32(req->mr->rkey, sg->key);
@@ -1163,7 +1165,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->num_sge = 1;
req->inline_data = false;
- req->mr->need_inval = false;
+ refcount_set(&req->ref, 2); /* send and recv completions */
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -1200,25 +1202,24 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- if (unlikely(wc->status != IB_WC_SUCCESS))
- nvme_rdma_wr_error(cq, wc, "SEND");
-}
+ struct nvme_rdma_qe *qe =
+ container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
+ struct nvme_rdma_request *req =
+ container_of(qe, struct nvme_rdma_request, sqe);
+ struct request *rq = blk_mq_rq_from_pdu(req);
-/*
- * We want to signal completion at least every queue depth/2. This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
- int limit = 1 << ilog2((queue->queue_size + 1) / 2);
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ nvme_rdma_wr_error(cq, wc, "SEND");
+ return;
+ }
- return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
+ if (refcount_dec_and_test(&req->ref))
+ nvme_end_request(rq, req->status, req->result);
}
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
- struct ib_send_wr *first, bool flush)
+ struct ib_send_wr *first)
{
struct ib_send_wr wr, *bad_wr;
int ret;
@@ -1227,31 +1228,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
sge->length = sizeof(struct nvme_command),
sge->lkey = queue->device->pd->local_dma_lkey;
- qe->cqe.done = nvme_rdma_send_done;
-
wr.next = NULL;
wr.wr_cqe = &qe->cqe;
wr.sg_list = sge;
wr.num_sge = num_sge;
wr.opcode = IB_WR_SEND;
- wr.send_flags = 0;
-
- /*
- * Unsignalled send completions are another giant desaster in the
- * IB Verbs spec: If we don't regularly post signalled sends
- * the send queue will fill up and only a QP reset will rescue us.
- * Would have been way to obvious to handle this in hardware or
- * at least the RDMA stack..
- *
- * Always signal the flushes. The magic request used for the flush
- * sequencer is not allocated in our driver's tagset and it's
- * triggered to be freed by blk_cleanup_queue(). So we need to
- * always mark it as signaled to ensure that the "wr_cqe", which is
- * embedded in request's payload, is not freed when __ib_process_cq()
- * calls wr_cqe->done().
- */
- if (nvme_rdma_queue_sig_limit(queue) || flush)
- wr.send_flags |= IB_SEND_SIGNALED;
+ wr.send_flags = IB_SEND_SIGNALED;
if (first)
first->next = &wr;
@@ -1301,6 +1283,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
return queue->ctrl->tag_set.tags[queue_idx - 1];
}
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1307,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
cmd->common.flags |= NVME_CMD_SGL_METABUF;
nvme_rdma_set_sg_null(cmd);
+ sqe->cqe.done = nvme_rdma_async_done;
+
ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
DMA_TO_DEVICE);
- ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+ ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
WARN_ON_ONCE(ret);
}
@@ -1343,14 +1333,34 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
}
req = blk_mq_rq_to_pdu(rq);
- if (rq->tag == tag)
- ret = 1;
+ req->status = cqe->status;
+ req->result = cqe->result;
- if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
- wc->ex.invalidate_rkey == req->mr->rkey)
- req->mr->need_inval = false;
+ if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
+ if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Bogus remote invalidation for rkey %#x\n",
+ req->mr->rkey);
+ nvme_rdma_error_recovery(queue->ctrl);
+ }
+ } else if (req->mr) {
+ ret = nvme_rdma_inv_rkey(queue, req);
+ if (unlikely(ret < 0)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Queueing INV WR for rkey %#x failed (%d)\n",
+ req->mr->rkey, ret);
+ nvme_rdma_error_recovery(queue->ctrl);
+ }
+ /* the local invalidation completion will end the request */
+ return 0;
+ }
+
+ if (refcount_dec_and_test(&req->ref)) {
+ if (rq->tag == tag)
+ ret = 1;
+ nvme_end_request(rq, req->status, req->result);
+ }
- nvme_end_request(rq, cqe->status, cqe->result);
return ret;
}
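
Each request now holds two references: one dropped by the SEND completion and one dropped by the response side (the CQE itself when the target performed remote invalidation, or the LOCAL_INV completion otherwise). Whichever side drops the last reference ends the request. A condensed sketch of that pattern, with illustrative names and assuming the driver's nvme.h for nvme_end_request():

#include <linux/blk-mq.h>
#include <linux/refcount.h>
#include <linux/nvme.h>

struct example_rdma_req {
	refcount_t		ref;
	__le16			status;
	union nvme_result	result;
};

/* At submission time: one reference per completion that must be observed. */
static void example_req_start(struct example_rdma_req *req)
{
	refcount_set(&req->ref, 2);	/* send completion + response path */
}

/* Called from both the SEND completion and the response/LOCAL_INV path. */
static void example_req_complete_one_side(struct request *rq,
		struct example_rdma_req *req)
{
	if (refcount_dec_and_test(&req->ref))
		nvme_end_request(rq, req->status, req->result);
}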
@@ -1591,31 +1601,11 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
* We cannot accept any other command until the Connect command has completed.
*/
static inline blk_status_t
-nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
-{
- if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
- struct nvme_command *cmd = nvme_req(rq)->cmd;
-
- if (!blk_rq_is_passthrough(rq) ||
- cmd->common.opcode != nvme_fabrics_command ||
- cmd->fabrics.fctype != nvme_fabrics_type_connect) {
- /*
- * reconnecting state means transport disruption, which
- * can take a long time and even might fail permanently,
- * fail fast to give upper layers a chance to failover.
- * deleting state means that the ctrl will never accept
- * commands again, fail it permanently.
- */
- if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING ||
- queue->ctrl->ctrl.state == NVME_CTRL_DELETING) {
- nvme_req(rq)->status = NVME_SC_ABORT_REQ;
- return BLK_STS_IOERR;
- }
- return BLK_STS_RESOURCE; /* try again later */
- }
- }
-
- return 0;
+nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
+{
+ if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags)))
+ return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
+ return BLK_STS_OK;
}
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1627,14 +1617,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_qe *sqe = &req->sqe;
struct nvme_command *c = sqe->data;
- bool flush = false;
struct ib_device *dev;
blk_status_t ret;
int err;
WARN_ON_ONCE(rq->tag < 0);
- ret = nvme_rdma_queue_is_ready(queue, rq);
+ ret = nvme_rdma_is_ready(queue, rq);
if (unlikely(ret))
return ret;
@@ -1656,13 +1645,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
goto err;
}
+ sqe->cqe.done = nvme_rdma_send_done;
+
ib_dma_sync_single_for_device(dev, sqe->dma,
sizeof(struct nvme_command), DMA_TO_DEVICE);
- if (req_op(rq) == REQ_OP_FLUSH)
- flush = true;
err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
- req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
+ req->mr ? &req->reg_wr.wr : NULL);
if (unlikely(err)) {
nvme_rdma_unmap_data(queue, rq);
goto err;
@@ -1770,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ /* state change failure should never happen */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
ret = nvme_rdma_configure_admin_queue(ctrl, false);
if (ret)
goto out_fail;
@@ -1810,7 +1805,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
- .reinit_request = nvme_rdma_reinit_request,
};
static inline bool