From 78ca40724713bd422873cb4ebee86f9f499650f7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:41 +0800 Subject: nvme: don't abort completed request in nvme_cancel_request Before aborting in-flight requests, all IO queues and their interrupts have been shut down. However, a request's completion function may not be done yet, because it can be scheduled to run via IPI. So don't abort a request if it is marked as completed; otherwise we may abort a normally completed request. Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8f3fbe5ca937..bcede8c879d1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -288,6 +288,10 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request_sync(req); return true; -- cgit v1.2.3-58-ga151 From 622b8b6893ff3096e130250c1298adf57a0cab03 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:42 +0800 Subject: nvme: wait until all completed request's complete fn is called When aborting in-flight requests to recover the controller, we have to make sure that the queue's complete function has been called on each completed request before moving on. Otherwise, for example, the warning of WARN_ON_ONCE(qp->mrs_used > 0) in ib_destroy_qp_user() may be triggered on nvme-rdma. Fix this issue by using blk_mq_tagset_wait_completed_request. 
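For illustration, the teardown ordering that this pair of patches establishes looks roughly like the following sketch; it is condensed from the per-transport hunks below and is illustrative only (quiescing details, admin queues and error handling omitted):

        /* sketch of the recovery ordering, not a verbatim excerpt of one transport */
        nvme_stop_queues(&ctrl->ctrl);                  /* quiesce: no new submissions */
        blk_mq_tagset_busy_iter(&ctrl->tag_set,
                        nvme_cancel_request, &ctrl->ctrl);      /* abort in-flight requests */
        blk_mq_tagset_wait_completed_request(&ctrl->tag_set);   /* wait for ->complete() to finish */
        /*
         * Only now is it safe to tear down resources that the completion
         * path still uses, e.g. the RDMA QP whose in-use MRs would
         * otherwise trigger the WARN_ON_ONCE() mentioned above.
         */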
Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 2 ++ drivers/nvme/host/pci.c | 2 ++ drivers/nvme/host/rdma.c | 8 ++++++-- drivers/nvme/host/tcp.c | 8 ++++++-- drivers/nvme/target/loop.c | 2 ++ 5 files changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 232d8094091b..f39ed8cc23a2 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2774,6 +2774,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); } /* @@ -2796,6 +2797,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); /* kill the aens as they are a separate path */ nvme_fc_abort_aen_ops(ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db160cee42ad..45a80b708ef4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2401,6 +2401,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_wait_completed_request(&dev->tagset); + blk_mq_tagset_wait_completed_request(&dev->admin_tagset); /* * The driver will not be starting up queues again if shutting down so diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a249db528d54..b313a60be1ca 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -901,9 +901,11 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); - if (ctrl->ctrl.admin_tagset) + if (ctrl->ctrl.admin_tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); + } blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -914,9 +916,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); - if (ctrl->ctrl.tagset) + if (ctrl->ctrl.tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); + } if (remove) nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 606b13d35d16..cf2eaf834b36 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1748,9 +1748,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); - if (ctrl->admin_tagset) + if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); + } blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1762,9 +1764,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, return; 
nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); - if (ctrl->tagset) + if (ctrl->tagset) { blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->tagset); + } if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b16dc3981c69..95c8f1732215 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -407,6 +407,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); nvme_loop_destroy_io_queues(ctrl); } @@ -416,6 +417,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_loop_destroy_admin_queue(ctrl); } -- cgit v1.2.3-58-ga151 From a87ccce0b5a06ee546931859fa62e10f1bce54f9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:43 +0800 Subject: blk-mq: remove blk_mq_complete_request_sync blk_mq_tagset_wait_completed_request() is now used to wait for each completed request's completion handler, so there is no need for blk_mq_complete_request_sync() any more. Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ------- drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8bb5854a62f3..6968de9d7402 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -652,13 +652,6 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -void blk_mq_complete_request_sync(struct request *rq) -{ - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); - rq->q->mq_ops->complete(rq); -} -EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync); - int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bcede8c879d1..4ba374633dc8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -293,7 +293,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) return true; nvme_req(req)->status = NVME_SC_ABORT_REQ; - blk_mq_complete_request_sync(req); + blk_mq_complete_request(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ee0719b649b6..1cdd2788cfa6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -305,7 +305,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); -void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3-58-ga151 From 98d87f70f4ab84b9e50e16b7848937ae07518cd4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 31 Jul 2019 
11:41:33 +0200 Subject: lightnvm: remove nvm_submit_io_sync_fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the redundant sync handling interface and wait for a completion in the lightnvm core instead. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Hans Holmberg Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 35 +++++++++++++++++++++++++++++------ drivers/nvme/host/lightnvm.c | 29 ----------------------------- include/linux/lightnvm.h | 2 -- 3 files changed, 29 insertions(+), 37 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a600934fdd9c..01d098fb96ac 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -752,12 +752,36 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); +static void nvm_sync_end_io(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->private; + + complete(waiting); +} + +static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int ret = 0; + + rqd->end_io = nvm_sync_end_io; + rqd->private = &wait; + + ret = dev->ops->submit_io(dev, rqd); + if (ret) + return ret; + + wait_for_completion_io(&wait); + + return 0; +} + int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { struct nvm_dev *dev = tgt_dev->parent; int ret; - if (!dev->ops->submit_io_sync) + if (!dev->ops->submit_io) return -ENODEV; nvm_rq_tgt_to_dev(tgt_dev, rqd); @@ -765,9 +789,7 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) rqd->dev = tgt_dev; rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io_sync(dev, rqd); - nvm_rq_dev_to_tgt(tgt_dev, rqd); + ret = nvm_submit_io_wait(dev, rqd); return ret; } @@ -788,12 +810,13 @@ EXPORT_SYMBOL(nvm_end_io); static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) { - if (!dev->ops->submit_io_sync) + if (!dev->ops->submit_io) return -ENODEV; + rqd->dev = NULL; rqd->flags = nvm_set_flags(&dev->geo, rqd); - return dev->ops->submit_io_sync(dev, rqd); + return nvm_submit_io_wait(dev, rqd); } static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba009d4c9dfa..d6f121452d5d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -690,34 +690,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - struct request_queue *q = dev->q; - struct request *rq; - struct nvme_nvm_command cmd; - int ret = 0; - - memset(&cmd, 0, sizeof(struct nvme_nvm_command)); - - rq = nvme_nvm_alloc_request(q, rqd, &cmd); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - /* I/Os can fail and the error is signaled through rqd. Callers must - * handle the error accordingly. 
- */ - blk_execute_rq(q, NULL, rq, 0); - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - - blk_mq_free_request(rq); - - return ret; -} - static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, int size) { @@ -754,7 +726,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .get_chk_meta = nvme_nvm_get_chk_meta, .submit_io = nvme_nvm_submit_io, - .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4d0d5655c7b2..8891647b24b1 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -89,7 +89,6 @@ typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, struct nvm_chk_meta *); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -104,7 +103,6 @@ struct nvm_dev_ops { nvm_get_chk_meta_fn *get_chk_meta; nvm_submit_io_fn *submit_io; - nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; -- cgit v1.2.3-58-ga151 From 48e5da725581c1f7444e45cccbafc33e11430b48 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 31 Jul 2019 11:41:34 +0200 Subject: lightnvm: move metadata mapping to lower level driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that blk_rq_map_kern can map both kmem and vmem, move internal metadata mapping down to the lower level driver. 
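As a condensed illustration of the resulting calling convention: the buffer now travels with the request and is mapped by the nvme driver. Both fragments below are taken from hunks in the diff that follows, shown here side by side:

        /* pblk side: hand over the raw metadata buffer instead of building a bio */
        ret = pblk_submit_io_sync(pblk, &rqd, line->smeta);

        /* nvme side: map the buffer onto the request, if one was passed */
        if (buf) {
                ret = blk_rq_map_kern(q, rq, buf,
                                geo->csecs * rqd->nr_ppas, GFP_KERNEL);
                if (ret)
                        goto err_free_cmd;
        }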
Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Hans Holmberg Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 16 +++--- drivers/lightnvm/pblk-core.c | 113 +++++---------------------------------- drivers/lightnvm/pblk-read.c | 22 ++------ drivers/lightnvm/pblk-recovery.c | 39 ++------------ drivers/lightnvm/pblk-write.c | 20 ++----- drivers/lightnvm/pblk.h | 8 +-- drivers/nvme/host/lightnvm.c | 20 +++++-- include/linux/lightnvm.h | 6 +-- 8 files changed, 54 insertions(+), 190 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 01d098fb96ac..3cd03582a2ed 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -731,7 +731,7 @@ static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) return flags; } -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) { struct nvm_dev *dev = tgt_dev->parent; int ret; @@ -745,7 +745,7 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd); + ret = dev->ops->submit_io(dev, rqd, buf); if (ret) nvm_rq_dev_to_tgt(tgt_dev, rqd); return ret; @@ -759,7 +759,8 @@ static void nvm_sync_end_io(struct nvm_rq *rqd) complete(waiting); } -static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd) +static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, + void *buf) { DECLARE_COMPLETION_ONSTACK(wait); int ret = 0; @@ -767,7 +768,7 @@ static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd) rqd->end_io = nvm_sync_end_io; rqd->private = &wait; - ret = dev->ops->submit_io(dev, rqd); + ret = dev->ops->submit_io(dev, rqd, buf); if (ret) return ret; @@ -776,7 +777,8 @@ static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + void *buf) { struct nvm_dev *dev = tgt_dev->parent; int ret; @@ -789,7 +791,7 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) rqd->dev = tgt_dev; rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - ret = nvm_submit_io_wait(dev, rqd); + ret = nvm_submit_io_wait(dev, rqd, buf); return ret; } @@ -816,7 +818,7 @@ static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) rqd->dev = NULL; rqd->flags = nvm_set_flags(&dev->geo, rqd); - return nvm_submit_io_wait(dev, rqd); + return nvm_submit_io_wait(dev, rqd, NULL); } static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f546e6f28b8a..a58d3c84a3f2 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -507,7 +507,7 @@ void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) pblk->sec_per_write = sec_per_write; } -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) { struct nvm_tgt_dev *dev = pblk->dev; @@ -518,7 +518,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_ERR; #endif - return nvm_submit_io(dev, rqd); + return nvm_submit_io(dev, rqd, buf); } void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) @@ -541,7 +541,7 @@ void 
pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) } } -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) { struct nvm_tgt_dev *dev = pblk->dev; int ret; @@ -553,7 +553,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_ERR; #endif - ret = nvm_submit_io_sync(dev, rqd); + ret = nvm_submit_io_sync(dev, rqd, buf); if (trace_pblk_chunk_state_enabled() && !ret && rqd->opcode == NVM_OP_PWRITE) @@ -562,65 +562,19 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) return ret; } -int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd) +static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, + void *buf) { struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); int ret; pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd, buf); pblk_up_chunk(pblk, ppa_list[0]); return ret; } -static void pblk_bio_map_addr_endio(struct bio *bio) -{ - bio_put(bio); -} - -struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, - unsigned int nr_secs, unsigned int len, - int alloc_type, gfp_t gfp_mask) -{ - struct nvm_tgt_dev *dev = pblk->dev; - void *kaddr = data; - struct page *page; - struct bio *bio; - int i, ret; - - if (alloc_type == PBLK_KMALLOC_META) - return bio_map_kern(dev->q, kaddr, len, gfp_mask); - - bio = bio_kmalloc(gfp_mask, nr_secs); - if (!bio) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < nr_secs; i++) { - page = vmalloc_to_page(kaddr); - if (!page) { - pblk_err(pblk, "could not map vmalloc bio\n"); - bio_put(bio); - bio = ERR_PTR(-ENOMEM); - goto out; - } - - ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - bio_put(bio); - bio = ERR_PTR(-ENOMEM); - goto out; - } - - kaddr += PAGE_SIZE; - } - - bio->bi_end_io = pblk_bio_map_addr_endio; -out: - return bio; -} - int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush, bool skip_meta) { @@ -722,9 +676,7 @@ u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) { - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_meta *lm = &pblk->lm; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; u64 paddr = pblk_line_smeta_start(pblk, line); @@ -736,16 +688,6 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) if (ret) return ret; - bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto clear_rqd; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - - rqd.bio = bio; rqd.opcode = NVM_OP_PREAD; rqd.nr_ppas = lm->smeta_sec; rqd.is_seq = 1; @@ -754,10 +696,9 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) for (i = 0; i < lm->smeta_sec; i++, paddr++) ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); if (ret) { pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - bio_put(bio); goto clear_rqd; } @@ -776,9 +717,7 @@ clear_rqd: static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, u64 paddr) { - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_meta *lm = &pblk->lm; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; __le64 
*lba_list = emeta_to_lbas(pblk, line->emeta->buf); @@ -791,16 +730,6 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, if (ret) return ret; - bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto clear_rqd; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd.bio = bio; rqd.opcode = NVM_OP_PWRITE; rqd.nr_ppas = lm->smeta_sec; rqd.is_seq = 1; @@ -814,10 +743,9 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, meta->lba = lba_list[paddr] = addr_empty; } - ret = pblk_submit_io_sync_sem(pblk, &rqd); + ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); if (ret) { pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - bio_put(bio); goto clear_rqd; } @@ -838,10 +766,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; void *ppa_list_buf, *meta_list; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; u64 paddr = line->emeta_ssec; @@ -867,17 +793,6 @@ next_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); rq_len = rq_ppas * geo->csecs; - bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, - l_mg->emeta_alloc_type, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto free_rqd_dma; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - - rqd.bio = bio; rqd.meta_list = meta_list; rqd.ppa_list = ppa_list_buf; rqd.dma_meta_list = dma_meta_list; @@ -896,7 +811,6 @@ next_rq: while (test_bit(pos, line->blk_bitmap)) { paddr += min; if (pblk_boundary_paddr_checks(pblk, paddr)) { - bio_put(bio); ret = -EINTR; goto free_rqd_dma; } @@ -906,7 +820,6 @@ next_rq: } if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - bio_put(bio); ret = -EINTR; goto free_rqd_dma; } @@ -915,10 +828,9 @@ next_rq: ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); } - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); if (ret) { pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - bio_put(bio); goto free_rqd_dma; } @@ -963,7 +875,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, NULL); rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); @@ -1792,7 +1704,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. 
*/ - err = pblk_submit_io(pblk, rqd); + err = pblk_submit_io(pblk, rqd, NULL); if (err) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -1923,7 +1835,6 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; unsigned int lba_list_size = lm->emeta_len[2]; struct pblk_w_err_gc *w_err_gc = line->w_err_gc; struct pblk_emeta *emeta = line->emeta; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d98ea392fe33..d572d4559e4e 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -342,7 +342,7 @@ split_retry: bio_put(int_bio); int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); goto split_retry; - } else if (pblk_submit_io(pblk, rqd)) { + } else if (pblk_submit_io(pblk, rqd, NULL)) { /* Submitting IO to drive failed, let's report an error */ rqd->error = -ENODEV; pblk_end_io_read(rqd); @@ -419,7 +419,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct bio *bio; struct nvm_rq rqd; int data_len; int ret = NVM_IO_OK; @@ -447,25 +446,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) goto out; data_len = (gc_rq->secs_to_gc) * geo->csecs; - bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, - PBLK_VMALLOC_META, GFP_KERNEL); - if (IS_ERR(bio)) { - pblk_err(pblk, "could not allocate GC bio (%lu)\n", - PTR_ERR(bio)); - ret = PTR_ERR(bio); - goto err_free_dma; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - rqd.opcode = NVM_OP_PREAD; rqd.nr_ppas = gc_rq->secs_to_gc; - rqd.bio = bio; - if (pblk_submit_io_sync(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { ret = -EIO; - goto err_free_bio; + goto err_free_dma; } pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); @@ -489,8 +475,6 @@ out: pblk_free_rqd_meta(pblk, &rqd); return ret; -err_free_bio: - bio_put(bio); err_free_dma: pblk_free_rqd_meta(pblk, &rqd); return ret; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e6dda04de144..d5e210c3c5b7 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -178,12 +178,11 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, void *meta_list; struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; - struct bio *bio; struct ppa_addr *ppa_list; void *data; __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas, rq_len; + int left_line_ppas, rq_ppas; int i, j; int ret = 0; @@ -212,28 +211,15 @@ next_pad_rq: goto fail_complete; } - rq_len = rq_ppas * geo->csecs; - - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, - PBLK_VMALLOC_META, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto fail_complete; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); ret = pblk_alloc_rqd_meta(pblk, rqd); if (ret) { pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - bio_put(bio); goto fail_complete; } - rqd->bio = bio; + rqd->bio = NULL; rqd->opcode = NVM_OP_PWRITE; rqd->is_seq = 1; rqd->nr_ppas = rq_ppas; @@ -275,13 +261,12 @@ next_pad_rq: kref_get(&pad_rq->ref); pblk_down_chunk(pblk, ppa_list[0]); - ret = 
pblk_submit_io(pblk, rqd); + ret = pblk_submit_io(pblk, rqd, data); if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); pblk_up_chunk(pblk, ppa_list[0]); kref_put(&pad_rq->ref, pblk_recov_complete); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - bio_put(bio); goto fail_complete; } @@ -375,7 +360,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa_list; void *meta_list; struct nvm_rq *rqd; - struct bio *bio; void *data; dma_addr_t dma_ppa_list, dma_meta_list; __le64 *lba_list; @@ -407,15 +391,7 @@ next_rq: rq_len = rq_ppas * geo->csecs; retry_rq: - bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bio_get(bio); - - rqd->bio = bio; + rqd->bio = NULL; rqd->opcode = NVM_OP_PREAD; rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; @@ -445,10 +421,9 @@ retry_rq: addr_to_gen_ppa(pblk, paddr + j, line->id); } - ret = pblk_submit_io_sync(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd, data); if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); - bio_put(bio); return ret; } @@ -460,24 +435,20 @@ retry_rq: if (padded) { pblk_log_read_err(pblk, rqd); - bio_put(bio); return -EINTR; } pad_distance = pblk_pad_distance(pblk, line); ret = pblk_recov_pad_line(pblk, line, pad_distance); if (ret) { - bio_put(bio); return ret; } padded = true; - bio_put(bio); goto retry_rq; } pblk_get_packed_meta(pblk, rqd); - bio_put(bio); for (i = 0; i < rqd->nr_ppas; i++) { struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 4e63f9b5954c..b9a2aeba95ab 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -373,7 +373,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) struct pblk_emeta *emeta = meta_line->emeta; struct ppa_addr *ppa_list; struct pblk_g_ctx *m_ctx; - struct bio *bio; struct nvm_rq *rqd; void *data; u64 paddr; @@ -391,20 +390,9 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) rq_len = rq_ppas * geo->csecs; data = ((void *)emeta->buf) + emeta->mem; - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, - l_mg->emeta_alloc_type, GFP_KERNEL); - if (IS_ERR(bio)) { - pblk_err(pblk, "failed to map emeta io"); - ret = PTR_ERR(bio); - goto fail_free_rqd; - } - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd->bio = bio; - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); if (ret) - goto fail_free_bio; + goto fail_free_rqd; ppa_list = nvm_rq_to_ppa_list(rqd); for (i = 0; i < rqd->nr_ppas; ) { @@ -423,7 +411,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io(pblk, rqd, data); if (ret) { pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); goto fail_rollback; @@ -437,8 +425,6 @@ fail_rollback: pblk_dealloc_page(pblk, meta_line, rq_ppas); list_add(&meta_line->list, &meta_line->list); spin_unlock(&l_mg->close_lock); -fail_free_bio: - bio_put(bio); fail_free_rqd: pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); return ret; @@ -523,7 +509,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) meta_line = pblk_should_submit_meta_io(pblk, rqd); /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); + err = pblk_submit_io(pblk, 
rqd, NULL); if (err) { pblk_err(pblk, "data I/O submission failed: %d\n", err); return NVM_IO_ERR; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index a67855387f53..d515d3409a74 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -783,14 +783,10 @@ struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, struct ppa_addr ppa); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, - unsigned int nr_secs, unsigned int len, - int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); struct pblk_line *pblk_line_replace_data(struct pblk *pblk); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d6f121452d5d..ec46693f6b64 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, return rq; } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, + void *buf) { + struct nvm_geo *geo = &dev->geo; struct request_queue *q = dev->q; struct nvme_nvm_command *cmd; struct request *rq; + int ret; cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); if (!cmd) @@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_nvm_alloc_request(q, rqd, cmd); if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); + ret = PTR_ERR(rq); + goto err_free_cmd; + } + + if (buf) { + ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, + GFP_KERNEL); + if (ret) + goto err_free_cmd; } rq->end_io_data = rqd; @@ -688,6 +698,10 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); return 0; + +err_free_cmd: + kfree(cmd); + return ret; } static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8891647b24b1..ee8ec2e68055 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -88,7 +88,7 @@ typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, struct nvm_chk_meta *); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -680,8 +680,8 @@ extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, int, struct nvm_chk_meta *); extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr 
*, int, int); -extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *); extern void nvm_end_io(struct nvm_rq *); #else /* CONFIG_NVM */ -- cgit v1.2.3-58-ga151 From 6be182607db98f9672865fed3ee3bf8b98e62dd4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 19 Jul 2019 12:46:46 -0700 Subject: nvme-tcp: cleanup nvme_tcp_recv_pdu We can return directly from the switch statement. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index cf2eaf834b36..2c5df86882cc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -608,23 +608,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, switch (hdr->type) { case nvme_tcp_c2h_data: - ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); case nvme_tcp_rsp: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); case nvme_tcp_r2t: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: dev_err(queue->ctrl->ctrl.device, "unsupported pdu type (%d)\n", hdr->type); return -EINVAL; } - - return ret; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) -- cgit v1.2.3-58-ga151 From 10407ec9b42d30a6ebc49f7f84e2bb2131438699 Mon Sep 17 00:00:00 2001 From: Potnuri Bharat Teja Date: Mon, 8 Jul 2019 15:22:00 +0530 Subject: nvme-tcp: Use protocol specific operations while reading socket Using the socket-specific read_sock() call instead of calling tcp_read_sock() directly allows any handlers registered by LLD modules to be invoked from the nvme-tcp host. This patch therefore replaces tcp_read_sock() with the socket-specific prot_ops. Signed-off-by: Potnuri Bharat Teja Acked-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2c5df86882cc..53c32f9bba08 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1018,14 +1018,15 @@ done: static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) { - struct sock *sk = queue->sock->sk; + struct socket *sock = queue->sock; + struct sock *sk = sock->sk; read_descriptor_t rd_desc; int consumed; rd_desc.arg.data = queue; rd_desc.count = 1; lock_sock(sk); - consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); return consumed; } -- cgit v1.2.3-58-ga151 From 4fba445828fc047c095a3a955b4ceac62cd56964 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 22 Jul 2019 17:06:52 -0700 Subject: nvme: have nvme_init_identify set ctrl->cap No need to use a stack cap variable. 
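The change boils down to caching the CAP register on the controller instead of on the stack, so later code can reuse it; roughly, condensed from the diff that follows:

        /* before: a stack variable that is gone after nvme_init_identify() */
        u64 cap;
        ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);

        /* after: cached on the controller struct for any later user of ctrl->cap */
        ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
        page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;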
Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4ba374633dc8..f325e9516849 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2562,7 +2562,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; - u64 cap; int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; @@ -2573,15 +2572,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); if (ret) { dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); return ret; } - page_shift = NVME_CAP_MPSMIN(cap) + 12; + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(cap); + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); ret = nvme_identify_ctrl(ctrl, &id); if (ret) { -- cgit v1.2.3-58-ga151 From aa22c8e6650d29a00196087caa2bbb32dc6117bc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 22 Aug 2019 10:51:17 -0700 Subject: nvme-pci: set ctrl sqsize to the device q_depth Align with what the rest of the transports are doing. Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 45a80b708ef4..530104d20506 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2314,6 +2314,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; -- cgit v1.2.3-58-ga151 From c0f2f45be2976abe973c8cd544f38e2d928771b0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 22 Jul 2019 17:06:53 -0700 Subject: nvme: move sqsize setting to the core nvme_enable_ctrl reads the cap register right after, so no need to do that locally in the transport driver. Have sqsize setting in nvme_init_identify. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 20 +++++++++++--------- drivers/nvme/host/fc.c | 12 +----------- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 13 +------------ drivers/nvme/host/tcp.c | 11 +---------- drivers/nvme/target/loop.c | 12 +----------- 7 files changed, 17 insertions(+), 55 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f325e9516849..9e832694f9d0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1968,16 +1968,23 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { /* * Default to a 4K page size, with the intention to update this * path in the future to accomodate architectures with differing * kernel and IO page sizes. 
*/ - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + unsigned dev_page_min, page_shift = 12; int ret; + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; + if (page_shift < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", @@ -1996,7 +2003,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, cap, true); + return nvme_wait_ready(ctrl, ctrl->cap, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -2571,13 +2578,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret; } - - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (ret) { - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); - return ret; - } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f39ed8cc23a2..ec264b2e54c3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2648,17 +2648,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * prior connection values */ - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); - if (ret) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_disconnect_admin_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + ret = nvme_enable_ctrl(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 26b563f9985b..26540feed511 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -427,7 +427,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 530104d20506..cc5787e5b451 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1695,7 +1695,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_enable_ctrl(&dev->ctrl); if (result) return result; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b313a60be1ca..09ab05c458a8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -803,18 +803,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_cleanup_queue; - error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, - &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } 
- - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_stop_queue; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 53c32f9bba08..088dac0d97c4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1707,16 +1707,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_queue; - error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (error) { - dev_err(ctrl->device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } - - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - - error = nvme_enable_ctrl(ctrl, ctrl->cap); + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 95c8f1732215..ec0bc57d26fc 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -369,17 +369,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_cleanup_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_cleanup_queue; -- cgit v1.2.3-58-ga151 From b5b0504878b884fb38e8983f1637a7be1d07fad3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 22 Jul 2019 17:06:54 -0700 Subject: nvme: don't pass cap to nvme_disable_ctrl All seem to call it with ctrl->cap so no need to pass it at all. Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9e832694f9d0..35311d343a13 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1950,7 +1950,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) * bits', but doing so may cause the device to complete commands to the * admin queue ... and we don't know what memory that might be pointing at! 
*/ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl) { int ret; @@ -1964,7 +1964,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, cap, false); + return nvme_wait_ready(ctrl, ctrl->cap, false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 26540feed511..7c86e4bcd271 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -426,7 +426,7 @@ void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cc5787e5b451..bf54b128c5a4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1403,7 +1403,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else - nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + nvme_disable_ctrl(&dev->ctrl); nvme_poll_irqdisable(nvmeq, -1); } @@ -1679,7 +1679,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_disable_ctrl(&dev->ctrl); if (result < 0) return result; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 09ab05c458a8..10e3bcdb7180 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1859,7 +1859,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else - nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + nvme_disable_ctrl(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 088dac0d97c4..0317721fe858 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1886,7 +1886,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) if (shutdown) nvme_shutdown_ctrl(ctrl); else - nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_disable_ctrl(ctrl); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } -- cgit v1.2.3-58-ga151 From 79fd751d61aa8b9979d51357236890d69989ba04 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 14 Jul 2019 17:18:42 +0900 Subject: nvme: tcp: selects CRYPTO_CRC32C for nvme-tcp The tcp host module now uses these APIs from crypto ahash: (1) crypto_ahash_final() (2) crypto_ahash_digest() (3) crypto_alloc_ahash() nvme-tcp should therefore depend on CRYPTO_CRC32C. 
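The dependency exists because nvme-tcp computes its header and data digests with a crc32c ahash transform; a minimal sketch of the kind of allocation that fails when no CRC32C provider is built in (simplified, not the exact driver code):

        struct crypto_ahash *tfm;

        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);    /* no crc32c implementation available */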
Cc: Christoph Hellwig Cc: Keith Busch Cc: Jens Axboe Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Minwoo Im Signed-off-by: Sagi Grimberg --- drivers/nvme/host/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index ec43ac9199e2..2b36f052bfb9 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -64,6 +64,7 @@ config NVME_TCP depends on INET depends on BLK_DEV_NVME select NVME_FABRICS + select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using the TCP transport. This allows you to use remote block devices -- cgit v1.2.3-58-ga151 From 1a9460cef571100487cde9c44110a5afff2e9ea2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 3 Jul 2019 14:08:04 -0700 Subject: nvme-tcp: support simple polling Simple polling support via socket busy_poll interface. Although we do not shutdown interrupts but simply hammer the socket poll, we can sometimes find completions faster than the normal interrupt driven RX path. We add per queue nr_cqe counter that resets every time RX path is invoked such that .poll callback can return it to stay consistent with the semantics. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 51 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0317721fe858..1e2e5ab3875f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "nvme.h" #include "fabrics.h" @@ -72,6 +73,7 @@ struct nvme_tcp_queue { int pdu_offset; size_t data_remaining; size_t ddgst_remaining; + unsigned int nr_cqe; /* send state */ struct nvme_tcp_request *request; @@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, } nvme_end_request(rq, cqe->status, cqe->result); + queue->nr_cqe++; return 0; } @@ -696,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { - if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; + } nvme_tcp_init_recv_ctx(queue); } } @@ -737,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -1026,6 +1032,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) rd_desc.arg.data = queue; rd_desc.count = 1; lock_sock(sk); + queue->nr_cqe = 0; consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); return consumed; @@ -1367,6 +1374,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; queue->sock->sk->sk_state_change = nvme_tcp_state_change; queue->sock->sk->sk_write_space = nvme_tcp_write_space; + queue->sock->sk->sk_ll_usec = 1; write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1465,7 +1473,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = 2 /* default + read */; + set->nr_maps = 
nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -1564,6 +1572,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); return nr_io_queues; } @@ -1595,6 +1604,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, min(opts->nr_io_queues, nr_io_queues); nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } } static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) @@ -2142,14 +2157,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + dev_info(ctrl->ctrl.device, - "mapped %d/%d default/read queues.\n", + "mapped %d/%d/%d default/read/poll queues.\n", ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ]); + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); return 0; } +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; + + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue)) + sk_busy_loop(sk, true); + nvme_tcp_try_recv(queue); + return queue->nr_cqe; +} + static struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .complete = nvme_complete_rq, @@ -2158,6 +2195,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = { .init_hctx = nvme_tcp_init_hctx, .timeout = nvme_tcp_timeout, .map_queues = nvme_tcp_map_queues, + .poll = nvme_tcp_poll, }; static struct blk_mq_ops nvme_tcp_admin_mq_ops = { @@ -2211,7 +2249,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); ctrl->ctrl.opts = opts; - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -2305,7 +2344,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | - NVMF_OPT_NR_WRITE_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, .create_ctrl = nvme_tcp_create_ctrl, }; -- cgit v1.2.3-58-ga151 From 3bec2e3754becebd4c452999adb49bc62c575ea4 Mon Sep 17 00:00:00 2001 From: Tom Wu Date: Thu, 8 Aug 2019 02:22:36 +0000 Subject: nvmet: fix data units read and written counters in SMART log In nvme spec 1.3 there is a definition for data write/read counters from SMART log, (See section 5.14.1.2): This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes read) and is rounded up. 
However, the nvme target reports the value in actual 512-byte units, not in thousands of units as the spec requires. Signed-off-by: Tom Wu Reviewed-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 4dc12ea52f23..51800a9ce9a9 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -81,9 +81,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, goto out; host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, + sectors[READ]), 1000); host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, + sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -111,11 +113,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, if (!ns->bdev) continue; host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read += - part_stat_read(ns->bdev->bd_part, sectors[READ]); + data_units_read += DIV_ROUND_UP( + part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written += - part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + data_units_written += DIV_ROUND_UP( + part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); } rcu_read_unlock(); -- cgit v1.2.3-58-ga151 From 177b06ed0997bdaa9671c615a37649ab9aec889e Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 4 Aug 2019 16:50:49 +0900 Subject: nvme: trace: parse Get LBA Status command in detail Four different fields are carried in the CDWs of the Get LBA Status command, so it is useful to decode them in detail when tracing.
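The four fields sit at fixed little-endian offsets within CDW10..CDW13, exactly as the tracer below unpacks them. A standalone model of that unpacking (memcpy-based stand-ins replace the kernel's get_unaligned_le helpers, and the test values are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* stand-ins for get_unaligned_le{16,32,64}; assumes a little-endian host */
static uint64_t get_le64(const uint8_t *p) { uint64_t v; memcpy(&v, p, sizeof(v)); return v; }
static uint32_t get_le32(const uint8_t *p) { uint32_t v; memcpy(&v, p, sizeof(v)); return v; }
static uint16_t get_le16(const uint8_t *p) { uint16_t v; memcpy(&v, p, sizeof(v)); return v; }

int main(void)
{
        uint8_t cdw10[16] = { 0 };      /* CDW10..CDW13 as one byte stream */

        cdw10[0]  = 0x2a;               /* SLBA  = 0x2a (CDW10-11)     */
        cdw10[8]  = 0x08;               /* MNDW  = 8    (CDW12)        */
        cdw10[12] = 0x04;               /* RL    = 4    (CDW13[15:0])  */
        cdw10[15] = 0x10;               /* ATYPE = 16   (CDW13[31:24]) */

        printf("slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u\n",
               (unsigned long long)get_le64(cdw10), get_le32(cdw10 + 8),
               get_le16(cdw10 + 12), cdw10[15]);
        return 0;
}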
Signed-off-by: Minwoo Im Signed-off-by: Sagi Grimberg --- drivers/nvme/host/trace.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 9778eb0406b3..5c3cb6928f3c 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, return ret; } +static const char *nvme_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_admin_identify(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvme_trace_get_lba_status(p, cdw10); default: return nvme_trace_common(p, cdw10); } -- cgit v1.2.3-58-ga151 From 42df26d4df7b4437db7d3847c36abc3e5aa237f1 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 4 Aug 2019 16:50:50 +0900 Subject: nvmet: trace: parse Get LBA Status command in detail Four different fields are in CDWs of Get LBA Status command which means it would be great if we can see in detail when tracing in target side also. Signed-off-by: Minwoo Im Signed-off-by: Sagi Grimberg --- drivers/nvme/target/trace.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 6af11d493271..1373a3c67962 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, return ret; } +static const char *nvmet_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, return nvmet_trace_admin_identify(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvmet_trace_get_lba_status(p, cdw10); default: return nvmet_trace_common(p, cdw10); } -- cgit v1.2.3-58-ga151 From b627200762c7e8153fe1620fdd52a68f4ca2f8a5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 20:23:38 -0700 Subject: nvmet-tcp: fix possible NULL deref We must only call sgl_free for sgl that we actually allocated. 
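The rule the fix enforces is that every free site must mirror the condition under which the allocation happened. A standalone model of the pattern, with malloc/free standing in for the kernel's sgl_alloc/sgl_free:

#include <stdlib.h>

struct cmd {
        void *sg;       /* stand-in for cmd->req.sg */
        int sg_cnt;     /* stand-in for cmd->req.sg_cnt */
};

/* allocation only happens when there is data to map... */
static int map_data(struct cmd *cmd, size_t len)
{
        if (!len) {
                cmd->sg_cnt = 0;
                return 0;               /* nothing allocated */
        }
        cmd->sg = malloc(len);          /* stands in for sgl_alloc() */
        if (!cmd->sg)
                return -1;
        cmd->sg_cnt = 1;
        return 0;
}

/* ...so every release path must check the same condition */
static void finish_cmd(struct cmd *cmd)
{
        if (cmd->sg_cnt)
                free(cmd->sg);          /* stands in for sgl_free() */
}

int main(void)
{
        struct cmd c = { 0 };

        map_data(&c, 0);        /* no data: nothing is allocated */
        finish_cmd(&c);         /* safe: the guard skips the free */
        return 0;
}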
Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 69b83fa0c76c..0d63f3da0117 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) return 0; err: - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); return NVME_SC_INTERNAL; } @@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) if (queue->nvme_sq.sqhd_disabled) { kfree(cmd->iov); - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); } return 1; @@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, return -EAGAIN; kfree(cmd->iov); - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); return 1; @@ -1306,7 +1309,8 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) { nvmet_req_uninit(&cmd->req); nvmet_tcp_unmap_pdu_iovec(cmd); - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); } static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) -- cgit v1.2.3-58-ga151 From 35d1a938dcdaeb8e1d860f061a0cd11f67f42774 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 20:29:11 -0700 Subject: nvmet-tcp: fix possible memory leak when we uninit a command in error flow we also need to free an iovec if it was allocated. Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 0d63f3da0117..76e43750b9e5 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1309,6 +1309,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) { nvmet_req_uninit(&cmd->req); nvmet_tcp_unmap_pdu_iovec(cmd); + kfree(cmd->iov); if (cmd->req.sg_cnt) sgl_free(cmd->req.sg); } -- cgit v1.2.3-58-ga151 From 52b4451a9e5ae1e9ae739db16e8af61c77805389 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 18 Aug 2019 12:08:51 +0300 Subject: nvme-fabrics: Add type of service (TOS) configuration TOS is user-defined and needs to be configured via nvme-cli. It must be set before initiating any traffic and once set the TOS cannot be changed. 
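A standalone model of the validation the parser below applies to the user-supplied value: negative values are rejected and values above 255 are clamped, since the IP TOS field is a single byte:

#include <stdio.h>

static int parse_tos(int token, int *tos)
{
        if (token < 0) {
                fprintf(stderr, "Invalid type of service %d\n", token);
                return -1;
        }
        if (token > 255) {
                fprintf(stderr, "Clamping type of service to 255\n");
                token = 255;
        }
        *tos = token;
        return 0;
}

int main(void)
{
        int tos = -1;   /* < 0 == use transport default, as in the patch */

        if (parse_tos(300, &tos) == 0)
                printf("tos=%d\n", tos);        /* clamps to 255 */
        return 0;
}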
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 18 ++++++++++++++++++ drivers/nvme/host/fabrics.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 1994d5b42f94..854ce75e6c2d 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -611,6 +611,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_DATA_DIGEST, "data_digest" }, { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, + { NVMF_OPT_TOS, "tos=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->duplicate_connect = false; opts->hdr_digest = false; opts->data_digest = false; + opts->tos = -1; /* < 0 == use transport default */ options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -856,6 +858,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->nr_poll_queues = token; break; + case NVMF_OPT_TOS: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < 0) { + pr_err("Invalid type of service %d\n", token); + ret = -EINVAL; + goto out; + } + if (token > 255) { + pr_warn("Clamping type of service to 255\n"); + token = 255; + } + opts->tos = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 3044d8b99a24..93f08d77c896 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -55,6 +55,7 @@ enum { NVMF_OPT_DATA_DIGEST = 1 << 16, NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, + NVMF_OPT_TOS = 1 << 19, }; /** @@ -87,6 +88,7 @@ enum { * @data_digest: generate/verify data digest (TCP) * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O + * @tos: type of service */ struct nvmf_ctrl_options { unsigned mask; @@ -108,6 +110,7 @@ struct nvmf_ctrl_options { bool data_digest; unsigned int nr_write_queues; unsigned int nr_poll_queues; + int tos; }; /* -- cgit v1.2.3-58-ga151 From e63440d6a3134f7ae74bfb00bfc01db3efb8d3aa Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 18 Aug 2019 12:08:52 +0300 Subject: nvme-rdma: Add TOS for rdma transport For RDMA transports, TOS is an extension of IB QoS that provides clients the ability to segregate traffic flows for different types of data. RDMA CM abstracts it for ULPs using rdma_set_service_type(). Internally, each traffic flow is represented by a connection with all of its independent resources, like that of a normal connection, and is differentiated by service type. In other words, there can be multiple qp connections between an IP pair, each supporting a unique service type. One use of TOS is bandwidth management, which allows setting bandwidth limits for QoS classes, e.g. 80% bandwidth to controllers at QoS class A and 20% to controllers at QoS class B. Note: in addition to the TOS configuration, QoS must be configured on the relevant HCA on both the target (which sends RDMA commands) and the initiator for it to affect the traffic.
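In code terms, the change below boils down to applying the service type to the CM ID before resolving the route, since that is when the QoS class is actually chosen. A condensed kernel-context sketch of that flow (error handling trimmed; the timeout constant is illustrative):

#include <rdma/rdma_cm.h>

/* condensed from nvme_rdma_addr_resolved() below: the service type must
 * be set on the cm_id before rdma_resolve_route() runs */
static int set_tos_and_resolve(struct rdma_cm_id *cm_id, int tos)
{
        if (tos >= 0)   /* < 0 == use transport default */
                rdma_set_service_type(cm_id, tos);

        return rdma_resolve_route(cm_id, 2000 /* ms, illustrative */);
}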
usage examples: nvme connect --tos=0 --transport=rdma --traddr=10.0.1.1 --nqn=test-nvme Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 10e3bcdb7180..5143e2a5d54c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1534,16 +1534,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { + struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; int ret; ret = nvme_rdma_create_queue_ib(queue); if (ret) return ret; + if (ctrl->opts->tos >= 0) + rdma_set_service_type(queue->cm_id, ctrl->opts->tos); ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { - dev_err(queue->ctrl->ctrl.device, - "rdma_resolve_route failed (%d).\n", + dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", queue->cm_error); goto out_destroy_queue; } @@ -2038,7 +2040,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | - NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_rdma_create_ctrl, }; -- cgit v1.2.3-58-ga151 From 9924b0304ab278406aaee9184a1a2032b4778c65 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 18 Aug 2019 12:08:53 +0300 Subject: nvme-tcp: Use struct nvme_ctrl directly This patch doesn't change any functionality. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1e2e5ab3875f..1ae985ec7cec 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1258,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->queue_size = queue_size; if (qid > 0) - queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + queue->cmnd_capsule_len = nctrl->ioccsz * 16; else queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; @@ -1266,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to create socket: %d\n", ret); return ret; } @@ -1276,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_SYNCNT sock opt %d\n", ret); goto err_sock; } @@ -1286,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_NODELAY sock opt %d\n", ret); goto err_sock; } @@ -1299,7 +1299,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, (char *)&sol, sizeof(sol)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set 
SO_LINGER sock opt %d\n", ret); goto err_sock; } @@ -1317,11 +1317,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->pdu_offset = 0; sk_set_memalloc(queue->sock->sk); - if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to bind queue %d socket %d\n", qid, ret); goto err_sock; @@ -1333,7 +1333,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, if (queue->hdr_digest || queue->data_digest) { ret = nvme_tcp_alloc_crypto(queue); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to allocate queue %d crypto\n", qid); goto err_sock; } @@ -1347,13 +1347,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_crypto; } - dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to connect socket: %d\n", ret); goto err_rcv_pdu; } -- cgit v1.2.3-58-ga151 From bb13985d5a55b2830095640a2e64145bcb34929b Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 18 Aug 2019 12:08:54 +0300 Subject: nvme-tcp: Add TOS for tcp transport TOS provides clients the ability to segregate traffic flows for different types of data. One use of TOS is bandwidth management, which allows setting bandwidth limits for QoS classes, e.g. 80% bandwidth to controllers at QoS class A and 20% to controllers at QoS class B. Usage example: nvme connect --tos=0 --transport=tcp --traddr=10.0.1.1 --nqn=test-nvme Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1ae985ec7cec..a9c3f28eedd7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1304,6 +1304,18 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_sock; } + /* Set socket type of service */ + if (nctrl->opts->tos >= 0) { + opt = nctrl->opts->tos; + ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(nctrl->device, + "failed to set IP_TOS sock opt %d\n", ret); + goto err_sock; + } + } + queue->sock->sk->sk_allocation = GFP_ATOMIC; if (!qid) n = 0; @@ -2344,7 +2356,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | - NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_tcp_create_ctrl, }; -- cgit v1.2.3-58-ga151 From 89275a9659fe57a3c7eef6778ec64f9e435c75eb Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 18 Aug 2019 12:08:55 +0300 Subject: nvmet-tcp: Add TOS for tcp transport Set the outgoing packets' type of service (TOS) according to the received TOS.
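For readers more familiar with the userspace sockets API than with kernel_setsockopt(), the same echo idea looks like this; here rcv_tos is assumed to have been obtained already (e.g. from an IP_RECVTOS control message):

#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>

/* userspace analogue of the change below: propagate the TOS observed on
 * incoming packets to outgoing ones; rcv_tos <= 0 means nothing to copy */
static int echo_tos(int fd, int rcv_tos)
{
        if (rcv_tos <= 0)
                return 0;
        return setsockopt(fd, IPPROTO_IP, IP_TOS, &rcv_tos, sizeof(rcv_tos));
}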
Signed-off-by: Israel Rukshin Suggested-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 76e43750b9e5..bf4f03474e89 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1415,6 +1415,7 @@ done: static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; + struct inet_sock *inet = inet_sk(sock->sk); struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; @@ -1438,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) if (ret) return ret; + /* Set socket type of service */ + if (inet->rcv_tos > 0) { + int tos = inet->rcv_tos; + + ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, + (char *)&tos, sizeof(tos)); + if (ret) + return ret; + } + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; queue->data_ready = sock->sk->sk_data_ready; -- cgit v1.2.3-58-ga151 From e01f91dff91c7b16a6e3faf2565017d497a73f83 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Fri, 16 Aug 2019 13:00:10 -0700 Subject: nvme-multipath: fix ana log nsid lookup when nsid is not found ANA log parsing invokes nvme_update_ana_state() per ANA group desc. This updates the state of namespaces with nsids in desc->nsids[]. Both the ctrl->namespaces list and the desc->nsids[] array are sorted by nsid, so nvme_update_ana_state() performs a single walk over ctrl->namespaces: - if the current namespace matches the current desc->nsids[n], this namespace is updated, and n is incremented. - the process stops when it reaches the end of either ctrl->namespaces or desc->nsids[]. In case desc->nsids[n] does not match any of ctrl->namespaces, the remaining nsids following desc->nsids[n] will not be updated. Such a situation was considered abnormal and generated a WARN_ON_ONCE. However, the ANA log MAY contain nsids not (yet) found in ctrl->namespaces. For example, let's consider the following scenario: - nvme0 exposes namespaces with nsids = [2, 3] to the host - a new namespace with nsid = 1 is added dynamically - also, an ANA topology change is triggered - an NS_CHANGED AEN is generated and triggers scan_work - before scan_work discovers nsid=1 and creates a namespace, a NOTICE_ANA AEN is issued and ana_work receives an ANA log with nsids=[1, 2, 3] Result: ana_work fails to update the ANA state on the existing namespaces [2, 3]. Solution: Change the way the nvme_update_ana_state() namespace list walk checks the current namespace against desc->nsids[n] as follows: a) ns->head->ns_id < desc->nsids[n]: keep walking ctrl->namespaces. b) ns->head->ns_id == desc->nsids[n]: match, update the namespace and advance n. c) ns->head->ns_id > desc->nsids[n]: desc->nsids[n] has no matching namespace, skip to desc->nsids[n+1]. This enables correct operation in the scenario described above. This also allows the ANA log to contain nsids currently invisible to the host, i.e. inactive nsids.
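A standalone model of the new walk over the two sorted lists, with cases (a)-(c) marked; the example includes a trailing inactive nsid, which the old code would have flagged with the WARN_ON_ONCE:

#include <stdio.h>

/* ns[] models ctrl->namespaces, nsids[] models desc->nsids[];
 * both are sorted by nsid, as in the driver */
static void update_ana_state(const unsigned *ns, int ns_count,
                             const unsigned *nsids, int nr_nsids)
{
        int n = 0;

        for (int i = 0; i < ns_count; i++) {
                if (ns[i] < nsids[n])
                        continue;                          /* (a) keep walking */
                if (ns[i] == nsids[n])
                        printf("update nsid %u\n", ns[i]); /* (b) match */
                if (++n == nr_nsids)                       /* (b)+(c) consume nsids[n] */
                        break;
        }
}

int main(void)
{
        unsigned ns[] = { 1, 2, 3 };    /* namespaces known to the host */
        unsigned nsids[] = { 2, 3, 7 }; /* ANA log; nsid 7 is inactive */

        /* updates nsids 2 and 3; nsid 1 is skipped by (a) and the
         * trailing inactive nsid 7 no longer triggers a warning */
        update_ana_state(ns, 3, nsids, 3);
        return 0;
}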
Signed-off-by: Anton Eidelman Reviewed-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/multipath.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 4f0d0d12744e..961011abd2ab 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -444,14 +444,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + unsigned nsid = le32_to_cpu(desc->nsids[n]); + + if (ns->head->ns_id < nsid) continue; - nvme_update_ns_ana_state(desc, ns); + if (ns->head->ns_id == nsid) + nvme_update_ns_ana_state(desc, ns); if (++n == nr_nsids) break; } up_write(&ctrl->namespaces_rwsem); - WARN_ON_ONCE(n < nr_nsids); return 0; } -- cgit v1.2.3-58-ga151 From 35fe0d12c8a3d5e45f297562732ddc9ba9dc58dd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 24 Jul 2019 15:47:55 +0200 Subject: nvme: trace bio completion When native multipathing is enabled we cannot enable blktrace for the underlying paths, so any completion is never traced. Signed-off-by: Hannes Reinecke [fixed-up by Mikhail for non-multipath-build] Signed-off-by: Mikhail Skorzhinskii Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++++--- drivers/nvme/host/nvme.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 35311d343a13..4660505eded9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -22,12 +22,12 @@ #include #include -#define CREATE_TRACE_POINTS -#include "trace.h" - #include "nvme.h" #include "fabrics.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + #define NVME_MINORS (1U << MINORBITS) unsigned int admin_timeout = 60; @@ -279,6 +279,8 @@ void nvme_complete_rq(struct request *req) return; } } + + nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7c86e4bcd271..9656f863ea40 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -16,6 +16,8 @@ #include #include +#include + extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -511,6 +513,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ + struct nvme_ns *ns = req->q->queuedata; + + if (req->cmd_flags & REQ_NVME_MPATH) + trace_block_bio_complete(ns->head->disk->queue, + req->bio, status); +} + extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; @@ -554,6 +566,10 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ +} static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { -- cgit v1.2.3-58-ga151 From 8a1d09a668e7e245b8e4131cc9017c63fee02ee5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Aug 
2019 17:51:19 +1000 Subject: nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros This will make it easier to handle variable queue entry sizes later. No functional change. Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Christoph Hellwig Reviewed-by: Minwoo Im Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bf54b128c5a4..a09e6c4e3434 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,8 +28,8 @@ #include "trace.h" #include "nvme.h" -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define SQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_command)) +#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) @@ -1344,16 +1344,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) static void nvme_free_queue(struct nvme_queue *nvmeq) { - dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); if (!nvmeq->sq_cmds) return; if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), - nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } else { - dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1433,12 +1433,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, } static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, - int qid, int depth) + int qid) { struct pci_dev *pdev = to_pci_dev(dev->dev); if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); if (nvmeq->sq_cmds) { nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); @@ -1447,11 +1447,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } - pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } } - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; @@ -1465,12 +1465,13 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) if (dev->ctrl.queue_count > qid) return 0; - nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth), + nvmeq->q_depth = depth; + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) goto free_cqdma; nvmeq->dev = dev; @@ -1479,15 +1480,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - nvmeq->q_depth = depth; nvmeq->qid = qid; dev->ctrl.queue_count++; return 0; free_cqdma: - dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, + 
nvmeq->cq_dma_addr); free_nvmeq: return -ENOMEM; } @@ -1515,7 +1515,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; wmb(); /* ensure the first interrupt sees the initialization */ -- cgit v1.2.3-58-ga151 From c1e0cc7e1d319936271dfdd0a9405275c8091381 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Aug 2019 17:51:20 +1000 Subject: nvme-pci: Add support for variable IO SQ element size Per the spec, the size of a submission queue element should always be 64 bytes (an SQES value of 6, since the field is specified as a power of two). However, some controllers, such as Apple's, do not properly implement the standard and require a different size. This provides the groundwork for the subsequent quirks for these controllers. Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 11 ++++++++--- include/linux/nvme.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a09e6c4e3434..eee93e138c2c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,7 +28,7 @@ #include "trace.h" #include "nvme.h" -#define SQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_command)) +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) @@ -100,6 +100,7 @@ struct nvme_dev { unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; + int io_sqes; u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; - struct nvme_command *sq_cmds; + void *sq_cmds; /* only used for poll queues: */ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; @@ -178,6 +179,7 @@ struct nvme_queue { u16 last_cq_head; u16 qid; u8 cq_phase; + u8 sqes; unsigned long flags; #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 @@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), + cmd, sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; nvme_write_sq_db(nvmeq, write_sq); @@ -1465,6 +1468,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) if (dev->ctrl.queue_count > qid) return 0; + nvmeq->sqes = qid ?
dev->io_sqes : NVME_ADM_SQES; nvmeq->q_depth = depth; nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), &nvmeq->cq_dma_addr, GFP_KERNEL); @@ -2317,6 +2321,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; + dev->io_sqes = NVME_NVM_IOSQES; /* * Temporary fix for the Apple controller found in the MacBook8,1 and diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 32c25b46ae63..f61d6906e59d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -140,6 +140,7 @@ enum { * Submission and Completion Queue Entry Sizes for the NVM command set. * (In bytes and specified as a power of two (2^n)). */ +#define NVME_ADM_SQES 6 #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 -- cgit v1.2.3-58-ga151 From 66341331ba0d2de4ff421cdc401a1e34de50502a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Aug 2019 17:51:21 +1000 Subject: nvme-pci: Add support for Apple 2018+ models Based on reverse engineering and an original patch by Paul Pawlowski. This adds support for Apple's weird implementation of NVMe in their 2018 or later machines. It accounts for the twice-as-big SQ entries for the IO queues, and the fact that only interrupt vector 0 appears to function properly. Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 10 ++++++++++ drivers/nvme/host/pci.c | 21 ++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9656f863ea40..21eb48d3385d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -94,6 +94,16 @@ enum nvme_quirks { * Broken Write Zeroes. */ NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), + + /* + * Use only one interrupt vector for all queues + */ + NVME_QUIRK_SINGLE_VECTOR = (1 << 10), + + /* + * Use non-standard 128 bytes SQEs. + */ + NVME_QUIRK_128_BYTES_SQES = (1 << 11), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index eee93e138c2c..effb79341909 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2081,6 +2081,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; + /* + * Some Apple controllers require all queues to use the + * first vector. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) + irq_queues = 1; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } @@ -2321,7 +2328,16 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; - dev->io_sqes = NVME_NVM_IOSQES; + + /* + * Some Apple controllers require a non-standard SQE size. + * Interestingly they also seem to ignore the CC:IOSQES register + * so we don't bother updating it here.
+ */ + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) + dev->io_sqes = 7; + else + dev->io_sqes = NVME_NVM_IOSQES; /* * Temporary fix for the Apple controller found in the MacBook8,1 and @@ -3040,6 +3056,9 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_128_BYTES_SQES }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); -- cgit v1.2.3-58-ga151 From d38e9f04ebf667d9cb8185b45bff747485f1d3e9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Aug 2019 17:51:22 +1000 Subject: nvme-pci: Support shared tags across queues for Apple 2018 controllers Another issue with the Apple T2 based 2018 controllers seems to be that they blow up (and shut the machine down) if there's a tag collision between the IO queue and the Admin queue. My suspicion is that they use our tags for their internal tracking and don't mix them with the queue id. They also seem to not like it when tags go beyond the IO queue depth, i.e. 128 tags. This adds a quirk that marks tags 0..31 of the IO queue reserved. Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Ming Lei Acked-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 21eb48d3385d..624c3ea2134c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -104,6 +104,11 @@ enum nvme_quirks { * Use non-standard 128 bytes SQEs. */ NVME_QUIRK_128_BYTES_SQES = (1 << 11), + + /* + * Prevent tag overlap between queues + */ + NVME_QUIRK_SHARED_TAGS = (1 << 12), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index effb79341909..77bcda68fe1a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2106,6 +2106,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) unsigned long size; nr_io_queues = max_io_queues(); + + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + nr_io_queues = 1; + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -2276,6 +2284,14 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; + /* + * Some Apple controllers requires tags to be unique + * across admin and IO queue, so reserve the first 32 + * tags of the IO queue.
+ */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + dev->tagset.reserved_tags = NVME_AQ_DEPTH; + ret = blk_mq_alloc_tag_set(&dev->tagset); if (ret) { dev_warn(dev->ctrl.device, @@ -2356,6 +2372,18 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } + /* + * Controllers with the shared tags quirk need the IO queue to be + * big enough so that we get 32 tags for the admin queue + */ + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { + dev->q_depth = NVME_AQ_DEPTH + 2; + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", + dev->q_depth); + } + + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); @@ -3058,7 +3086,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | - NVME_QUIRK_128_BYTES_SQES }, + NVME_QUIRK_128_BYTES_SQES | + NVME_QUIRK_SHARED_TAGS }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); -- cgit v1.2.3-58-ga151 From e7832cb48a654cd12b2bc9181b2f0ad49d526ac6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 19:33:59 -0700 Subject: nvme: make fabrics command run on a separate request queue We have a fundamental issue that fabric commands use the admin_q. The reason is, that admin-connect, register reads and writes and admin commands cannot be guaranteed ordering while we are running controller resets. For example, when we reset a controller we perform: 1. disable the controller 2. teardown the admin queue 3. re-establish the admin queue 4. enable the controller In order to perform (3), we need to unquiesce the admin queue, however we may have some admin commands that are already pending on the quiesced admin_q and will immediate execute when we unquiesce it before we execute (4). The host must not send admin commands to the controller before enabling the controller. To fix this, we have the fabric commands (admin connect and property get/set, but not I/O queue connect) use a separate fabrics_q and make sure to quiesce the admin_q before we disable the controller, and unquiesce it only after we enable the controller. This fixes the error prints from nvmet in a controller reset storm test: kernel: nvmet: got cmd 6 while CC.EN == 0 on qid = 0 Which indicate that the host is sending an admin command when the controller is not enabled. 
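The resulting shape of a reset, condensed to its quiesce points (a kernel-context sketch, not a literal function from the patch; transport-specific teardown elided, steps numbered as above):

static void example_reset(struct nvme_ctrl *ctrl)
{
        blk_mq_quiesce_queue(ctrl->admin_q);    /* stop admin commands first */

        /* (1) disable the controller
         * (2) teardown the admin queue
         * (3) re-establish the admin queue: admin-connect and property
         *     get/set now run on ctrl->fabrics_q, so nothing pending on
         *     the quiesced admin_q can reach the not-yet-enabled controller
         * (4) enable the controller */

        blk_mq_unquiesce_queue(ctrl->admin_q);  /* admin commands may flow again */
}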
Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 8 ++++---- drivers/nvme/host/fc.c | 15 ++++++++++++--- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/rdma.c | 19 +++++++++++++++++-- drivers/nvme/host/tcp.c | 19 +++++++++++++++++-- drivers/nvme/target/loop.c | 16 +++++++++++++--- 6 files changed, 64 insertions(+), 14 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 854ce75e6c2d..145c210edb03 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.attrib = 1; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.offset = cpu_to_le32(off); cmd.prop_set.value = cpu_to_le64(val); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, @@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index ec264b2e54c3..49577a33d25b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2006,6 +2006,7 @@ nvme_fc_ctrl_free(struct kref *ref) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); kfree(ctrl->queues); @@ -2633,8 +2634,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -2655,6 +2654,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + ret = nvme_init_identify(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -3101,10 +3102,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_free_queues; ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + ret = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_admin_tag_set; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { ret = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_admin_tag_set; + 
goto out_cleanup_fabrics_q; } /* @@ -3176,6 +3183,8 @@ fail_ctrl: out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_admin_tag_set: blk_mq_free_tag_set(&ctrl->admin_tag_set); out_free_queues: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 624c3ea2134c..a818313a1f15 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,6 +181,7 @@ struct nvme_ctrl { const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct request_queue *connect_q; + struct request_queue *fabrics_q; struct device *dev; int instance; int numa_node; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5143e2a5d54c..0ef05a75c428 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -751,6 +751,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); } if (ctrl->async_event_sqe.data) { @@ -792,10 +793,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_async_qe; } + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -810,6 +817,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->ctrl.max_hw_sectors = (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + error = nvme_init_identify(&ctrl->ctrl); if (error) goto out_stop_queue; @@ -821,6 +830,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); @@ -895,7 +907,8 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_cancel_request, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); } - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + if (remove) + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -1046,6 +1059,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1858,6 +1872,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a9c3f28eedd7..2d8ba31cb691 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1703,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) nvme_tcp_stop_queue(ctrl, 0); if (remove) { blk_cleanup_queue(ctrl->admin_q); + blk_cleanup_queue(ctrl->fabrics_q); 
blk_mq_free_tag_set(ctrl->admin_tagset); } nvme_tcp_free_admin_queue(ctrl); @@ -1723,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) goto out_free_queue; } + ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->fabrics_q)) { + error = PTR_ERR(ctrl->fabrics_q); + goto out_free_tagset; + } + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); if (IS_ERR(ctrl->admin_q)) { error = PTR_ERR(ctrl->admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -1738,6 +1745,8 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; + blk_mq_unquiesce_queue(ctrl->admin_q); + error = nvme_init_identify(ctrl); if (error) goto out_stop_queue; @@ -1749,6 +1758,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->admin_tagset); @@ -1767,7 +1779,8 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, nvme_cancel_request, ctrl); blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); } - blk_mq_unquiesce_queue(ctrl->admin_q); + if (remove) + blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1894,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1910,6 +1924,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->admin_q); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index ec0bc57d26fc..9ee093b9fc74 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); } @@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) goto out_free_sq; ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } error = nvmf_connect_admin_queue(&ctrl->ctrl); @@ -376,6 +383,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + error = nvme_init_identify(&ctrl->ctrl); if (error) goto out_cleanup_queue; @@ -384,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 
out_free_tagset: blk_mq_free_tag_set(&ctrl->admin_tag_set); out_free_sq: @@ -401,14 +412,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_io_queues(ctrl); } + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_loop_destroy_admin_queue(ctrl); } -- cgit v1.2.3-58-ga151 From f2fa006f81cc5e7b5b9ef2c06dfeb50f52a47250 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 28 Aug 2019 14:11:48 +0300 Subject: nvme-pci: Tidy up nvme_unmap_data Remove pointless local variable and use rq_dma_dir macro. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 77bcda68fe1a..5c3732fd02bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -537,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - enum dma_data_direction dma_dir = rq_data_dir(req) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; if (iod->dma_len) { - dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir); + dma_unmap_page(dev->dev, dma_addr, iod->dma_len, + rq_dma_dir(req)); return; } -- cgit v1.2.3-58-ga151 From f15872c5dce43b69c3dee7739d7d3f54c54fc527 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 28 Aug 2019 14:11:49 +0300 Subject: nvme-fc: Use rq_dma_dir macro Remove code duplication. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 49577a33d25b..bafe35bdffac 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2108,7 +2108,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, struct nvme_fc_fcp_op *op) { struct nvmefc_fcp_req *freq = &op->fcp_req; - enum dma_data_direction dir; int ret; freq->sg_cnt = 0; @@ -2125,9 +2124,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); - dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, - op->nents, dir); + op->nents, rq_dma_dir(rq)); if (unlikely(freq->sg_cnt <= 0)) { sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE); freq->sg_cnt = 0; @@ -2150,8 +2148,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, return; fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, - ((rq_data_dir(rq) == WRITE) ? 
- DMA_TO_DEVICE : DMA_FROM_DEVICE)); + rq_dma_dir(rq)); nvme_cleanup_cmd(rq); -- cgit v1.2.3-58-ga151 From bc31c1eea99de9a8e65b011483716236af52f7ed Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 28 Aug 2019 14:11:50 +0300 Subject: nvme-rdma: Use rq_dma_dir macro Remove code duplication. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0ef05a75c428..5e30bcf3fe37 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1146,9 +1146,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); nvme_cleanup_cmd(rq); sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); @@ -1274,7 +1272,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, - rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + rq_dma_dir(rq)); if (unlikely(count <= 0)) { ret = -EIO; goto out_free_table; @@ -1303,9 +1301,7 @@ out: return 0; out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); out_free_table: sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); return ret; -- cgit v1.2.3-58-ga151 From 1c0d12c0b1a1a09fdfbc8e00c456581d04829915 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 18:04:12 -0700 Subject: nvme: fail cancelled commands with NVME_SC_HOST_PATH_ERROR NVME_SC_ABORT_REQ means that the request was aborted because an abort command was received. In our case, this is a transport cancellation, so a host pathing error is much more appropriate. Also, convert NVME_SC_HOST_PATH_ERROR to BLK_STS_TRANSPORT so that callers can understand that the status is a transport-related error. This will be used by the ns scanning code to tell whether it got an error from the controller or the controller simply happens to be unreachable via the transport.
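A sketch of what the mapping buys the consumer side (kernel context; the helper name is made up, but BLK_STS_TRANSPORT is the real block-layer status):

/* hypothetical caller, e.g. in the ns scanning path */
static bool example_error_is_path_related(blk_status_t status)
{
        /* transport unreachable: back off and retry, don't drop the ns */
        return status == BLK_STS_TRANSPORT;
}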
Reviewed-by: Minwoo Im Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4660505eded9..066aeecca5d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -226,6 +226,8 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_PROTECTION; case NVME_SC_RESERVATION_CONFLICT: return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; default: return BLK_STS_IOERR; } @@ -294,7 +296,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) if (blk_mq_request_completed(req)) return true; - nvme_req(req)->status = NVME_SC_ABORT_REQ; + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; blk_mq_complete_request(req); return true; } -- cgit v1.2.3-58-ga151 From 16686010085f46783c895f8736e4f0ae74ae88a0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 18:17:52 -0700 Subject: nvme-tcp: fail command with NVME_SC_HOST_PATH_ERROR send failed This is a more appropriate error status for a transport error detected by us (the host). Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2d8ba31cb691..0a0263a364f2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -842,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) -- cgit v1.2.3-58-ga151 From 74bd8cbe7dd64c50623cf0dc20eaeaab6b68d3d6 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 6 Aug 2019 00:14:06 -0700 Subject: nvme-fc: Fail transport errors with NVME_SC_HOST_PATH NVME_SC_INTERNAL should indicate an internal controller error, not a host transport error. These errors will propagate to upper layers (essentially nvme core) and be interpreted as transport errors, which should not be taken into account for namespace state or condition.
Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index bafe35bdffac..265f89e11d8b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) sizeof(op->rsp_iu), DMA_FROM_DEVICE); if (opstate == FCPOP_STATE_ABORTED) - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - else if (freq->status) - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + else if (freq->status) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to lldd error %d\n", + ctrl->cnum, freq->status); + } /* * For the linux implementation, if we have an unsuccesful @@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) * no payload in the CQE by the transport. */ if (freq->transferred_length != - be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + be32_to_cpu(op->cmd_iu.data_len)) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad transfer " + "length: %d vs expected %d\n", + ctrl->cnum, freq->transferred_length, + be32_to_cpu(op->cmd_iu.data_len)); goto done; } result.u64 = 0; @@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad NVMe_ERSP: " + "iu len %d, xfr len %d vs %d, status code " + "%d, cmdid %d vs %d\n", + ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len), + be32_to_cpu(op->rsp_iu.xfrd_len), + freq->transferred_length, + op->rsp_iu.status_code, + sqe->common.command_id, + cqe->command_id); goto done; } result = cqe->result; @@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu " + "len %d\n", + ctrl->cnum, freq->rcv_rsplen); goto done; } -- cgit v1.2.3-58-ga151 From 2f9c173647753b81eed7c198abf7622ab22dc49d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 29 Aug 2019 12:53:15 -0700 Subject: nvme: pass status to nvme_error_status No need for the full blown request structure. 
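As a sketch of the new calling convention, and of what the u16 actually carries: nvme_req(req)->status holds the CQE status shifted right by one, so bit 14 is DNR, bit 13 is More, and the low 11 bits (0x7ff) are the status code type plus status code, which is all the switch statement looks at. The wrapper below is hypothetical, for illustration only:

/* Hypothetical helper showing the u16-based interface. */
static blk_status_t nvme_req_to_blk_status(struct request *req)
{
	/*
	 * nvme_error_status() masks with 0x7ff itself, so the raw
	 * value can be passed straight through.
	 */
	return nvme_error_status(nvme_req(req)->status);
}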
Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 066aeecca5d2..2797d38d2dca 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -197,9 +197,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns) return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); } -static blk_status_t nvme_error_status(struct request *req) +static blk_status_t nvme_error_status(u16 status) { - switch (nvme_req(req)->status & 0x7ff) { + switch (status & 0x7ff) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: @@ -262,7 +262,7 @@ static void nvme_retry_req(struct request *req) void nvme_complete_rq(struct request *req) { - blk_status_t status = nvme_error_status(req); + blk_status_t status = nvme_error_status(nvme_req(req)->status); trace_nvme_complete_rq(req); -- cgit v1.2.3-58-ga151 From 331813f687ed41347b2b7dc784d81ccdbf6f9157 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 18:11:42 -0700 Subject: nvme: make nvme_identify_ns propagate errors back Right now callers of nvme_identify_ns only know that it failed, but don't know why. Make nvme_identify_ns propagate the error back. Because nvme_submit_sync_cmd may return a positive status code, we make nvme_identify_ns receive the id by reference and return that status up the call chain, but make sure not to leak positive nvme status codes to the upper layers. Reviewed-by: Minwoo Im Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2797d38d2dca..52b453e4ae9c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1096,10 +1096,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n NVME_IDENTIFY_DATA_SIZE); } -static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns **id) { - struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -1108,18 +1107,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (!id) - return NULL; + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; - error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(id); - return NULL; + kfree(*id); } - return id; + return error; } static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, @@ -1740,13 +1738,13 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->head->ns_id); - if (!id) - return -ENODEV; + ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); + if (ret) + goto out; if (id->ncap == 0) { ret = -ENODEV; - goto out; + goto free_id; } __nvme_revalidate_disk(disk, id); @@ -1757,8 +1755,11 @@ static int nvme_revalidate_disk(struct gendisk *disk) ret = -ENODEV; } -out:
+free_id: kfree(id); +out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3329,11 +3330,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - id = nvme_identify_ns(ctrl, nsid); - if (!id) { - ret = -EIO; + ret = nvme_identify_ns(ctrl, nsid, &id); + if (ret) goto out_free_queue; - } if (id->ncap == 0) { ret = -EINVAL; @@ -3395,6 +3394,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } -- cgit v1.2.3-58-ga151 From 538af88ea7d9de241e6b6f006e9049c4d96723bb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 2 Aug 2019 18:16:12 -0700 Subject: nvme: make nvme_report_ns_ids propagate error back Make the callers check the return status and propagate back accordingly (casting to errno from a positive nvme status). Also print the return status in nvme_report_ns_ids. Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 52b453e4ae9c..f15a77dd3115 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1595,9 +1595,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); } -static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, +static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + int ret = 0; + memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) @@ -1608,10 +1610,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, ids)) + ret = nvme_identify_ns_descs(ctrl, nsid, ids); + if (ret) dev_warn(ctrl->device, - "%s: Identify Descriptors failed\n", __func__); + "Identify Descriptors failed (%d)\n", ret); } + return ret; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) @@ -1748,7 +1752,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) } __nvme_revalidate_disk(disk, id); - nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (ret) + goto free_id; + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); @@ -3176,7 +3183,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ns_id = nsid; kref_init(&head->ref); - nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + if (ret) + goto out_cleanup_srcu; ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { @@ -3201,6 +3210,8 @@ out_ida_remove: out_free_head: kfree(head); out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ERR_PTR(ret); } @@ -3224,7 +3235,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, } else { struct nvme_ns_ids ids; - nvme_report_ns_ids(ctrl, nsid, id, &ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + 
if (ret) + goto out_unlock; + if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3239,6 +3253,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } -- cgit v1.2.3-58-ga151 From 205da24343013e0bd62475800df79cd053f22326 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 30 Aug 2019 11:00:59 -0700 Subject: nvme: fix ns removal hang when failing to revalidate due to a transient error If a controller reset is racing with a namespace revalidation, the revalidation (admin) I/O will surely fail, but we should not remove the namespace as we will execute the I/O when the controller is back up. Same for spurious allocation errors (return -ENOMEM). Fix this by checking the specific error code in nvme_revalidate_disk, and if it is a transient error (for example a non-DNR nvme status, or -ENOMEM from an allocation failure), do not remove the namespace, as it will either recover when the controller is back up and schedule a subsequent scan, or the controller is going away and the namespaces will be removed anyway. This fixes a hang when namespace scanning races with a controller reset, and also spurious I/O errors in path failover conditions where the controller reset races with the namespace scan work with multipath enabled. Reported-by: Hannes Reinecke Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f15a77dd3115..fad04282148d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1765,7 +1765,13 @@ static int nvme_revalidate_disk(struct gendisk *disk) free_id: kfree(id); out: - if (ret > 0) + /* + * Only fail the function if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + */ + if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) + ret = 0; + else if (ret > 0) ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } -- cgit v1.2.3-58-ga151 From c26aa572027d438de9cc311aaebcbe972f698c24 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 3 Sep 2019 14:20:37 -0700 Subject: nvme: Treat discovery subsystems as unique subsystems Current code matches on subnqn and collapses all controllers with the same subnqn into a single subsystem structure. This is good for recognizing multiple controllers for the same subsystem. But with the well-known discovery subnqn, the subsystems aren't truly the same subsystem. As such, subsystem-specific rules, such as no overlap of controller id, do not apply. With today's behavior, the check for overlap of controller id can fail, preventing the new discovery controller from being created. When searching for a like subsystem nqn, exclude the discovery nqn from matching. This will result in each discovery controller being attached to a unique subsystem structure.
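For reference, the well-known discovery NQN being excluded below is a fixed string (defined in include/linux/nvme.h); any number of completely independent entities may present themselves under this one name, which is why it cannot be treated as a single subsystem:

#define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"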
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fad04282148d..0545eb97d838 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2374,6 +2374,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) lockdep_assert_held(&nvme_subsystems_lock); + /* + * Fail matches for discovery subsystems. This results + * in each discovery controller bound to a unique subsystem. + * This avoids issues with validating controller values + * that can only be true when there is a single unique subsystem. + * There may be multiple and completely independent entities + * that provide discovery controllers. + */ + if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) + return NULL; + list_for_each_entry(subsys, &nvme_subsystems, entry) { if (strcmp(subsys->subnqn, subsysnqn)) continue; -- cgit v1.2.3-58-ga151 From 03894b7a896dc6eb3870e197bd7414ab0c947cbf Mon Sep 17 00:00:00 2001 From: Edmund Nadolski Date: Tue, 3 Sep 2019 14:08:47 -0600 Subject: nvme: include admin_q sync with nvme_sync_queues nvme_sync_queues currently syncs all namespace queues, but should also sync the admin queue, if present. Signed-off-by: Edmund Nadolski Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0545eb97d838..1777c8e6dffd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4010,6 +4010,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); up_read(&ctrl->namespaces_rwsem); + + if (ctrl->admin_q) + blk_sync_queue(ctrl->admin_q); } EXPORT_SYMBOL_GPL(nvme_sync_queues); -- cgit v1.2.3-58-ga151 From 312910f4d2fed987d1f4a6cd75e86c926e9ad557 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 5 Sep 2019 15:34:35 +0100 Subject: nvme: tcp: remove redundant assignment to variable ret The variable ret is being initialized with a value that is never read and is being re-assigned immediately afterwards. The assignment is redundant and hence can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0a0263a364f2..4ffd5957637a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1824,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; - int ret = -EINVAL; + int ret; ret = nvme_tcp_configure_admin_queue(ctrl, new); if (ret) -- cgit v1.2.3-58-ga151 From 733e4b69d508d03c20adfdcf4bd27abc60fae9cc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Sep 2019 10:33:54 -0600 Subject: nvme: Assign subsys instance from first ctrl The namespace disk names must be unique for the lifetime of the subsystem. 
This was accomplished by using their parent subsystems' instances, which were allocated independently from the controllers connected to that subsystem. This allowed name prefixes assigned to namespaces to match a controller from an unrelated subsystem, which has created confusion among users examining device nodes. Ensure a namespace's subsystem instance never clashes with a controller instance of another subsystem by transferring the instance ownership to the parent subsystem from the first controller discovered in that subsystem. Reviewed-by: Logan Gunthorpe Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Minwoo Im Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1777c8e6dffd..55fc0728764e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq); struct workqueue_struct *nvme_delete_wq; EXPORT_SYMBOL_GPL(nvme_delete_wq); -static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -2345,7 +2344,8 @@ static void nvme_release_subsystem(struct device *dev) struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + if (subsys->instance >= 0) + ida_simple_remove(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -2485,12 +2485,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); if (!subsys) return -ENOMEM; - ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); - if (ret < 0) { - kfree(subsys); - return ret; - } - subsys->instance = ret; + + subsys->instance = -1; mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); @@ -2509,7 +2505,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; - dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); device_initialize(&subsys->dev); mutex_lock(&nvme_subsystems_lock); @@ -2540,6 +2536,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) goto out_put_subsystem; } + if (!found) + subsys->instance = ctrl->instance; ctrl->subsys = subsys; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); mutex_unlock(&nvme_subsystems_lock); @@ -3810,7 +3808,9 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + if (subsys && ctrl->instance != subsys->instance) + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + kfree(ctrl->effects); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4095,7 +4095,6 @@ static void __exit nvme_core_exit(void) { - ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); -- cgit v1.2.3-58-ga151 From 97b3807e93036819cabd803490c2bc6e2e58167c Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 5 Sep 2019 18:41:06 +0300 Subject:
nvme: Remove redundant assignment of cq vector The cq vector is already assigned with the correct value. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5c3732fd02bc..52205f8d90b4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1555,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) nvme_init_queue(nvmeq, qid); if (!polled) { - nvmeq->cq_vector = vector; result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; -- cgit v1.2.3-58-ga151 From 1179d337be70e67baa2d8121677c310fea4f72c3 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 6 Sep 2019 19:50:19 +0200 Subject: nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery() Simplify this function implementation by using a known function. Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Markus Elfring Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/target/discovery.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 8efca26b4776..3764a8900850 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -381,9 +381,7 @@ int __init nvmet_init_discovery(void) { nvmet_disc_subsys = nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); - if (IS_ERR(nvmet_disc_subsys)) - return PTR_ERR(nvmet_disc_subsys); - return 0; + return PTR_ERR_OR_ZERO(nvmet_disc_subsys); } void nvmet_exit_discovery(void) -- cgit v1.2.3-58-ga151 From 2d352df57bcd7fee800fa1aa7257e68bfca64d2b Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 12 Jul 2019 11:02:09 -0700 Subject: nvme-fabrics: allow discovery subsystems to accept a kato This modifies the behavior of discovery subsystems to accept a kato, in preparation for supporting discovery log change events. This also means that now every discovery controller will have a default kato value, and for non-persistent connections the host needs to pass in a zero kato value (keep_alive_tmo=0). Reviewed-by: Minwoo Im Reviewed-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 145c210edb03..74b8818ac9a1 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) * Set keep-alive timeout in seconds granularity (ms * 1000) * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + cmd.connect.kato = ctrl->kato ?
+ cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; @@ -740,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; - - if (opts->discovery_nqn && opts->kato) { - pr_err("Discovery controllers cannot accept KATO != 0\n"); - ret = -EINVAL; - goto out; - } - break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -883,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } if (opts->discovery_nqn) { - opts->kato = 0; opts->nr_io_queues = 0; opts->nr_write_queues = 0; opts->nr_poll_queues = 0; -- cgit v1.2.3-58-ga151 From 93da40239b1069ef96bbfe7c8d08edb347e8107a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 22 Aug 2019 11:25:46 -0700 Subject: nvme: enable aen regardless of the presence of I/O queues AENs in general are not related to the presence of I/O queues, so enable them regardless. Note that the only exception is that a discovery controller will not support any of the requested AENs, and nvme_enable_aen will respect that and return, so it is still safe to enable regardless. Note it is safe to enable AENs even before the initial namespace scanning, as we have the scan operation in a workqueue context. Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 55fc0728764e..573e72139331 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1200,6 +1200,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -3785,10 +3787,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->kato) nvme_start_keep_alive(ctrl); + nvme_enable_aen(ctrl); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_enable_aen(ctrl); - queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } -- cgit v1.2.3-58-ga151 From a42f42e5bb84d82f1e9890f33364c4c04997c323 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 4 Sep 2019 14:29:48 -0700 Subject: nvme: add uevent variables for controller devices When we send uevents to userspace, add controller-specific environment variables to uniquely identify the controller beyond its device name. This will be useful to address discovery log change events by actually verifying that the discovery controller is indeed the same as the device that generated the event.
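Purely as an illustration of the result, a fabrics controller might now generate uevents with an environment along these lines (the addresses are made up; "none" is emitted when an option was not given):

NVME_TRTYPE=tcp
NVME_TRADDR=192.168.10.5
NVME_TRSVCID=4420
NVME_HOST_TRADDR=none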
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 573e72139331..7d4e0c6f6d49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3635,6 +3635,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); + if (ret) + return ret; + + if (opts) { + ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_TRSVCID=%s", + opts->trsvcid ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", + opts->host_traddr ?: "none"); + } + return ret; +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -4073,6 +4100,7 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_class); goto unregister_chrdev; } + nvme_class->dev_uevent = nvme_class_uevent; nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); if (IS_ERR(nvme_subsys_class)) { -- cgit v1.2.3-58-ga151 From 85f8a4351dfd75a719a82c681739e7bcf17bbf9e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 12 Jul 2019 11:02:10 -0700 Subject: nvme: send discovery log page change events to userspace If the controller supports discovery log page change events, we want to enable them. When we see a discovery log change event we will send it up to userspace and expect userspace to handle it. Reviewed-by: Minwoo Im Reviewed-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7d4e0c6f6d49..81f8b1841b0e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1185,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ + NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -3768,6 +3769,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) queue_work(nvme_wq, &ctrl->ana_work); break; #endif + case NVME_AER_NOTICE_DISC_CHANGED: + ctrl->aen_result = result; + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } -- cgit v1.2.3-58-ga151 From 5f8badbcbeac298a77ee634a10a375f3e66923f9 Mon Sep 17 00:00:00 2001 From: Amit Date: Thu, 12 Sep 2019 08:29:39 +0300 Subject: nvmet: fix a wrong error status returned in error log page When the command data_len cannot hold all the controller errors, we should simply return as many errors as we can fit instead of failing the command.
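Reassembled for readability, and assuming nvmet_copy_to_sgl() fails once the offset passes the host-supplied transfer length, the error-log loop after this patch behaves like the sketch below: it stops copying when the host buffer is full and still completes the command successfully:

slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
	if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
			      sizeof(struct nvme_error_slot)))
		break;	/* host buffer exhausted: partial page, not an error */
	if (slot == 0)
		slot = NVMET_ERROR_LOG_SLOTS - 1;
	else
		slot--;
	offset += sizeof(struct nvme_error_slot);
}
nvmet_req_complete(req, 0);	/* success even for a partial page */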
Signed-off-by: Amit Engel Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 51800a9ce9a9..831a062d27cb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -37,7 +37,6 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) static void nvmet_execute_get_log_page_error(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; - u16 status = NVME_SC_SUCCESS; unsigned long flags; off_t offset = 0; u64 slot; @@ -47,9 +46,8 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { - status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], - sizeof(struct nvme_error_slot)); - if (status) + if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot))) break; if (slot == 0) @@ -59,7 +57,7 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) offset += sizeof(struct nvme_error_slot); } spin_unlock_irqrestore(&ctrl->error_lock, flags); - nvmet_req_complete(req, status); + nvmet_req_complete(req, 0); } static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, -- cgit v1.2.3-58-ga151