From a87835e9ecbd29ddffdd5bae81dc913afc221abd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:11 -0700 Subject: nvme-core: use u16 type for directives In nvme_configure_directives(), when calculating the number of streams, use u16 instead of unsigned in the min_t(), since the target variable ctrl->nr_streams is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cd763f31864b..c4ba51dd213f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -555,7 +555,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) goto out_disable_stream; } - ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; -- cgit v1.2.3-58-ga151 From d4047cf99421d434660a5a0c61ac3e83b4ad0dad Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:12 -0700 Subject: nvme-core: use u16 type for ctrl->sqsize In nvme_init_identify(), when calculating the submission queue size, use u16 instead of int in the min_t(), since the target variable ctrl->sqsize is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4ba51dd213f..c1de05646a02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2873,7 +2873,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); -- cgit v1.2.3-58-ga151 From 61f3b89630973037f67d8e25e5d26e80a51a7b37 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 17 Jun 2020 10:05:13 +0200 Subject: nvme-pci: use unsigned for io queue depth The NVMe PCIe driver declares the module parameter io_queue_depth as int. Change it to unsigned int, since a queue depth can never be negative, and update the getter from param_get_int() to param_get_uint(). In the setter, change the type of n from int to u16, parse with kstrtou16() instead of kstrtoint(), and store with param_set_ushort() instead of param_set_int(). Finally, update the q_depth member of struct nvme_dev to u16, use u16 in the min_t() that calculates dev->q_depth in nvme_pci_enable() (since q_depth is now u16), and use unsigned int instead of int in the min_t() that calculates dev->tagset.queue_depth in nvme_dev_add(), since the target variable tagset->queue_depth is of type unsigned int.
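A minimal userspace sketch of the rule all three of these conversions follow, with a simplified stand-in for the kernel's min_t() (the values are illustrative, not real controller data): choosing the min_t() type to match the destination variable means the comparison happens at the destination's width, so nothing is silently narrowed at assignment time.

#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for the kernel's min_t() */
#define min_t(type, x, y) ({ \
        type __x = (type)(x); \
        type __y = (type)(y); \
        __x < __y ? __x : __y; })

int main(void)
{
        uint32_t nssa = 42;     /* illustrative stand-in for ctrl->nssa */
        uint16_t nr_streams;    /* the u16 destination */

        /* compare at u16 width, matching the variable we store into */
        nr_streams = min_t(uint16_t, nssa, 15);
        printf("%d\n", nr_streams);     /* 15 */
        return 0;
}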
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 74a2e2e00794..7ed1e2dfbee6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -61,10 +61,10 @@ MODULE_PARM_DESC(sgl_threshold, static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, - .get = param_get_int, + .get = param_get_uint, }; -static int io_queue_depth = 1024; +static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); @@ -115,7 +115,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - int q_depth; + u16 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -151,13 +151,14 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int n = 0, ret; + int ret; + u16 n; - ret = kstrtoint(val, 10, &n); + ret = kstrtou16(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_int(val, kp); + return param_set_ushort(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -2261,8 +2262,8 @@ static void nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; @@ -2321,7 +2322,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); -- cgit v1.2.3-58-ga151 From 9dc54a0d15665088f4c4ee30df5d2a94f03be3fa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:14 -0700 Subject: nvme-pci: code cleanup for nvme_alloc_host_mem() Although use of a for loop is preferred, it is not common practice to have an 80-character-long for loop initialization and comparison section. Use temporary variables to calculate the loop bounds and use them in the for loop, declaring all of them as u64 since the preferred variable is declared as u64.
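In isolation, the loop shape after the cleanup looks like the sketch below; try_alloc() and the sizes are hypothetical stand-ins for __nvme_alloc_host_mem() and the host memory buffer limits, but the start-big-and-halve retry with hoisted u64 bounds is the same.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical allocator that only succeeds for chunks <= 8 MiB */
static bool try_alloc(uint64_t chunk_size)
{
        return chunk_size <= (8ULL << 20);
}

int main(void)
{
        uint64_t min_chunk = 64ULL << 20;       /* hoisted upper bound */
        uint64_t floor = 1ULL << 20;            /* hoisted lower bound */
        uint64_t chunk_size;

        /* start big and work our way down */
        for (chunk_size = min_chunk; chunk_size >= floor; chunk_size /= 2) {
                if (try_alloc(chunk_size)) {
                        printf("using %llu MiB chunks\n",
                               (unsigned long long)(chunk_size >> 20));
                        break;
                }
        }
        return 0;
}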
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7ed1e2dfbee6..cd14b1a0ef90 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1933,12 +1933,12 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - u32 chunk_size; + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - chunk_size /= 2) { + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; -- cgit v1.2.3-58-ga151 From ad509996432e54284d35f2e42b0f78273cbef27d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:02 -0700 Subject: nvme-pci: remove the empty line at the beginning of nvme_should_reset() Just cleanup by removing the empty line. Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cd14b1a0ef90..b3538141ec11 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1155,7 +1155,6 @@ static void abort_endio(struct request *req, blk_status_t error) static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) { - /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset. */ -- cgit v1.2.3-58-ga151 From b261b61c9e03ddf21464cda777a2457113b118be Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:01 -0700 Subject: nvmet-loop: remove unused 'target_ctrl' in nvme_loop_ctrl This field is never used since the introduction of nvme loopback by commit 3a85a5de29ea ("nvme-loop: add a NVMe loopback host driver"). Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 8a0d4fe7bc18..f2c80a51985f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -36,7 +36,6 @@ struct nvme_loop_ctrl { struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; - struct nvmet_ctrl *target_ctrl; struct nvmet_port *port; }; -- cgit v1.2.3-58-ga151 From 4e102559725683a4bf7240db8ed12b6b64cfccbf Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:45 +0300 Subject: nvmet-tcp: remove has_keyed_sgls initialization Since the nvmet_tcp_ops is static, there is no need to initialize values to zero. 
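The language rule behind this cleanup, in a standalone sketch (the struct here is illustrative): objects with static storage duration are zero-initialized by the C standard, so any member left out of a designated initializer is already 0.

#include <stdio.h>

struct ops {
        int msdbd;
        int has_keyed_sgls;
};

/* static storage duration: members not named below start out zero */
static const struct ops tcp_ops = {
        .msdbd = 1,
        /* .has_keyed_sgls is implicitly 0 */
};

int main(void)
{
        printf("%d %d\n", tcp_ops.msdbd, tcp_ops.has_keyed_sgls); /* 1 0 */
        return 0;
}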
Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index de9217cfd22d..1ce22b698f4d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1717,7 +1717,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, .msdbd = 1, - .has_keyed_sgls = 0, .add_port = nvmet_tcp_add_port, .remove_port = nvmet_tcp_remove_port, .queue_response = nvmet_tcp_queue_response, -- cgit v1.2.3-58-ga151 From 6fa350f7145677cf6d0b86eff33eb2e3e246770c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:46 +0300 Subject: nvmet: introduce flags member in nvmet_fabrics_ops Replace has_keyed_sgls and metadata_support booleans with a flags member that will be used for adding more features in the future. Signed-off-by: Max Gurtovoy Reviewed-by: Himanshu Madhani Reviewed-by: Israel Rukshin Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 5 +++-- drivers/nvme/target/rdma.c | 3 +-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1db8c0498668..95bb3bc4e335 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -427,7 +427,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awupf = 0; id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6816507fba58..9cdc39c8b729 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -336,7 +336,7 @@ int nvmet_enable_port(struct nvmet_port *port) * If the user requested PI support and the transport isn't pi capable, * don't enable the port. 
*/ - if (port->pi_enable && !ops->metadata_support) { + if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) { pr_err("T10-PI is not supported by transport type %d\n", port->disc_addr.trtype); ret = -EINVAL; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 40cf0b6e6c9d..f40c05c33c3a 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 809691291e73..6f8bd6a93575 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -286,8 +286,9 @@ struct nvmet_fabrics_ops { struct module *owner; unsigned int type; unsigned int msdbd; - bool has_keyed_sgls : 1; - bool metadata_support : 1; + unsigned int flags; +#define NVMF_KEYED_SGLS (1 << 0) +#define NVMF_METADATA_SUPPORTED (1 << 1) void (*queue_response)(struct nvmet_req *req); int (*add_port)(struct nvmet_port *port); void (*remove_port)(struct nvmet_port *port); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76ea23a2c2be..6731e0349480 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, - .has_keyed_sgls = 1, - .metadata_support = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, -- cgit v1.2.3-58-ga151 From a0f0dbaa6986d6777a4a6835ec91616d5c75ac25 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 11 Jun 2020 18:16:59 -0700 Subject: nvmet: use unsigned type for u64 In nvmet_subsys_attr_version_show(), which uses the NVME_XXX() macros on the subsystem version (of type u64), get rid of the int type casts when printing the subsys version and use the appropriate format specifier for u64 instead.
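A userspace sketch of the same fix, reusing the version macro definitions from include/linux/nvme.h; the kernel diff uses %llu, while portable userspace code would reach for PRIu64:

#include <inttypes.h>
#include <stdio.h>

/* as defined in include/linux/nvme.h */
#define NVME_VS(major, minor, tertiary) \
        (((major) << 16) | ((minor) << 8) | (tertiary))
#define NVME_MAJOR(ver)         ((ver) >> 16)
#define NVME_MINOR(ver)         (((ver) >> 8) & 0xff)
#define NVME_TERTIARY(ver)      ((ver) & 0xff)

int main(void)
{
        uint64_t ver = NVME_VS(1, 4, 0);

        /* print the u64 fields with a 64-bit specifier, no int casts */
        printf("%" PRIu64 ".%" PRIu64 ".%" PRIu64 "\n",
               NVME_MAJOR(ver), NVME_MINOR(ver), NVME_TERTIARY(ver));
        return 0;
}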
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 419e0d4ce79b..cdec47de89ed 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -862,14 +862,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); if (NVME_TERTIARY(subsys->ver)) - return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver), - (int)NVME_TERTIARY(subsys->ver)); - - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, -- cgit v1.2.3-58-ga151 From ca8f4beebfb4979acb664e02365ef6e662ef95b0 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 25 May 2020 21:21:18 -0700 Subject: nvme-fcloop: verify wwnn and wwpn format The nvme host and target verify the wwnn and wwpn format via nvme_fc_parse_traddr(). For instance, the length of a wwnn is required to be either 21 ("nn-0x" plus 16 hex digits) or 19 ("nn-" plus 16 hex digits). Add this verification to nvme-fcloop so that the input must always be in hex and the length of the input must always be 18. Otherwise, the user may use e.g. 0x2 to create an fcloop local port, while 0x0000000000000002 is required for the nvme host and target, which makes the format requirement confusing.
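Reduced to userspace, the new check looks like the sketch below, assuming NVME_FC_TRADDR_HEXNAMELEN is 16 as in the nvme-fc headers; validating that the 16 name characters are actually hex digits is still left to the match_u64() parse that follows it.

#include <stdio.h>
#include <string.h>

#define NVME_FC_TRADDR_HEXNAMELEN 16

/* accept only "0x" followed by exactly 16 characters, 18 total */
static int verify_addr(const char *addr)
{
        if (strlen(addr) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
            strncmp(addr, "0x", 2))
                return -1;
        return 0;
}

int main(void)
{
        printf("%d\n", verify_addr("0x2"));                 /* -1, too short */
        printf("%d\n", verify_addr("0x0000000000000002"));  /* 0, accepted */
        return 0;
}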
Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 2ff1d1334a03..c97e60b71bbc 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -43,6 +43,17 @@ static const match_table_t opt_tokens = { { NVMF_OPT_ERR, NULL } }; +static int fcloop_verify_addr(substring_t *s) +{ + size_t blen = s->to - s->from + 1; + + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 || + strncmp(s->from, "0x", 2)) + return -EINVAL; + + return 0; +} + static int fcloop_parse_options(struct fcloop_ctrl_options *opts, const char *buf) @@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->mask |= token; switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->wwnn = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->fcaddr = token; break; case NVMF_OPT_LPWWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->lpwwnn = token64; break; case NVMF_OPT_LPWWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, token = match_token(p, opt_tokens, args); switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } *nname = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } -- cgit v1.2.3-58-ga151 From 15ec928a65e0528ef4999e2947b4802b772f0891 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:22 -0700 Subject: nvme-tcp: have queue prod/cons send list become a llist The queue processing will splice to a queue-local list; this should alleviate some contention on the send_list lock, and it also prepares us for the next patch, where we look at these lists for a network stack flag optimization. Remove the queue lock as it is not used anymore.
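The produce/consume shape this moves to, sketched with C11 atomics standing in for the kernel's llist primitives: producers push lock-free (llist_add()), and the single consumer detaches the whole chain in one shot (llist_del_all()) before splicing it onto its private send_list. Since the detached chain comes back newest-first, the real code splices with list_add() to restore submission order.

#include <stdatomic.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

static _Atomic(struct node *) req_list;

/* llist_add() equivalent: returns nonzero if the list was empty */
static int push(struct node *n)
{
        struct node *old = atomic_load(&req_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&req_list, &old, n));
        return old == NULL;
}

/* llist_del_all() equivalent: detach the whole chain atomically */
static struct node *del_all(void)
{
        return atomic_exchange(&req_list, NULL);
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };

        push(&a);
        push(&b);
        for (struct node *n = del_all(); n; n = n->next)
                printf("req %d\n", n->id);      /* 2, then 1: LIFO */
        return 0;
}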
Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich [hch: simplified a loop] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7006aca89456..478868572c81 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -46,6 +46,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -75,8 +76,8 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; /* recv state */ @@ -266,10 +267,8 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, struct nvme_tcp_queue *queue = req->queue; bool empty; - spin_lock(&queue->lock); - empty = list_empty(&queue->send_list) && !queue->request; - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; /* * if we're the first on the send_list and we can try to send @@ -285,18 +284,33 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, } } +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + static inline struct nvme_tcp_request * nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del(&req->entry); return req; } @@ -1344,8 +1358,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int ret, rcv_pdu_size; queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; -- cgit v1.2.3-58-ga151 From 86f0348ace1510d7ac25124b096fb88a6ab45270 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:23 -0700 Subject: nvme-tcp: leverage request plugging blk-mq request plugging can improve the execution of our pipeline. When we queue a request we actually trigger our I/O worker thread, yielding a context switch by definition. However, if we know that more requests are coming down the pipe, we are better off not triggering our I/O worker and only doing that for the last request in the batch (bd->last). By doing so, we improve efficiency by amortizing context switches over a batch of requests.
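Stripped of the driver details, the bd->last idea is just this: wake the I/O worker once per batch instead of once per request, with the commit_rqs() callback in the diff below acting as the safety net when a batch ends without a last-flagged request. A toy sketch:

#include <stdbool.h>
#include <stdio.h>

static int wakeups;

/* stand-in for queue_work_on(queue->io_cpu, ...) */
static void kick_io_worker(void)
{
        wakeups++;
}

static void queue_rq(int req, bool last)
{
        /* ... append req to the queue's send list ... */
        if (last)
                kick_io_worker();       /* one wakeup for the whole batch */
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                queue_rq(i, i == 7);    /* bd->last only on the final one */
        printf("wakeups: %d\n", wakeups);       /* 1 instead of 8 */
        return 0;
}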
Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 478868572c81..2d3962c164a4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -262,7 +262,7 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync) + bool sync, bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -279,7 +279,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, sync && empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_try_send(queue); mutex_unlock(&queue->send_mutex); - } else { + } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } } @@ -610,7 +610,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req, false); + nvme_tcp_queue_request(req, false, true); return 0; } @@ -2120,7 +2120,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true); + nvme_tcp_queue_request(&ctrl->async_req, true, true); } static enum blk_eh_timer_return @@ -2232,6 +2232,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2251,7 +2259,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req, true); + nvme_tcp_queue_request(req, true, bd->last); return BLK_STS_OK; } @@ -2319,6 +2327,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, -- cgit v1.2.3-58-ga151 From 122e5b9f3d370ae11e1502d14ff5c7ea9b144a76 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:24 -0700 Subject: nvme-tcp: optimize network stack by setting msg flags according to batch size If we have a long list of requests to send, signal the network stack that more is coming (MSG_MORE). If we have nothing else, signal MSG_EOR.
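In isolation, the flag choice mirrors the nvme_tcp_queue_more() test added below: keep MSG_MORE set while anything else is queued so the stack can coalesce, and send MSG_EOR on the final piece so it flushes (flags from <sys/socket.h>; the helper name here is illustrative).

#include <stdbool.h>
#include <stdio.h>
#include <sys/socket.h>

static int pick_msg_flags(bool queue_has_more)
{
        int flags = MSG_DONTWAIT;

        if (queue_has_more)
                flags |= MSG_MORE;      /* more data follows: coalesce */
        else
                flags |= MSG_EOR;       /* batch is done: push it out */
        return flags;
}

int main(void)
{
        printf("more: %#x\n", pick_msg_flags(true));
        printf("last: %#x\n", pick_msg_flags(false));
        return 0;
}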
Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2d3962c164a4..b2e73e19ef01 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -79,6 +79,7 @@ struct nvme_tcp_queue { struct mutex send_mutex; struct llist_head req_list; struct list_head send_list; + bool more_requests; /* recv state */ void *pdu; @@ -277,7 +278,9 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, */ if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { + queue->more_requests = !last; nvme_tcp_try_send(queue); + queue->more_requests = false; mutex_unlock(&queue->send_mutex); } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); @@ -877,6 +880,12 @@ done: read_unlock(&sk->sk_callback_lock); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list) || queue->more_requests; +} + static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) { queue->request = NULL; @@ -898,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) bool last = nvme_tcp_pdu_last_send(req, len); int ret, flags = MSG_DONTWAIT; - if (last && !queue->data_digest) + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) flags |= MSG_EOR; else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; @@ -945,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) int flags = MSG_DONTWAIT; int ret; - if (inline_data) + if (inline_data || nvme_tcp_queue_more(queue)) flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; else flags |= MSG_EOR; @@ -1010,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = &req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; -- cgit v1.2.3-58-ga151 From b8a12e93570d8aa7fbfe2b8909eee8fd0f778afd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 12:27:16 -0700 Subject: nvmet-tcp: simplify nvmet_process_resp_list We can make it shorter and simpler without some redundant checks. 
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1ce22b698f4d..9eda91162fe4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) { struct llist_node *node; + struct nvmet_tcp_cmd *cmd; - node = llist_del_all(&queue->resp_list); - if (!node) - return; - - while (node) { - struct nvmet_tcp_cmd *cmd = llist_entry(node, - struct nvmet_tcp_cmd, lentry); - + for (node = llist_del_all(&queue->resp_list); node; node = node->next) { + cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); list_add(&cmd->entry, &queue->resp_send_list); - node = node->next; queue->send_list_len++; } } -- cgit v1.2.3-58-ga151 From f5af577d553103115579b5e404070dfab4d00332 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Jun 2020 14:49:58 +0800 Subject: nvme: use USEC_PER_SEC instead of magic numbers Use USEC_PER_SEC instead of magic numbers to make code more readable. Reviewed-by: Sagi Grimberg Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c1de05646a02..96898040a6d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2947,7 +2947,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->rtd3e) { /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, shutdown_timeout, 60); -- cgit v1.2.3-58-ga151 From 82394db7383d33641f3f565bd79792fb41b1741f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 29 Jun 2020 12:06:37 -0700 Subject: block: add capacity field to zone descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the zoned storage model, the sectors within a zone are typically all writeable. With the introduction of the Zoned Namespace (ZNS) Command Set in the NVM Express organization, the model was extended to have a specific writeable capacity. Extend the zone descriptor data structure with a zone capacity field to indicate to the user how many sectors in a zone are writeable. Introduce backward compatibility in the zone report ioctl by extending the zone report header data structure with a flags field to indicate if the capacity field is available. Reviewed-by: Jens Axboe Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Matias Bjørling Signed-off-by: Christoph Hellwig --- block/blk-zoned.c | 1 + drivers/block/null_blk_zoned.c | 2 ++ drivers/scsi/sd_zbc.c | 1 + include/uapi/linux/blkzoned.h | 15 +++++++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..81152a260354 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cc47606d8ffe..624aac09b005 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -47,6 +47,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->wp = zone->start + zone->len; zone->type = BLK_ZONE_TYPE_CONVENTIONAL; zone->cond = BLK_ZONE_COND_NOT_WP; @@ -59,6 +60,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 6f7eba66687e..183a20720da9 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, zone.non_seq = 1; zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.capacity = zone.len; zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); if (zone.type != ZBC_ZONE_TYPE_CONV && diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -73,6 +73,15 @@ enum blk_zone_cond { BLK_ZONE_COND_OFFLINE = 0xF, }; +/** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + /** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; -- cgit v1.2.3-58-ga151 From 089565fbf3bba99f91293b47b8c59ed85e00a81c Mon Sep 17 00:00:00 2001 From: Aravind Ramesh Date: Mon, 29 Jun 2020 12:06:38 -0700 Subject: null_blk: introduce zone capacity for zoned device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow emulation of a zoned device with a per zone capacity smaller than the zone size as defined in the Zoned Namespace (ZNS) Command Set specification.
The zone capacity defaults to the zone size if not specified and must be smaller than the zone size otherwise. Reviewed-by: Matias Bjørling Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Aravind Ramesh Signed-off-by: Christoph Hellwig --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 10 +++++++++- drivers/block/null_blk_zoned.c | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 81b311c9d781..daed4a9c3436 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -49,6 +49,7 @@ struct nullb_device { unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 907c6858aec0..47a9dad880af 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); + static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. 
Default: 0"); @@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) @@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_nr_conv, NULL, }; @@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n"); + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void) dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; + dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; return dev; } diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 624aac09b005..3d25c9ad2383 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } + if (!dev->zone_capacity) + dev->zone_capacity = dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; dev->nr_zones = dev_size >> (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); @@ -60,7 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; - zone->capacity = zone->len; + zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; @@ -187,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_IOERR; } + if (zone->wp + nr_sectors > zone->start + zone->capacity) + return BLK_STS_IOERR; + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; @@ -195,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return ret; zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->len) + if (zone->wp == zone->start + zone->capacity) zone->cond = BLK_ZONE_COND_FULL; return BLK_STS_OK; default: -- cgit v1.2.3-58-ga151 From 71010c30945425203da8d069a10fa45a05a00f96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 29 Jun 2020 12:06:39 -0700 Subject: nvme: implement multiple I/O Command Set support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements support for multiple I/O Command Sets. NVMe TP 4056 introduces a method to enumerate multiple command sets per namespace. If the command set is exposed, this method for enumeration will be used instead of the traditional method that uses the CC.CSS register command set register for command set identification. 
For namespaces where the Command Set Identifier is not supported or recognized, the specific namespace will not be created. Reviewed-by: Javier González Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Matias Bjørling Reviewed-by: Daniel Wagner Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 19 +++++++++++++++-- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 96898040a6d5..892291dbee64 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; c.identify.opcode = nvme_admin_identify; c.identify.nsid = cpu_to_le32(nsid); @@ -1125,7 +1138,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * device just because of a temporal retry-able error (such * as path of transport errors). 
*/ - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_SC_DNR) && !nvme_multi_css(ctrl)) status = 0; goto free_data; } @@ -1136,12 +1149,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1798,7 +1818,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1814,7 +1834,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1936,6 +1957,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + default: + dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -2270,7 +2300,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->page_size = 1 << page_shift; - ctrl->ctrl_config = NVME_CC_CSS_NVM; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5a2e8b7e0be..5573159f714d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -339,6 +339,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5ce51ab4c50e..81ffe5247505 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,6 +132,7 @@ enum { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -162,7 +163,6 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, @@ -170,6 +170,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << 
NVME_CC_AMS_SHIFT, @@ -179,6 +182,8 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -374,6 +379,8 @@ enum { NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -383,6 +390,10 @@ enum { NVME_ID_CNS_UUID_LIST = 0x17, }; +enum { + NVME_CSI_NVM = 0, +}; + enum { NVME_DIR_IDENTIFY = 0x00, NVME_DIR_STREAMS = 0x01, @@ -435,11 +446,13 @@ struct nvme_ns_id_desc { #define NVME_NIDT_EUI64_LEN 8 #define NVME_NIDT_NGUID_LEN 16 #define NVME_NIDT_UUID_LEN 16 +#define NVME_NIDT_CSI_LEN 1 enum { NVME_NIDT_EUI64 = 0x01, NVME_NIDT_NGUID = 0x02, NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; struct nvme_smart_log { @@ -972,7 +985,9 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u32 rsvd11[5]; + __u8 rsvd11[3]; + __u8 csi; + __u32 rsvd12[4]; }; #define NVME_IDENTIFY_DATA_SIZE 4096 -- cgit v1.2.3-58-ga151 From be93e87e780253780df9bb6ecc9bc1199b0d94c3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:40 -0700 Subject: nvme: support for multiple Command Sets Supported and Effects log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Commands Supported and Effects log page was extended with a CSI field that enables the host to query the log page for each command set supported. Retrieve this log page for each command set that an attached namespace supports, and save a pointer to that log in the namespace head. Reviewed-by: Matias Bjørling Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 79 ++++++++++++++++++++++++++++++++----------- drivers/nvme/host/hwmon.c | 2 +- drivers/nvme/host/lightnvm.c | 4 +-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 10 +++++- include/linux/nvme.h | 4 ++- 6 files changed, 76 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 892291dbee64..62b2cdc764da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1370,8 +1370,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects = 0; if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", @@ -2851,7 +2851,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2865,27 +2865,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) +{ + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2918,7 +2946,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -3551,6 +3579,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3891,8 +3926,8 @@ static void nvme_clear_changed_ns_log(struct 
nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -4036,8 +4071,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4174,11 +4209,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4209,6 +4249,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..23ba8bf678ae 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? 
ret : -EIO; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..74bad4e3d377 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -527,7 +527,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5573159f714d..fe9424c7097f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -191,6 +191,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -257,6 +263,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -359,6 +366,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -561,7 +569,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); extern const struct attribute_group *nvme_ns_id_attr_groups[]; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 81ffe5247505..95cd03e240a1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1101,7 +1101,9 @@ struct nvme_get_log_page_command { }; __le64 lpo; }; - __u32 rsvd14[2]; + __u8 rsvd14[3]; + __u8 csi; + __u32 rsvd15; }; struct nvme_directive_cmd { -- cgit v1.2.3-58-ga151 From 240e6ee272c07a2636dfc7d65f5bbb18377c49e5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:41 -0700 Subject: nvme: support for zoned namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined in NVM Express TP4053. Zoned namespaces are discovered based on their Command Set Identifier reported in the namespaces Namespace Identification Descriptor list. A successfully discovered Zoned Namespace will be registered with the block layer as a host managed zoned block device with Zone Append command support. A namespace that does not support append is not supported by the driver. Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Ajay Joshi Signed-off-by: Aravind Ramesh Signed-off-by: Niklas Cassel Signed-off-by: Matias Bjørling Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- block/Kconfig | 5 +- drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 97 ++++++++++++++--- drivers/nvme/host/nvme.h | 39 +++++++ drivers/nvme/host/zns.c | 254 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 111 ++++++++++++++++++++ 6 files changed, 492 insertions(+), 15 deletions(-) create mode 100644 drivers/nvme/host/zns.c diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..bbad5e8bbffe 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -86,9 +86,10 @@ config BLK_DEV_ZONED select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 62b2cdc764da..a8ee10a0cd32 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl) +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) + if (ns->disk && _nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } up_read(&ctrl->namespaces_rwsem); } @@ -1417,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) * this command. */ if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl); + nvme_update_formats(ctrl, &effects); if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); @@ -1532,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. 
*/ -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1552,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, return disk->private_data; } -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if (head) srcu_read_unlock(&head->srcu, idx); @@ -1945,23 +1979,34 @@ static void nvme_update_disk_info(struct gendisk *disk, static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + int ret; u32 iob; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; switch (ns->head->ids.csi) { case NVME_CSI_NVM: break; + case NVME_CSI_ZNS: + ret = nvme_update_zone_info(disk, ns, lbaf); + if (ret) { + dev_warn(ctrl->device, + "failed to add zoned namespace:%u ret:%d\n", + ns->head->ns_id, ret); + return ret; + } + break; default: - dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", ns->head->ids.csi, ns->head->ns_id); return -ENODEV; } @@ -1973,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -2015,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return 0; } -static int nvme_revalidate_disk(struct gendisk *disk) +static int _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2063,6 +2108,28 @@ out: return ret; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + int ret; + + ret = _nvme_revalidate_disk(disk); + if (ret) + return ret; + +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(disk->queue)) { + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + + ret = blk_revalidate_disk_zones(disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(disk->queue, + ctrl->max_zone_append); + } +#endif + return ret; +} + static char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2193,6 +2260,7 @@ static const struct block_device_operations nvme_fops = { .release = nvme_release, .getgeo = nvme_getgeo, .revalidate_disk= nvme_revalidate_disk, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2219,6 +2287,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; #endif /* CONFIG_NVME_MULTIPATH */ @@ -4446,6 +4515,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 
NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fe9424c7097f..13ca90bcd352 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -238,6 +238,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif u16 crdt[3]; u16 oncs; u16 oacs; @@ -404,6 +407,9 @@ struct nvme_ns { u16 sgs; u32 sws; u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -571,6 +577,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx); +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; @@ -693,6 +702,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) } #endif /* CONFIG_NVME_MULTIPATH */ +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf); + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +#define nvme_report_zones NULL + +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct gendisk *disk, + struct nvme_ns *ns, + unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c new file mode 100644 index 000000000000..04e5b991c00c --- /dev/null +++ b/drivers/nvme/host/zns.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#include +#include +#include "nvme.h" + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = disk->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + dev_warn(ns->ctrl->device, + "append not supported for zoned namespace:%d\n", + ns->head->ns_id); + return -EINVAL; + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. 
+ */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + struct nvme_zone_report *report, + size_t buflen) +{ + struct nvme_command c = { }; + int ret; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) + return ret; + + return le64_to_cpu(report->nr_zones); +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + ret = __nvme_ns_report_zones(ns, sector, report, buflen); + if (ret < 0) + goto out_free; + + nz = min_t(unsigned int, ret, nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = 
-EINVAL; +out_free: + kvfree(report); + return ret; +} + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + if (ns->head->ids.csi == NVME_CSI_ZNS) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + else + ret = -EINVAL; + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 95cd03e240a1..1643005d21e3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -374,6 +374,30 @@ struct nvme_id_ns { __u8 vs[3712]; }; +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_id_ns_zns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __u8 rsvd20[2796]; + struct nvme_zns_lbafe lbafe[16]; + __u8 rsvd3072[768]; + __u8 vs[256]; +}; + +struct nvme_id_ctrl_zns { + __u8 zasl; + __u8 rsvd1[4095]; +}; + enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, @@ -392,6 +416,7 @@ enum { enum { NVME_CSI_NVM = 0, + NVME_CSI_ZNS = 2, }; enum { @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr { __le16 rsvd10[3]; }; +struct nvme_zone_descriptor { + __u8 zt; + __u8 zs; + __u8 za; + __u8 rsvd3[5]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +enum { + NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2, +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 resv8[56]; + struct nvme_zone_descriptor entries[]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -626,6 +672,9 @@ enum nvme_opcode { nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, nvme_cmd_resv_release = 0x15, + nvme_cmd_zone_mgmt_send = 0x79, + nvme_cmd_zone_mgmt_recv = 0x7a, + nvme_cmd_zone_append = 0x7d, }; #define nvme_opcode_name(opcode) { opcode, #opcode } @@ -764,6 +813,7 @@ struct nvme_rw_command { enum { NVME_RW_LR = 1 << 15, NVME_RW_FUA = 1 << 14, + NVME_RW_APPEND_PIREMAP = 1 << 9, NVME_RW_DSM_FREQ_UNSPEC = 0, NVME_RW_DSM_FREQ_TYPICAL = 1, NVME_RW_DSM_FREQ_RARE = 2, @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd { __le16 appmask; }; +enum nvme_zone_mgmt_action { + NVME_ZONE_CLOSE = 0x1, + NVME_ZONE_FINISH = 0x2, + NVME_ZONE_OPEN = 0x3, + NVME_ZONE_RESET = 0x4, + NVME_ZONE_OFFLINE = 0x5, + NVME_ZONE_SET_DESC_EXT = 0x10, +}; + +struct nvme_zone_mgmt_send_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le32 cdw12; + __u8 zsa; + __u8 select_all; + __u8 rsvd13[2]; + __le32 cdw14[2]; +}; + +struct nvme_zone_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __le64 slba; + __le32 numd; + __u8 zra; + __u8 zrasf; + __u8 pr; + __u8 rsvd13; + __le32 cdw14[2]; +}; + +enum { + NVME_ZRA_ZONE_REPORT = 0, + NVME_ZRASF_ZONE_REPORT_ALL = 0, + NVME_REPORT_ZONE_PARTIAL = 1, +}; + /* Features */ enum { @@ 
-1300,6 +1397,8 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_zone_mgmt_send_cmd zms; + struct nvme_zone_mgmt_recv_cmd zmr; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -1433,6 +1532,18 @@ enum { NVME_SC_DISCOVERY_RESTART = 0x190, NVME_SC_AUTH_REQUIRED = 0x191, + /* + * I/O Command Set Specific - Zoned commands: + */ + NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8, + NVME_SC_ZONE_FULL = 0x1b9, + NVME_SC_ZONE_READ_ONLY = 0x1ba, + NVME_SC_ZONE_OFFLINE = 0x1bb, + NVME_SC_ZONE_INVALID_WRITE = 0x1bc, + NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd, + NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be, + NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf, + /* * Media and Data Integrity Errors: */ -- cgit v1.2.3-58-ga151 From 764075fdcb2f09baa23fdc4df4b79741e5c39b57 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 5 Jul 2020 00:57:55 -0700 Subject: nvme: expose reconnect_delay and ctrl_loss_tmo via sysfs This is useful information, and moreover it's useful to be able to alter these parameters per controller after it has been established. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a8ee10a0cd32..4aaffc4fa150 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3535,6 +3535,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev, } static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + else if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + +static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err || v > UINT_MAX) + return -EINVAL; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3552,6 +3612,8 @@ static
struct attribute *nvme_dev_attrs[] = { &dev_attr_sqsize.attr, &dev_attr_hostnqn.attr, &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, NULL }; -- cgit v1.2.3-58-ga151 From 972b13e29d40753a4ab2cd9735bd6ce26e91d6a6 Mon Sep 17 00:00:00 2001 From: David Fugate Date: Thu, 2 Jul 2020 15:31:22 -0600 Subject: nvme: document quirked Intel models Documented model names of Intel SSDs requiring quirks. Signed-off-by: David Fugate Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b3538141ec11..83585ed5ec1f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3079,16 +3079,16 @@ static const struct pci_error_handlers nvme_err_handler = { }; static const struct pci_device_id nvme_id_table[] = { - { PCI_VDEVICE(INTEL, 0x0953), + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a53), + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a54), + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a55), + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ -- cgit v1.2.3-58-ga151 From c25c853ef60d2c0d37420b9e4b81bdd49e90b46e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:22 +0800 Subject: nvme-pci: remove redundant segment validation We've validated the segment counts before calling nvme_map_data(), so there is no need to validate again in nvme_pci_use_sgls(), which is only called from nvme_map_data(). Signed-off-by: Baolin Wang Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 83585ed5ec1f..9216cbd2fd43 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -501,9 +501,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; - if (nseg == 0) - return false; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) -- cgit v1.2.3-58-ga151 From ee0d96d3225f5e3688391a033b9e373b5b603315 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:20 +0800 Subject: nvme-pci: fix some comments issues Fix comment typos and remove whitespace before tabs to clean up checkpatch errors. Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9216cbd2fd43..a4725b37b288 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1258,9 +1258,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue.
+ * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, @@ -2000,7 +2000,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; /* - * If there is no interupt available for queues, ensure that + * If there is no interrupt available for queues, ensure that * the default queue is set to 1. The affinity set size is * also set to one, but the irq core ignores it for this case. * -- cgit v1.2.3-58-ga151 From 4e523547e2bf755d40cb10e85795c2f9620ff3fb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:21 +0800 Subject: nvme-pci: add a blank line after declarations Add a blank line after declarations to make code more readable. Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a4725b37b288..41c2055c6fc0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1017,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; @@ -1399,6 +1400,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); @@ -2873,6 +2875,7 @@ static void nvme_reset_done(struct pci_dev *pdev) static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_disable_prepare_reset(dev, true); } @@ -3003,6 +3006,7 @@ unfreeze: static int nvme_simple_suspend(struct device *dev) { struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + return nvme_disable_prepare_reset(ndev, true); } -- cgit v1.2.3-58-ga151 From 9056fc9fc514ecd2457a59c575863ecb07c4fa5e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:23 +0800 Subject: nvme-pci: use the consistent return type of nvme_pci_iod_alloc_size() The nvme_pci_iod_alloc_size() should return 'size_t' type to be consistent with the sizeof return value. 
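As a side note (this snippet is not part of the patch), a minimal user-space sketch of the rationale: size arithmetic rooted in sizeof() has type size_t, so funneling it through a narrower return type such as unsigned int could silently truncate on 64-bit platforms. All names below are hypothetical, chosen only to mirror the pattern.

```c
#include <stddef.h>
#include <stdio.h>

/* Hypothetical helper mirroring the pattern: keep all size math in size_t. */
static size_t iod_alloc_size(size_t nseg, size_t desc_size)
{
	size_t alloc_size = sizeof(void *) + nseg * desc_size;

	return alloc_size;	/* an 'unsigned int' return type could truncate this */
}

int main(void)
{
	/* with a 32-bit return type, a very large nseg * desc_size would wrap */
	printf("%zu bytes\n", iod_alloc_size(128, sizeof(long long)));
	return 0;
}
```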
Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 41c2055c6fc0..c9083c87c6cb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -362,7 +362,7 @@ static int nvme_pci_npages_sgl(unsigned int num_seg) return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); } -static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, +static size_t nvme_pci_iod_alloc_size(struct nvme_dev *dev, unsigned int size, unsigned int nseg, bool use_sgl) { size_t alloc_size; -- cgit v1.2.3-58-ga151 From 359c1f88ab646174bf82d18454c3ee2a38462ac8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:24 +0800 Subject: nvme-pci: use standard block status symbolic names Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c9083c87c6cb..45e94f016ec2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -762,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); if (bv->bv_len > first_prp_len) cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - return 0; + return BLK_STS_OK; } static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, @@ -780,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return 0; + return BLK_STS_OK; } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, @@ -844,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); - return 0; + return BLK_STS_OK; } /* -- cgit v1.2.3-58-ga151 From 3913f4f3a65ca9ed6ba7e4678fff10a6e7b42dbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 16:18:27 +0200 Subject: nvme: remove ns->disk checks By the time a namespace is added to the ctrl->namespaces list and thus can be looked up, ns->disk has been assigned, and it is never cleared. Remove all the checks for ns->disk being NULL. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4aaffc4fa150..3d00ea4e7146 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) * Revalidating a dead namespace sets capacity to 0. This will end * buffered writers dirtying pages that can't be synced.
*/ - if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ @@ -1429,7 +1429,7 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && _nvme_revalidate_disk(ns->disk)) + if (_nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); else if (blk_queue_is_zoned(ns->disk->queue)) { /* @@ -3933,7 +3933,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk && ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -3964,7 +3964,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (ns->disk && revalidate_disk(ns->disk)) + if (revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else -- cgit v1.2.3-58-ga151 From 2eaac320db515b2ec681f6a4bad4f67a7be84ce8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 11 Jul 2020 08:46:47 +0200 Subject: rsxx: switch from 'pci_free_consistent()' to 'dma_free_coherent()' The wrappers in include/linux/pci-dma-compat.h should go away. The patch has been generated with the coccinelle script below. It has been compile tested. This also aligns code with what is in use in '/rsxx/dma.c' @@ @@ - PCI_DMA_BIDIRECTIONAL + DMA_BIDIRECTIONAL @@ @@ - PCI_DMA_TODEVICE + DMA_TO_DEVICE @@ @@ - PCI_DMA_FROMDEVICE + DMA_FROM_DEVICE @@ @@ - PCI_DMA_NONE + DMA_NONE @@ expression e1, e2, e3; @@ - pci_alloc_consistent(e1, e2, e3) + dma_alloc_coherent(&e1->dev, e2, e3, GFP_) @@ expression e1, e2, e3; @@ - pci_zalloc_consistent(e1, e2, e3) + dma_alloc_coherent(&e1->dev, e2, e3, GFP_) @@ expression e1, e2, e3, e4; @@ - pci_free_consistent(e1, e2, e3, e4) + dma_free_coherent(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_map_single(e1, e2, e3, e4) + dma_map_single(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_unmap_single(e1, e2, e3, e4) + dma_unmap_single(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4, e5; @@ - pci_map_page(e1, e2, e3, e4, e5) + dma_map_page(&e1->dev, e2, e3, e4, e5) @@ expression e1, e2, e3, e4; @@ - pci_unmap_page(e1, e2, e3, e4) + dma_unmap_page(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_map_sg(e1, e2, e3, e4) + dma_map_sg(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_unmap_sg(e1, e2, e3, e4) + dma_unmap_sg(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_single_for_cpu(e1, e2, e3, e4) + dma_sync_single_for_cpu(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_single_for_device(e1, e2, e3, e4) + dma_sync_single_for_device(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_sg_for_cpu(e1, e2, e3, e4) + dma_sync_sg_for_cpu(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_sg_for_device(e1, e2, e3, e4) + dma_sync_sg_for_device(&e1->dev, e2, e3, e4) @@ expression e1, e2; @@ - pci_dma_mapping_error(e1, e2) + dma_mapping_error(&e1->dev, e2) @@ expression e1, e2; @@ - pci_set_dma_mask(e1, e2) + dma_set_mask(&e1->dev, e2) @@ expression e1, e2; @@ - pci_set_consistent_dma_mask(e1, e2) + dma_set_coherent_mask(&e1->dev, e2) Signed-off-by: Christophe
JAILLET Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 10f6368117d8..01333af13bbb 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -562,13 +562,15 @@ static int rsxx_eeh_frozen(struct pci_dev *dev) for (i = 0; i < card->n_targets; i++) { if (card->ctrl[i].status.buf) - pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); + dma_free_coherent(&card->dev->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); if (card->ctrl[i].cmd.buf) - pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); + dma_free_coherent(&card->dev->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); } return 0; @@ -711,15 +713,15 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) failed_hw_buffers_init: for (i = 0; i < card->n_targets; i++) { if (card->ctrl[i].status.buf) - pci_free_consistent(card->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); + dma_free_coherent(&card->dev->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); if (card->ctrl[i].cmd.buf) - pci_free_consistent(card->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); + dma_free_coherent(&card->dev->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); } failed_hw_setup: rsxx_eeh_failure(dev); -- cgit v1.2.3-58-ga151 From 9a5a85972c073f720d81a7ebd08bfe278e6e16db Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Jul 2020 12:35:02 +0100 Subject: md: raid0/linear: fix dereference before null check on pointer mddev Pointer mddev is being dereferenced with a test_bit call before mddev is being null checked, this may cause a null pointer dereference. Fix this by moving the null pointer checks to sanity check mddev before it is dereferenced. Addresses-Coverity: ("Dereference before null check") Fixes: 62f7b1989c02 ("md raid0/linear: Mark array as 'broken' and fail BIOs if a member is gone") Signed-off-by: Colin Ian King Reviewed-by: Guilherme G. Piccoli Signed-off-by: Song Liu --- drivers/md/md.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8bb69c61afe0..49452149ac72 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -470,17 +470,18 @@ static blk_qc_t md_submit_bio(struct bio *bio) struct mddev *mddev = bio->bi_disk->private_data; unsigned int sectors; - if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } - blk_queue_split(&bio); - - if (mddev == NULL || mddev->pers == NULL) { + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return BLK_QC_T_NONE; } + + blk_queue_split(&bio); + if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; -- cgit v1.2.3-58-ga151 From 41d2d848e5c09209bdb57ff9c0ca34075e22783d Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 3 Jul 2020 11:13:09 +0200 Subject: md: improve io stats accounting Use generic io accounting functions to manage io stats. 
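As a rough illustration of the mechanism described below (a simplified, hypothetical user-space sketch, not the actual md code), the submission path hijacks bi_end_io once per newly arriving bio and the completion hook restores the original fields after accounting:

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Hypothetical stand-ins for the kernel structures, for illustration only. */
struct bio {
	void (*bi_end_io)(struct bio *);
	void *bi_private;
};

struct md_io {
	void (*orig_bi_end_io)(struct bio *);
	void *orig_bi_private;
	time_t start_time;
};

/* Completion hook: account the io, restore the original fields, chain on. */
static void md_end_io(struct bio *bio)
{
	struct md_io *md_io = bio->bi_private;

	printf("io took %ld s\n", (long)(time(NULL) - md_io->start_time));
	bio->bi_end_io = md_io->orig_bi_end_io;
	bio->bi_private = md_io->orig_bi_private;
	free(md_io);
	if (bio->bi_end_io)
		bio->bi_end_io(bio);
}

/* Submission path: wrap bi_end_io exactly once per newly arriving bio. */
static void submit(struct bio *bio)
{
	if (bio->bi_end_io != md_end_io) {
		struct md_io *md_io = malloc(sizeof(*md_io));

		if (!md_io)
			return;
		md_io->orig_bi_end_io = bio->bi_end_io;
		md_io->orig_bi_private = bio->bi_private;
		md_io->start_time = time(NULL);
		bio->bi_end_io = md_end_io;
		bio->bi_private = md_io;
	}
	/* the real code hands the bio to the array; here we just complete it */
	bio->bi_end_io(bio);
}

static void done(struct bio *bio)
{
	(void)bio;
	printf("original completion ran\n");
}

int main(void)
{
	struct bio b = { .bi_end_io = done, .bi_private = NULL };

	submit(&b);
	return 0;
}
```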
There was an attempt to do this earlier in commit 18c0b223cf99 ("md: use generic io stats accounting functions to simplify io stat accounting"), but it did not include a call to generic_end_io_acct() and caused issues with tracking in-flight IOs, so it was later removed in commit 74672d069b29 ("md: fix md io stats accounting broken"). This patch attempts to fix this by using both disk_start_io_acct() and disk_end_io_acct(). To make it possible, a struct md_io is allocated for every new md bio, which includes the io start_time. A new mempool is introduced for this purpose. We override bio->bi_end_io with our own callback and call disk_start_io_acct() before passing the bio to md_handle_request(). When it completes, we call disk_end_io_acct() and the original bi_end_io callback. This adds correct statistics about in-flight IOs and IO processing time, interpreted e.g. in iostat as await, svctm, aqu-sz and %util. It also fixes a situation where too many IOs were reported if a bio was re-submitted to the mddev, because io accounting is now performed only on newly arriving bios. Acked-by: Guoqing Jiang Signed-off-by: Artur Paszkiewicz Signed-off-by: Song Liu --- drivers/md/md.c | 57 +++++++++++++++++++++++++++++++++++++++++++++------------ drivers/md/md.h | 1 + 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 49452149ac72..07e5b67a2c48 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -463,12 +463,33 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); +struct md_io { + struct mddev *mddev; + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + unsigned long start_time; +}; + +static void md_end_io(struct bio *bio) +{ + struct md_io *md_io = bio->bi_private; + struct mddev *mddev = md_io->mddev; + + disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time); + + bio->bi_end_io = md_io->orig_bi_end_io; + bio->bi_private = md_io->orig_bi_private; + + mempool_free(md_io, &mddev->md_io_pool); + + if (bio->bi_end_io) + bio->bi_end_io(bio); +} + static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = bio->bi_disk->private_data; - unsigned int sectors; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); @@ -489,21 +510,27 @@ static blk_qc_t md_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - /* - * save the sectors now since our bio can - * go away inside make_request - */ - sectors = bio_sectors(bio); + if (bio->bi_end_io != md_end_io) { + struct md_io *md_io; + + md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); + md_io->mddev = mddev; + md_io->orig_bi_end_io = bio->bi_end_io; + md_io->orig_bi_private = bio->bi_private; + + bio->bi_end_io = md_end_io; + bio->bi_private = md_io; + + md_io->start_time = disk_start_io_acct(mddev->gendisk, + bio_sectors(bio), + bio_op(bio)); + } + /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); - part_stat_lock(); - part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); - part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); - part_stat_unlock(); - return BLK_QC_T_NONE; } @@ -5546,6 +5573,7 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + mempool_exit(&mddev->md_io_pool); kfree(mddev); } @@ -5641,6 +5669,11 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; + error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
sizeof(struct md_io)); + if (error) + goto abort; + error = -ENOMEM; mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) diff --git a/drivers/md/md.h b/drivers/md/md.h index 612814d07d35..c26fa8bd41e7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -481,6 +481,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ + mempool_t md_io_pool; /* Generic flush handling. * The last to finish preflush schedules a worker to submit -- cgit v1.2.3-58-ga151 From e1a86dbbbd6a77f73c3d099030495fa31f181e2f Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 14 Jul 2020 16:10:26 -0700 Subject: md: fix deadlock causing by sysfs_notify The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: ffff8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [ffffb87c4df37260] __schedule at ffffffff9a8678ec #1 [ffffb87c4df372f8] schedule at ffffffff9a867f06 #2 [ffffb87c4df37310] io_schedule at ffffffff9a0c73e6 #3 [ffffb87c4df37328] __dta___xfs_iunpin_wait_3443 at ffffffffc03a4057 [xfs] #4 [ffffb87c4df373a0] xfs_iunpin_wait at ffffffffc03a6c79 [xfs] #5 [ffffb87c4df373b0] __dta_xfs_reclaim_inode_3357 at ffffffffc039a46c [xfs] #6 [ffffb87c4df37400] xfs_reclaim_inodes_ag at ffffffffc039a8b6 [xfs] #7 [ffffb87c4df37590] xfs_reclaim_inodes_nr at ffffffffc039bb33 [xfs] #8 [ffffb87c4df375b0] xfs_fs_free_cached_objects at ffffffffc03af0e9 [xfs] #9 [ffffb87c4df375c0] super_cache_scan at ffffffff9a287ec7 #10 [ffffb87c4df37618] shrink_slab at ffffffff9a1efd93 #11 [ffffb87c4df37700] shrink_node at ffffffff9a1f5968 #12 [ffffb87c4df37788] do_try_to_free_pages at ffffffff9a1f5ea2 #13 [ffffb87c4df377f0] try_to_free_mem_cgroup_pages at ffffffff9a1f6445 #14 [ffffb87c4df37880] try_charge at ffffffff9a26cc5f #15 [ffffb87c4df37920] memcg_kmem_charge_memcg at ffffffff9a270f6a #16 [ffffb87c4df37958] new_slab at ffffffff9a251430 #17 [ffffb87c4df379c0] ___slab_alloc at ffffffff9a251c85 #18 [ffffb87c4df37a80] __slab_alloc at ffffffff9a25635d #19 [ffffb87c4df37ac0] kmem_cache_alloc at ffffffff9a251f89 #20 [ffffb87c4df37b00] alloc_inode at ffffffff9a2a2b10 #21 [ffffb87c4df37b20] iget_locked at ffffffff9a2a4854 #22 [ffffb87c4df37b60] kernfs_get_inode at ffffffff9a311377 #23 [ffffb87c4df37b80] kernfs_iop_lookup at ffffffff9a311e2b #24 [ffffb87c4df37ba8] lookup_slow at ffffffff9a290118 #25 [ffffb87c4df37c10] walk_component at ffffffff9a291e83 #26 [ffffb87c4df37c78] path_lookupat at ffffffff9a293619 #27 [ffffb87c4df37cd8] filename_lookup at ffffffff9a2953af #28 [ffffb87c4df37de8] user_path_at_empty at ffffffff9a295566 #29 [ffffb87c4df37e10] vfs_statx at ffffffff9a289787 #30 [ffffb87c4df37e70] SYSC_newlstat at ffffffff9a289d5d #31 [ffffb87c4df37f18] sys_newlstat at ffffffff9a28a60e #32 [ffffb87c4df37f28] do_syscall_64 at ffffffff9a003949 #33 [ffffb87c4df37f50] entry_SYSCALL_64_after_hwframe at ffffffff9aa001ad RIP: 00007f617a5f2905 RSP: 00007f607334f838 RFLAGS: 00000246 RAX: ffffffffffffffda RBX: 00007f6064044b20 RCX: 00007f617a5f2905 RDX: 00007f6064044b20 RSI: 00007f6064044b20 RDI: 00007f6064005890 RBP: 00007f6064044aa0 R8: 0000000000000030 R9: 000000000000011c R10: 0000000000000013 R11: 0000000000000246 R12: 
00007f606417e6d0 R13: 00007f6064044aa0 R14: 00007f6064044b10 R15: 00000000ffffffff ORIG_RAX: 0000000000000006 CS: 0033 SS: 002b PID: 927 TASK: ffff8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [ffffb87c4df07b28] __schedule at ffffffff9a8678ec #1 [ffffb87c4df07bc0] schedule at ffffffff9a867f06 #2 [ffffb87c4df07bd8] schedule_preempt_disabled at ffffffff9a86825e #3 [ffffb87c4df07be8] __mutex_lock at ffffffff9a869bcc #4 [ffffb87c4df07ca0] __mutex_lock_slowpath at ffffffff9a86a013 #5 [ffffb87c4df07cb0] mutex_lock at ffffffff9a86a04f #6 [ffffb87c4df07cc8] kernfs_find_and_get_ns at ffffffff9a311d83 #7 [ffffb87c4df07cf0] sysfs_notify at ffffffff9a314b3a #8 [ffffb87c4df07d18] md_update_sb at ffffffff9a688696 #9 [ffffb87c4df07d98] md_update_sb at ffffffff9a6886d5 #10 [ffffb87c4df07da8] md_check_recovery at ffffffff9a68ad9c #11 [ffffb87c4df07dd0] raid1d at ffffffffc01f0375 [raid1] #12 [ffffb87c4df07ea0] md_thread at ffffffff9a680348 #13 [ffffb87c4df07f08] kthread at ffffffff9a0b8005 #14 [ffffb87c4df07f50] ret_from_fork at ffffffff9aa00344 Signed-off-by: Junxiao Bi Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c | 44 ++++++++++++++++++++++++++++++-------------- drivers/md/md.h | 8 +++++++- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/drivers/md/md.c index 07e5b67a2c48..fc33f2f8a415 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2472,6 +2472,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (sysfs_create_link(&rdev->kobj, ko, "block")) /* failure here is OK */; rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2505,7 +2509,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". 
We also need @@ -2850,7 +2858,7 @@ rewrite: goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) @@ -4103,7 +4111,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); - sysfs_notify(&mddev->kobj, NULL, "level"); + sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(mddev); rv = len; out_unlock: @@ -4856,7 +4864,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } if (err) return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -5562,6 +5570,13 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + if (mddev->gendisk) del_gendisk(mddev->gendisk); @@ -5729,6 +5744,9 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); return error; @@ -6083,7 +6101,7 @@ static int do_md_run(struct mddev *mddev) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; @@ -8802,7 +8820,7 @@ void md_do_sync(struct md_thread *thread) } else mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(mddev); update_time = jiffies; @@ -8830,7 +8848,7 @@ void md_do_sync(struct md_thread *thread) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && @@ -8937,7 +8955,7 @@ void md_do_sync(struct md_thread *thread) !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync > 3) { mddev->curr_resync_completed = mddev->curr_resync; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); @@ -9067,7 +9085,7 @@ static int remove_and_add_spares(struct mddev *mddev, } if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; @@ -9350,8 +9368,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + 
sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9441,8 +9458,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, - "unacknowledged_bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); @@ -9463,7 +9479,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -9693,7 +9709,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index c26fa8bd41e7..1e172b3e499f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,10 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - + /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_unack_badblocks; + /* handle for 'bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { @@ -420,6 +423,9 @@ struct mddev { * file in sysfs. */ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ + struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ + struct kernfs_node *sysfs_level; /*handle for 'level' */ struct work_struct del_work; /* used for delayed sysfs removal */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e45fd56cf584..2c47474fa69d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4454,7 +4454,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, sector_nr = conf->reshape_progress; if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; return sector_nr; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8dea4398b191..a4fbafa5b8f8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5799,7 +5799,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_div(sector_nr, new_data_disks); if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; retn = sector_nr; goto finish; @@ -5913,7 +5913,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } INIT_LIST_HEAD(&stripes); @@ -6020,7 +6020,7 @@ finish: conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); 
wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: return retn; -- cgit v1.2.3-58-ga151 From c9020e64cf33f2dd5b2a7295f2bfea787279218a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Jul 2020 14:57:32 -0700 Subject: md/raid5-cache: clear MD_SB_CHANGE_PENDING before flushing stripes In recovery, if we process too much data, raid5-cache may set MD_SB_CHANGE_PENDING, which causes spinning in handle_stripe(). Fix this issue by clearing the bit before flushing data only stripes. This issue was initially discussed in [1]. [1] https://www.spinics.net/lists/raid/msg64409.html Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9b6da759dca2..0bea21d81697 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2430,10 +2430,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; struct stripe_head *sh, *next; + bool cleared_pending = false; if (ctx->data_only_stripes == 0) return; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + cleared_pending = true; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + } log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { @@ -2448,6 +2453,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, atomic_read(&conf->active_stripes) == 0); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + if (cleared_pending) + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); } static int r5l_recovery_log(struct r5l_log *log) -- cgit v1.2.3-58-ga151 From 60f80d6f2d07a6d8aee485a1d1252327eeee0c81 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Thu, 9 Jul 2020 11:29:29 +0800 Subject: md-cluster: fix wild pointer of unlock_all_bitmaps() reproduction steps: ``` node1 # mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sda /dev/sdb node2 # mdadm -A /dev/md0 /dev/sda /dev/sdb node1 # mdadm -G /dev/md0 -b none mdadm: failed to remove clustered bitmap. node1 # mdadm -S --scan ^C <==== mdadm hung & kernel crash ``` kernel stack: ``` [ 335.230657] general protection fault: 0000 [#1] SMP NOPTI [...] [ 335.230848] Call Trace: [ 335.230873] ? unlock_all_bitmaps+0x5/0x70 [md_cluster] [ 335.230886] unlock_all_bitmaps+0x3d/0x70 [md_cluster] [ 335.230899] leave+0x10f/0x190 [md_cluster] [ 335.230932] ? md_super_wait+0x93/0xa0 [md_mod] [ 335.230947] ? leave+0x5/0x190 [md_cluster] [ 335.230973] md_cluster_stop+0x1a/0x30 [md_mod] [ 335.230999] md_bitmap_free+0x142/0x150 [md_mod] [ 335.231013] ? _cond_resched+0x15/0x40 [ 335.231025] ? mutex_lock+0xe/0x30 [ 335.231056] __md_stop+0x1c/0xa0 [md_mod] [ 335.231083] do_md_stop+0x160/0x580 [md_mod] [ 335.231119] ? 0xffffffffc05fb078 [ 335.231148] md_ioctl+0xa04/0x1930 [md_mod] [ 335.231165] ? filename_lookup+0xf2/0x190 [ 335.231179] blkdev_ioctl+0x93c/0xa10 [ 335.231205] ? _cond_resched+0x15/0x40 [ 335.231214] ? 
__check_object_size+0xd4/0x1a0 [ 335.231224] block_ioctl+0x39/0x40 [ 335.231243] do_vfs_ioctl+0xa0/0x680 [ 335.231253] ksys_ioctl+0x70/0x80 [ 335.231261] __x64_sys_ioctl+0x16/0x20 [ 335.231271] do_syscall_64+0x65/0x1f0 [ 335.231278] entry_SYSCALL_64_after_hwframe+0x44/0xa9 ``` Signed-off-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 813a99ffa86f..73fd50e77975 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1518,6 +1518,7 @@ static void unlock_all_bitmaps(struct mddev *mddev) } } kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; } } -- cgit v1.2.3-58-ga151 From 9f4aa52387c68049403b59939df5c0dd8e3872cc Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 14 Jul 2020 22:03:26 +0200 Subject: s390/dasd: fix inability to use DASD with DIAG driver During initialization of the DASD DIAG driver a request is issued that has a bio structure that resides on the stack. With virtually mapped kernel stacks this bio address might be in virtual storage which is unsuitable for usage with the diag250 call. In this case the device cannot be set online using the DIAG discipline and fails with -EOPNOTSUPP. In the system journal the following error message is presented: dasd: X.X.XXXX Setting the DASD online with discipline DIAG failed with rc=-95 Fix by allocating the bio structure instead of having it on the stack. Fixes: ce3dc447493f ("s390: add support for virtually mapped kernel stacks") Signed-off-by: Stefan Haberland Reviewed-by: Peter Oberparleiter Cc: stable@vger.kernel.org #4.20 Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_diag.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index facb588d09e4..069d6b39cacf 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -319,7 +319,7 @@ dasd_diag_check_device(struct dasd_device *device) struct dasd_diag_characteristics *rdc_data; struct vtoc_cms_label *label; struct dasd_block *block; - struct dasd_diag_bio bio; + struct dasd_diag_bio *bio; unsigned int sb, bsize; blocknum_t end_block; int rc; @@ -395,29 +395,36 @@ dasd_diag_check_device(struct dasd_device *device) rc = -ENOMEM; goto out; } + bio = kzalloc(sizeof(*bio), GFP_KERNEL); + if (bio == NULL) { + DBF_DEV_EVENT(DBF_WARNING, device, "%s", + "No memory to allocate initialization bio", + rc = -ENOMEM; + goto out_label; + } rc = 0; end_block = 0; /* try all sizes - needed for ECKD devices */ for (bsize = 512; bsize <= PAGE_SIZE; bsize <<= 1) { mdsk_init_io(device, bsize, 0, &end_block); - memset(&bio, 0, sizeof (struct dasd_diag_bio)); - bio.type = MDSK_READ_REQ; - bio.block_number = private->pt_block + 1; - bio.buffer = label; + memset(bio, 0, sizeof(*bio)); + bio->type = MDSK_READ_REQ; + bio->block_number = private->pt_block + 1; + bio->buffer = label; memset(&private->iob, 0, sizeof (struct dasd_diag_rw_io)); private->iob.dev_nr = rdc_data->dev_nr; private->iob.key = 0; private->iob.flags = 0; /* do synchronous io */ private->iob.block_count = 1; private->iob.interrupt_params = 0; - private->iob.bio_list = &bio; + private->iob.bio_list = bio; private->iob.flaga = DASD_DIAG_FLAGA_DEFAULT; rc = dia250(&private->iob, RW_BIO); if (rc == 3) { pr_warn("%s: A 64-bit DIAG call failed\n", dev_name(&device->cdev->dev)); rc = -EOPNOTSUPP; - goto out_label; + goto out_bio; }
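/*
 * A condensed sketch of the fix above; the helper name is hypothetical.
 * diag250 passes the control block address to the hypervisor, and with
 * CONFIG_VMAP_STACK the on-stack bio may live in vmalloc space, which
 * is not in the kernel's linear mapping.  Memory from kzalloc() is, and
 * is therefore safe to hand over:
 */
static int dia250_with_heap_bio(struct dasd_diag_private *private)
{
	struct dasd_diag_bio *bio = kzalloc(sizeof(*bio), GFP_KERNEL);
	int rc;

	if (!bio)
		return -ENOMEM;
	private->iob.bio_list = bio;	/* address seen by the hypervisor */
	rc = dia250(&private->iob, RW_BIO);
	kfree(bio);
	return rc;
}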
mdsk_term_io(device); if (rc == 0) @@ -427,7 +434,7 @@ dasd_diag_check_device(struct dasd_device *device) pr_warn("%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n", dev_name(&device->cdev->dev), rc); rc = -EIO; - goto out_label; + goto out_bio; } /* check for label block */ if (memcmp(label->label_id, DASD_DIAG_CMS1, @@ -457,6 +464,8 @@ dasd_diag_check_device(struct dasd_device *device) (rc == 4) ? ", read-only device" : ""); rc = 0; } +out_bio: + kfree(bio); out_label: free_page((long) label); out: -- cgit v1.2.3-58-ga151 From 10321aa17ae9ad9b1335d61aeabb6f63d38cc28e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 14 Jul 2020 22:03:27 +0200 Subject: s390/dasd: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. Also, remove unnecessary variable _datasize_. This code was detected with the help of Coccinelle, and audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_diag.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 069d6b39cacf..1b9e1442e6a5 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -515,7 +515,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, struct req_iterator iter; struct bio_vec bv; char *dst; - unsigned int count, datasize; + unsigned int count; sector_t recid, first_rec, last_rec; unsigned int blksize, off; unsigned char rw_cmd; @@ -543,10 +543,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, if (count != last_rec - first_rec + 1) return ERR_PTR(-EINVAL); /* Build the request */ - datasize = sizeof(struct dasd_diag_req) + - count*sizeof(struct dasd_diag_bio); - cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev, - blk_mq_rq_to_pdu(req)); + cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, struct_size(dreq, bio, count), + memdev, blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; -- cgit v1.2.3-58-ga151 From e15864f8ea05b24071b07300459ae7e511d0b938 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:23 +0200 Subject: block: add max_open_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_open_zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max open zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_open_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 15 +++++++++++++++ drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 4 ++++ include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 61 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ed8c14f161ee..f151d9cf90de 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block/<disk>/queue/max_open_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block/<disk>/queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 6a8513af9201..f01cf8530ae4 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_open_zones (RO) +------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. +If this value is 0, there is no limit. + max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index be67952e7be2..414f04579d77 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -306,6 +306,11 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_nr_zones(q), page); } +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_open_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -668,6 +673,11 @@ static struct queue_sysfs_entry queue_nr_zones_entry = { .show = queue_nr_zones_show, }; +static struct queue_sysfs_entry queue_max_open_zones_entry = { + .attr = {.name = "max_open_zones", .mode = 0444 }, + .show = queue_max_open_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -766,6 +776,7 @@ static struct attribute *queue_attrs[] = { &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -793,6 +804,10 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; + if (attr == &queue_max_open_zones_entry.attr && + !blk_queue_is_zoned(q)) + return 0; + return attr->mode; } diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 04e5b991c00c..3d80b9cf6bfc 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,6 +96,7 @@ int
nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 183a20720da9..aa3564139b40 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -717,6 +717,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (sdkp->zones_max_open == U32_MAX) + blk_queue_max_open_zones(q, 0); + else + blk_queue_max_open_zones(q, sdkp->zones_max_open); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index de7adc59b993..c8beb8bbdb08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -513,6 +513,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int max_open_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -722,6 +723,17 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return true; return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); } + +static inline void blk_queue_max_open_zones(struct request_queue *q, + unsigned int max_open_zones) +{ + q->max_open_zones = max_open_zones; +} + +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return q->max_open_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ -737,6 +749,10 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q, { return 0; } +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1519,6 +1535,15 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_open_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_open_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3-58-ga151 From 659bf827ba8f1183b714341d8a1d4b1e446178d9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:24 +0200 Subject: block: add max_active_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_active_zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max_active_zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_active_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. For SCSI devices, even though max active zones is not part of the ZBC/ZAC spec, export max_active_zones as 0, signifying "no limit".
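The NVMe identify fields involved are 0's based, which is why the driver-side hunks add one before exporting them; in sketch form, combining the zns.c lines from this patch and the preceding max_open_zones patch:

```c
/* ZNS Identify: MOR/MAR are 0's based, the queue stores plain counts */
blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
```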
Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 14 +++++++++++++- drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 1 + include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index f151d9cf90de..2322eb748b38 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block/<disk>/queue/max_active_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block/<disk>/queue/max_open_zones Date: July 2020 Contact: Niklas Cassel diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index f01cf8530ae4..f261a5c84170 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_active_zones (RO) +--------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. +If this value is 0, there is no limit.
+ max_open_zones (RO) ------------------- For zoned block devices (zoned attribute indicating "host-managed" or diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 414f04579d77..7dda709f3ccb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -311,6 +311,11 @@ static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) return queue_var_show(queue_max_open_zones(q), page); } +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_active_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -678,6 +683,11 @@ static struct queue_sysfs_entry queue_max_open_zones_entry = { .show = queue_max_open_zones_show, }; +static struct queue_sysfs_entry queue_max_active_zones_entry = { + .attr = {.name = "max_active_zones", .mode = 0444 }, + .show = queue_max_active_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -777,6 +787,7 @@ static struct attribute *queue_attrs[] = { &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -804,7 +815,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; - if (attr == &queue_max_open_zones_entry.attr && + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 3d80b9cf6bfc..57cfd78731fb 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -97,6 +97,7 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index aa3564139b40..d8b2c49d645b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -721,6 +721,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_open_zones(q, 0); else blk_queue_max_open_zones(q, sdkp->zones_max_open); + blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8beb8bbdb08..285b59cfc064 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -514,6 +514,7 @@ struct request_queue { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int max_open_zones; + unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -734,6 +735,17 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return q->max_open_zones; } + +static inline void blk_queue_max_active_zones(struct request_queue *q, + unsigned int max_active_zones) +{ + q->max_active_zones = max_active_zones; +} + +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return q->max_active_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ 
-753,6 +765,10 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return 0; } +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1544,6 +1560,15 @@ static inline unsigned int bdev_max_open_zones(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_active_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_active_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3-58-ga151
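Taken together, these two patches let in-kernel consumers query both limits through one interface. A minimal consumer sketch under the documented convention that 0 means "no limit"; the helper below is hypothetical, but bdev_max_open_zones() and bdev_max_active_zones() are the accessors the patches add:

```c
#include <linux/blkdev.h>

/* Hypothetical: derive a conservative zone budget for a zoned bdev. */
static unsigned int zone_budget(struct block_device *bdev)
{
	unsigned int open = bdev_max_open_zones(bdev);
	unsigned int active = bdev_max_active_zones(bdev);

	if (!open)		/* 0: no open-zone limit advertised */
		open = UINT_MAX;
	if (!active)		/* 0: no active-zone limit advertised */
		active = UINT_MAX;
	return min(open, active);
}
```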