diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 10:51:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 10:51:40 -0700 |
commit | e0fc99e21e6e299673f1640105ac0c1d829c2d93 (patch) | |
tree | 4637b171164c3e653e9002b2bc962b32f05b5a07 | |
parent | 4834ce9d8e074bb7ae197632e0708219b9f389b5 (diff) | |
parent | f59589fc89665102923725e80e12f782d5f74f67 (diff) |
Merge tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
- NVMe:
- ZNS support (Aravind, Keith, Matias, Niklas)
- Misc cleanups, optimizations, fixes (Baolin, Chaitanya, David,
Dongli, Max, Sagi)
- null_blk zone capacity support (Aravind)
- MD:
- raid5/6 fixes (ChangSyun)
- Warning fixes (Damien)
- raid5 stripe fixes (Guoqing, Song, Yufen)
- sysfs deadlock fix (Junxiao)
- raid10 deadlock fix (Vitaly)
- struct_size conversions (Gustavo)
- Set of bcache updates/fixes (Coly)
* tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block: (117 commits)
md/raid5: Allow degraded raid6 to do rmw
md/raid5: Fix Force reconstruct-write io stuck in degraded raid5
raid5: don't duplicate code for different paths in handle_stripe
raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show
md: print errno in super_written
md/raid5: remove the redundant setting of STRIPE_HANDLE
md: register new md sysfs file 'uuid' read-only
md: fix max sectors calculation for super 1.0
nvme-loop: remove extra variable in create ctrl
nvme-loop: set ctrl state connecting after init
nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
nvme-multipath: fix logic for non-optimized paths
nvme-rdma: fix controller reset hang during traffic
nvme-tcp: fix controller reset hang during traffic
nvmet: introduce the passthru Kconfig option
nvmet: introduce the passthru configfs interface
nvmet: Add passthru enable/disable helpers
nvmet: add passthru code to process commands
nvme: export nvme_find_get_ns() and nvme_put_ns()
nvme: introduce nvme_ctrl_get_by_path()
...
70 files changed, 3149 insertions, 826 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ed8c14f161ee..2322eb748b38 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,24 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block/<disk>/queue/max_active_zones +Date: July 2020 +Contact: Niklas Cassel <niklas.cassel@wdc.com> +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, + is limited by this value. If this value is 0, there is no limit. + +What: /sys/block/<disk>/queue/max_open_zones +Date: July 2020 +Contact: Niklas Cassel <niklas.cassel@wdc.com> +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block/<disk>/queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke <hare@suse.com> diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index d973d469ffc4..cc8781b96b4d 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -426,6 +426,10 @@ All md devices contain: The accepted values when writing to this file are ``ppl`` and ``resync``, used to enable and disable PPL. + uuid + This indicates the UUID of the array in the following format: + xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + As component devices are added to an md array, they appear in the ``md`` directory as new directories named:: diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 6a8513af9201..f261a5c84170 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,20 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_active_zones (RO) +--------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. +If this value is 0, there is no limit. + +max_open_zones (RO) +------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. +If this value is 0, there is no limit. + max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..bbad5e8bbffe 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -86,9 +86,10 @@ config BLK_DEV_ZONED select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index be67952e7be2..7dda709f3ccb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -306,6 +306,16 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_nr_zones(q), page); } +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_open_zones(q), page); +} + +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_active_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -668,6 +678,16 @@ static struct queue_sysfs_entry queue_nr_zones_entry = { .show = queue_nr_zones_show, }; +static struct queue_sysfs_entry queue_max_open_zones_entry = { + .attr = {.name = "max_open_zones", .mode = 0444 }, + .show = queue_max_open_zones_show, +}; + +static struct queue_sysfs_entry queue_max_active_zones_entry = { + .attr = {.name = "max_active_zones", .mode = 0444 }, + .show = queue_max_active_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -766,6 +786,8 @@ static struct attribute *queue_attrs[] = { &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -793,6 +815,11 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && + !blk_queue_is_zoned(q)) + return 0; + return attr->mode; } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..81152a260354 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 6941062272e0..d04de10a63e4 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -45,6 +45,9 @@ static const guid_t prp_guids[] = { /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), + /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ + GUID_INIT(0x5025030f, 0x842f, 0x4ab4, + 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 81b311c9d781..daed4a9c3436 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -49,6 +49,7 @@ struct nullb_device { unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 907c6858aec0..47a9dad880af 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); + static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); @@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) @@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_nr_conv, NULL, }; @@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n"); + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void) dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; + dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; return dev; } diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cc47606d8ffe..3d25c9ad2383 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } + if (!dev->zone_capacity) + dev->zone_capacity = dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; dev->nr_zones = dev_size >> (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); @@ -47,6 +56,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->wp = zone->start + zone->len; zone->type = BLK_ZONE_TYPE_CONVENTIONAL; zone->cond = BLK_ZONE_COND_NOT_WP; @@ -59,6 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; + zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; @@ -185,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_IOERR; } + if (zone->wp + nr_sectors > zone->start + zone->capacity) + return BLK_STS_IOERR; + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; @@ -193,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return ret; zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->len) + if (zone->wp == zone->start + zone->capacity) zone->cond = BLK_ZONE_COND_FULL; return BLK_STS_OK; default: diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 10f6368117d8..01333af13bbb 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -562,13 +562,15 @@ static int rsxx_eeh_frozen(struct pci_dev *dev) for (i = 0; i < card->n_targets; i++) { if (card->ctrl[i].status.buf) - pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); + dma_free_coherent(&card->dev->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); if (card->ctrl[i].cmd.buf) - pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); + dma_free_coherent(&card->dev->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); } return 0; @@ -711,15 +713,15 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) failed_hw_buffers_init: for (i = 0; i < card->n_targets; i++) { if (card->ctrl[i].status.buf) - pci_free_consistent(card->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); + dma_free_coherent(&card->dev->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); if (card->ctrl[i].cmd.buf) - pci_free_consistent(card->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); + dma_free_coherent(&card->dev->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); } failed_hw_setup: rsxx_eeh_failure(dev); diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index bf7dd96db9b3..d1ca4d059c20 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -27,7 +27,7 @@ config BCACHE_CLOSURES_DEBUG interface to list them, which makes it possible to see asynchronous operations that get stuck. -config BCACHE_ASYNC_REGISTRAION +config BCACHE_ASYNC_REGISTRATION bool "Asynchronous device registration (EXPERIMENTAL)" depends on BCACHE help diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index fd714628da6a..5b87e59676b8 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o + util.o writeback.o features.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..52035a78d836 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; unsigned int i; int r; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3c708e8b5e2d..4fd03d2496d8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -264,7 +264,7 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned int nr_stripes; + int nr_stripes; unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -762,11 +762,32 @@ struct bbio { #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define prios_per_bucket(c) \ - ((bucket_bytes(c) - sizeof(struct prio_set)) / \ +static inline unsigned int meta_bucket_pages(struct cache_sb *sb) +{ + unsigned int n, max_pages; + + max_pages = min_t(unsigned int, + __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS, + MAX_ORDER_NR_PAGES); + + n = sb->bucket_size / PAGE_SECTORS; + if (n > max_pages) + n = max_pages; + + return n; +} + +static inline unsigned int meta_bucket_bytes(struct cache_sb *sb) +{ + return meta_bucket_pages(sb) << PAGE_SHIFT; +} + +#define prios_per_bucket(ca) \ + ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) -#define prio_buckets(c) \ - DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +#define prio_buckets(ca) \ + DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca)) static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 4995fcaefe29..67a2c47f4201 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b, b->page_order = page_order; - t->data = (void *) __get_free_pages(gfp, b->page_order); + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); if (!t->data) goto err; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d5c51e332046..3d8bd0692af3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb))); #endif list_splice(&c->btree_cache_freeable, @@ -785,7 +785,15 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb))); + if (!c->verify_ondisk) { + /* + * Don't worry about the mca_rereserve buckets + * allocated in previous for-loop, they will be + * handled properly in bch_cache_set_unregister(). + */ + return -ENOMEM; + } c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..4442df48d28c --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li <colyli@suse.de> + * + */ +#include <linux/bcache.h> +#include "bcache.h" +#include "features.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + "large_bucket"}, + {0, 0, 0 }, +}; + +#define compose_feature_string(type) \ +({ \ + struct feature *f; \ + bool first = true; \ + \ + for (f = &feature_list[0]; f->compat != 0; f++) { \ + if (f->compat != BCH_FEATURE_ ## type) \ + continue; \ + if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \ + if (first) { \ + out += snprintf(out, buf + size - out, \ + "["); \ + } else { \ + out += snprintf(out, buf + size - out, \ + " ["); \ + } \ + } else if (!first) { \ + out += snprintf(out, buf + size - out, " "); \ + } \ + \ + out += snprintf(out, buf + size - out, "%s", f->string);\ + \ + if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \ + out += snprintf(out, buf + size - out, "]"); \ + \ + first = false; \ + } \ + if (!first) \ + out += snprintf(out, buf + size - out, "\n"); \ +}) + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(RO_COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(INCOMPAT); + return out - buf; +} diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..a1653c478041 --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include <linux/bcache.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +/* Feature set definition */ +/* Incompat feature set */ +#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ + +#define BCH_FEATURE_COMPAT_SUUP 0 +#define BCH_FEATURE_RO_COMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size); + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index b25ee33b0d0b..a14a445618b4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, bio->bi_inline_vecs, bucket_pages(c)); + bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb)); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 90aac4e2333f..77fbfd52edcf 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -217,10 +217,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) */ pr_debug("falling back to linear search\n"); - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); - l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, - l + 1)) + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) if (read_bucket(l)) goto bsearch; @@ -999,8 +996,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[1].c = c; if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7891fb512736..5872d6470470 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + unsigned long sectors_to_move = 0; + unsigned long reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a190bf47076d..c7cadaafa947 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -668,7 +668,9 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - bio_end_io_acct(s->orig_bio, s->start_time); + /* Count on bcache device */ + disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time); + trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -728,8 +730,8 @@ static inline struct search *search_alloc(struct bio *bio, s->recoverable = 1; s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; - s->start_time = bio_start_io_acct(bio); - + /* Count on the bcache device */ + s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -1080,7 +1082,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - bio_end_io_acct(bio, ddip->start_time); + /* Count on the bcache device */ + disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1105,7 +1108,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) */ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ddip->d = d; - ddip->start_time = bio_start_io_acct(bio); + /* Count on the bcache device */ + ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9e45faa054b6..1bbdc410ee3c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,6 +13,7 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -59,6 +60,92 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && + bch_has_feature_large_bucket(sb)) + bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + + return bucket_size; +} + +static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, + struct cache_sb_disk *s) +{ + const char *err; + unsigned int i; + + sb->first_bucket= le16_to_cpu(s->first_bucket); + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->bucket_size = get_bucket_size(sb, s); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block size (not power of 2)"; + if (!is_power_of_2(sb->block_size)) + goto err; + + err = "Bad block size (larger than page size)"; + if (sb->block_size > PAGE_SECTORS) + goto err; + + err = "Bad bucket size (not power of 2)"; + if (!is_power_of_2(sb->bucket_size)) + goto err; + + err = "Bad bucket size (smaller than page size)"; + if (sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + err = NULL; +err: + return err; +} + + static const char *read_super(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk **res) { @@ -84,7 +171,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->flags = le64_to_cpu(s->flags); sb->seq = le64_to_cpu(s->seq); sb->last_mount = le32_to_cpu(s->last_mount); - sb->first_bucket = le16_to_cpu(s->first_bucket); sb->keys = le16_to_cpu(s->keys); for (i = 0; i < SB_JOURNAL_BUCKETS; i++) @@ -101,10 +187,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (memcmp(sb->magic, bcache_magic, 16)) goto err; - err = "Too many journal buckets"; - if (sb->keys > SB_JOURNAL_BUCKETS) - goto err; - err = "Bad checksum"; if (s->csum != csum_set(s)) goto err; @@ -124,6 +206,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -133,55 +216,21 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, break; case BCACHE_SB_VERSION_CDEV: case BCACHE_SB_VERSION_CDEV_WITH_UUID: - sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); - - sb->nr_in_set = le16_to_cpu(s->nr_in_set); - sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); - - err = "Too many buckets"; - if (sb->nbuckets > LONG_MAX) - goto err; - - err = "Not enough buckets"; - if (sb->nbuckets < 1 << 7) - goto err; - - err = "Bad block/bucket size"; - if (!is_power_of_2(sb->block_size) || - sb->block_size > PAGE_SECTORS || - !is_power_of_2(sb->bucket_size) || - sb->bucket_size < PAGE_SECTORS) - goto err; - - err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < - sb->bucket_size * sb->nbuckets) - goto err; - - err = "Bad UUID"; - if (bch_is_zero(sb->set_uuid, 16)) - goto err; - - err = "Bad cache device number in set"; - if (!sb->nr_in_set || - sb->nr_in_set <= sb->nr_this_dev || - sb->nr_in_set > MAX_CACHES_PER_SET) - goto err; - - err = "Journal buckets not sequential"; - for (i = 0; i < sb->keys; i++) - if (sb->d[i] != sb->first_bucket + i) - goto err; - - err = "Too many journal buckets"; - if (sb->first_bucket + sb->keys > sb->nbuckets) + err = read_super_common(sb, bdev, s); + if (err) goto err; - - err = "Invalid superblock: first bucket comes before end of super"; - if (sb->first_bucket * sb->bucket_size < 16) + break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + err = read_super_common(sb, bdev, s); + if (err) goto err; - break; default: err = "Unsupported superblock version"; @@ -217,7 +266,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -233,6 +281,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); pr_debug("ver %llu, flags %llu, seq %llu\n", @@ -289,17 +344,20 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned int i; + unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); c->sb.seq++; + if (c->sb.version > version) + version = c->sb.version; + for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.version = version; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -423,6 +481,7 @@ static int __uuid_write(struct cache_set *c) BKEY_PADDED(key) k; struct closure cl; struct cache *ca; + unsigned int size; closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); @@ -430,7 +489,8 @@ static int __uuid_write(struct cache_set *c) if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) return 1; - SET_KEY_SIZE(&k.key, c->sb.bucket_size); + size = meta_bucket_pages(&c->sb) * PAGE_SECTORS; + SET_KEY_SIZE(&k.key, size); uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); closure_sync(&cl); @@ -518,7 +578,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; bio_set_dev(bio, ca->bdev); - bio->bi_iter.bi_size = bucket_bytes(ca); + bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb); bio->bi_end_io = prio_endio; bio->bi_private = ca; @@ -576,7 +636,7 @@ int bch_prio_write(struct cache *ca, bool wait) p->next_bucket = ca->prio_buckets[i + 1]; p->magic = pset_magic(&ca->sb); - p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8); bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); @@ -629,7 +689,7 @@ static int prio_read(struct cache *ca, uint64_t bucket) prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != - bch_crc64(&p->magic, bucket_bytes(ca) - 8)) { + bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) { pr_warn("bad csum reading priorities\n"); goto out; } @@ -835,19 +895,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); - size_t n; + uint64_t n; int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; - d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - - if (!d->nr_stripes || d->nr_stripes > max_stripes) { - pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n", - (unsigned int)d->nr_stripes); + n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); + if (!n || n > max_stripes) { + pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n", + n); return -ENOMEM; } + d->nr_stripes = n; n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL); @@ -1620,7 +1680,7 @@ static void cache_set_free(struct closure *cl) } bch_bset_sort_state_free(&c->sort); - free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb))); if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); @@ -1783,7 +1843,10 @@ void bch_cache_set_unregister(struct cache_set *c) } #define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) + +#define alloc_meta_bucket_pages(gfp, sb) \ + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { @@ -1814,12 +1877,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sb.bucket_size = sb->bucket_size; c->sb.nr_in_set = sb->nr_in_set; c->sb.last_mount = sb->last_mount; + c->sb.version = sb->version; + if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + c->sb.feature_compat = sb->feature_compat; + c->sb.feature_ro_compat = sb->feature_ro_compat; + c->sb.feature_incompat = sb->feature_incompat; + } + c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); - c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry); c->devices_max_used = 0; atomic_set(&c->attached_dev_nr, 0); - c->btree_pages = bucket_pages(c); + c->btree_pages = meta_bucket_pages(&c->sb); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); @@ -1845,24 +1915,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = (sb->bucket_size / sb->block_size + 1) * + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * sizeof(struct btree_iter_set); - if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) || - mempool_init_slab_pool(&c->search, 32, bch_search_cache) || - mempool_init_kmalloc_pool(&c->bio_meta, 2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c)) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || - !(c->moving_gc_wq = alloc_workqueue("bcache_gc", - WQ_MEM_RECLAIM, 0)) || - bch_journal_alloc(c) || - bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c) || - bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) + c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); + if (!c->devices) + goto err; + + if (mempool_init_slab_pool(&c->search, 32, bch_search_cache)) + goto err; + + if (mempool_init_kmalloc_pool(&c->bio_meta, 2, + sizeof(struct bbio) + + sizeof(struct bio_vec) * meta_bucket_pages(&c->sb))) + goto err; + + if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) + goto err; + + if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto err; + + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb); + if (!c->uuids) + goto err; + + c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + if (!c->moving_gc_wq) + goto err; + + if (bch_journal_alloc(c)) + goto err; + + if (bch_btree_cache_alloc(c)) + goto err; + + if (bch_open_buckets_alloc(c)) + goto err; + + if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -2107,7 +2199,14 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - if (ca->sb.seq > c->sb.seq) { + /* + * A special case is both ca->sb.seq and c->sb.seq are 0, + * such condition happens on a new created cache device whose + * super block is never flushed yet. In this case c->sb.version + * and other members should be updated too, otherwise we will + * have a mistaken super block version in cache set. + */ + if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { c->sb.version = ca->sb.version; memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); c->sb.flags = ca->sb.flags; @@ -2145,7 +2244,7 @@ void bch_cache_release(struct kobject *kobj) ca->set->cache[ca->sb.nr_this_dev] = NULL; } - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -2242,7 +2341,7 @@ static int cache_alloc(struct cache *ca) goto err_prio_buckets_alloc; } - ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca); + ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb); if (!ca->disk_buckets) { err = "ca->disk_buckets alloc failed"; goto err_disk_buckets_alloc; @@ -2789,7 +2888,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, -#ifdef CONFIG_BCACHE_ASYNC_REGISTRAION +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION &ksysfs_register_async.attr, #endif &ksysfs_pendings_cleanup.attr, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0dadec5a78f6..ac06c0bc3c0a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -11,6 +11,7 @@ #include "btree.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> #include <linux/sort.h> @@ -88,6 +89,9 @@ read_attribute(btree_used_percent); read_attribute(average_key_size); read_attribute(dirty_data); read_attribute(bset_tree_stats); +read_attribute(feature_compat); +read_attribute(feature_ro_compat); +read_attribute(feature_incompat); read_attribute(state); read_attribute(cache_read_races); @@ -779,6 +783,13 @@ SHOW(__bch_cache_set) if (attr == &sysfs_bset_tree_stats) return bch_bset_print_stats(c, buf); + if (attr == &sysfs_feature_compat) + return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_ro_compat) + return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_incompat) + return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE); + return 0; } SHOW_LOCKED(bch_cache_set) @@ -987,6 +998,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_io_disable, &sysfs_cutoff_writeback, &sysfs_cutoff_writeback_sync, + &sysfs_feature_compat, + &sysfs_feature_ro_compat, + &sysfs_feature_incompat, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 1cf1e5016cb9..4f4ad6b3d43a 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -459,10 +459,8 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(sizeof(struct dirty_io) + - sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), - PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -523,15 +521,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned int stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, sectors_dirty; + int stripe; if (!d) return; + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -571,12 +573,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned int start_stripe, stripe, next_stripe; + unsigned int start_stripe, next_stripe; + int stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - - if (stripe >= dc->disk.nr_stripes) + if (stripe < 0) stripe = 0; start_stripe = stripe; @@ -825,10 +827,8 @@ static int bch_dirty_init_thread(void *arg) struct btree_iter iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; - int i; k = p = NULL; - i = 0; cur_idx = prev_idx = 0; bch_btree_iter_init(&c->root->keys, &iter, NULL); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index b029843ce5b6..3f1230e22de0 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -52,10 +52,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline unsigned int offset_to_stripe(struct bcache_device *d, +static inline int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); + + /* d->nr_stripes is in range [1, INT_MAX] */ + if (unlikely(offset >= d->nr_stripes)) { + pr_err("Invalid stripe %llu (>= nr_stripes %d).\n", + offset, d->nr_stripes); + return -EINVAL; + } + + /* + * Here offset is definitly smaller than INT_MAX, + * return it as int will never overflow. + */ return offset; } @@ -63,7 +75,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned int nr_sectors) { - unsigned int stripe = offset_to_stripe(&dc->disk, offset); + int stripe = offset_to_stripe(&dc->disk, offset); + + if (stripe < 0) + return false; while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 813a99ffa86f..73fd50e77975 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1518,6 +1518,7 @@ static void unlock_all_bitmaps(struct mddev *mddev) } } kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; } } diff --git a/drivers/md/md.c b/drivers/md/md.c index 96b28f6d025c..153bc766bc5a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -101,6 +101,8 @@ static void mddev_detach(struct mddev *mddev); * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. @@ -463,24 +465,46 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); +struct md_io { + struct mddev *mddev; + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + unsigned long start_time; +}; + +static void md_end_io(struct bio *bio) +{ + struct md_io *md_io = bio->bi_private; + struct mddev *mddev = md_io->mddev; + + disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time); + + bio->bi_end_io = md_io->orig_bi_end_io; + bio->bi_private = md_io->orig_bi_private; + + mempool_free(md_io, &mddev->md_io_pool); + + if (bio->bi_end_io) + bio->bi_end_io(bio); +} + static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = bio->bi_disk->private_data; - unsigned int sectors; - if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } - blk_queue_split(&bio); - - if (mddev == NULL || mddev->pers == NULL) { + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return BLK_QC_T_NONE; } + + blk_queue_split(&bio); + if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; @@ -488,21 +512,27 @@ static blk_qc_t md_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - /* - * save the sectors now since our bio can - * go away inside make_request - */ - sectors = bio_sectors(bio); + if (bio->bi_end_io != md_end_io) { + struct md_io *md_io; + + md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); + md_io->mddev = mddev; + md_io->orig_bi_end_io = bio->bi_end_io; + md_io->orig_bi_private = bio->bi_private; + + bio->bi_end_io = md_end_io; + bio->bi_private = md_io; + + md_io->start_time = disk_start_io_acct(mddev->gendisk, + bio_sectors(bio), + bio_op(bio)); + } + /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); - part_stat_lock(); - part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); - part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); - part_stat_unlock(); - return BLK_QC_T_NONE; } @@ -928,7 +958,8 @@ static void super_written(struct bio *bio) struct mddev *mddev = rdev->mddev; if (bio->bi_status) { - pr_err("md: super_written gets error=%d\n", bio->bi_status); + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -2143,6 +2174,24 @@ retry: sb->sb_csum = calc_sb_1_csum(sb); } +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { @@ -2163,13 +2212,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; } else { /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sector_t sb_start, bm_space; + sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; + + bm_space = super_1_choose_bm_space(dev_size); + + /* Space that can be used to store date needs to decrease + * superblock bitmap space and bad block space(4K) + */ + max_sectors = sb_start - bm_space - 4*2; + if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); @@ -2421,9 +2479,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, ko, "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2457,7 +2519,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need @@ -2802,7 +2868,7 @@ rewrite: goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) @@ -3182,8 +3248,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -4055,7 +4121,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); - sysfs_notify(&mddev->kobj, NULL, "level"); + sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(mddev); rv = len; out_unlock: @@ -4168,6 +4234,14 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && @@ -4808,7 +4882,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } if (err) return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -5423,6 +5497,7 @@ static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, + &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, @@ -5514,6 +5589,13 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + if (mddev->gendisk) del_gendisk(mddev->gendisk); @@ -5525,6 +5607,7 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + mempool_exit(&mddev->md_io_pool); kfree(mddev); } @@ -5620,6 +5703,11 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; + error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE, + sizeof(struct md_io)); + if (error) + goto abort; + error = -ENOMEM; mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) @@ -5676,6 +5764,9 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); return error; @@ -5961,7 +6052,7 @@ int md_run(struct mddev *mddev) if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -6028,7 +6119,7 @@ static int do_md_run(struct mddev *mddev) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; @@ -7339,6 +7430,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev_suspend(mddev); md_bitmap_destroy(mddev); @@ -8330,6 +8423,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { + int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); @@ -8341,7 +8435,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes) } spin_unlock(&pers_lock); - return md_cluster_ops->join(mddev, nodes); + ret = md_cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; } void md_cluster_stop(struct mddev *mddev) @@ -8742,7 +8839,7 @@ void md_do_sync(struct md_thread *thread) } else mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(mddev); update_time = jiffies; @@ -8770,7 +8867,7 @@ void md_do_sync(struct md_thread *thread) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && @@ -8877,7 +8974,7 @@ void md_do_sync(struct md_thread *thread) !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync > 3) { mddev->curr_resync_completed = mddev->curr_resync; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); @@ -9007,7 +9104,7 @@ static int remove_and_add_spares(struct mddev *mddev, } if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; @@ -9035,8 +9132,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->recovery_offset = 0; } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(mddev); @@ -9290,8 +9387,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9381,8 +9477,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, - "unacknowledged_bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); @@ -9403,7 +9498,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -9633,7 +9728,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index e2f1ad9afc48..f79b5b4101ef 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,10 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - + /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_unack_badblocks; + /* handle for 'bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { @@ -420,6 +423,9 @@ struct mddev { * file in sysfs. */ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ + struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ + struct kernfs_node *sysfs_level; /*handle for 'level' */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -481,6 +487,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ + mempool_t md_io_pool; /* Generic flush handling. * The last to finish preflush schedules a worker to submit diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 353288bc4cb7..e8fa32733917 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -955,6 +955,7 @@ static void wait_barrier(struct r10conf *conf) { spin_lock_irq(&conf->resync_lock); if (conf->barrier) { + struct bio_list *bio_list = current->bio_list; conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending @@ -969,9 +970,16 @@ static void wait_barrier(struct r10conf *conf) wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (atomic_read(&conf->nr_pending) && - current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1]))), + bio_list && + (!bio_list_empty(&bio_list[0]) || + !bio_list_empty(&bio_list[1]))) || + /* move on if recovery thread is + * blocked by us + */ + (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, + &conf->mddev->recovery) && + conf->nr_queued > 0), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -4282,8 +4290,8 @@ out: else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->prev.raid_disks && !test_bit(Faulty, &rdev->flags)) { @@ -4429,7 +4437,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, sector_nr = conf->reshape_progress; if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; return sector_nr; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9b6da759dca2..4337ae0e6af2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -195,9 +195,7 @@ struct r5l_log { static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { - sector_t offset; - - offset = sector_div(sect, conf->chunk_sectors); + sector_div(sect, conf->chunk_sectors); return sect; } @@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; @@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); } @@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), - conf->chunk_sectors >> STRIPE_SHIFT)) + conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) r5l_wake_reclaim(conf->log, 0); } @@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; struct stripe_head *sh, *next; + bool cleared_pending = false; if (ctx->data_only_stripes == 0) return; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + cleared_pending = true; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + } log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { @@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, atomic_read(&conf->active_stripes) == 0); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + if (cleared_pending) + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); } static int r5l_recovery_log(struct r5l_log *log) @@ -2532,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - ret = mddev_lock(mddev); - if (ret) - return ret; - + spin_lock(&mddev->lock); conf = mddev->private; if (!conf || !conf->log) { - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index a750f4bbb5d9..d0f540296fe9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) * be just after the last logged stripe and write to the same * disks. Use bit shift and logarithm to avoid 64-bit division. */ - if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && + if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) && (data_sector >> ilog2(conf->chunk_sectors) == data_sector_last >> ilog2(conf->chunk_sectors)) && ((data_sector - data_sector_last) * data_disks == @@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, /* if start and end is 4k aligned, use a 4k block */ if (block_size == 512 && - (r_sector_first & (STRIPE_SECTORS - 1)) == 0 && - (r_sector_last & (STRIPE_SECTORS - 1)) == 0) - block_size = STRIPE_SIZE; + (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 && + (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0) + block_size = RAID5_STRIPE_SIZE(conf); /* iterate through blocks in strip */ for (i = 0; i < strip_sectors; i += (block_size >> 9)) { @@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); if (ppl_data_sectors > 0) - ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); + ppl_data_sectors = rounddown(ppl_data_sectors, + RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private)); if (ppl_data_sectors <= 0) { pr_warn("md/raid:%s: PPL space too small on %s\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index aa6dd1381edf..fb8d1fb14088 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq; static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; return &conf->stripe_hashtbl[hash]; } -static inline int stripe_hash_locks_hash(sector_t sect) +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) { - return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; + return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; } static inline void lock_device_hash_lock(struct r5conf *conf, int hash) @@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; - int hash = stripe_hash_locks_hash(sector); + int hash = stripe_hash_locks_hash(conf, sector); int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) return; - head_sector = sh->sector - STRIPE_SECTORS; + head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(head_sector); + hash = stripe_hash_locks_hash(conf, head_sector); spin_lock_irq(conf->hash_locks + hash); head = __find_stripe(conf, head_sector, conf->generation); if (head && !atomic_inc_not_zero(&head->count)) { @@ -1057,7 +1057,7 @@ again: test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (!bad) break; @@ -1089,7 +1089,7 @@ again: if (rdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1129,9 +1129,9 @@ again: else sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); bi->bi_io_vec[0].bv_offset = 0; - bi->bi_iter.bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; @@ -1156,7 +1156,7 @@ again: if (rrdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1183,9 +1183,9 @@ again: WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_iter.bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* @@ -1235,6 +1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; + struct r5conf *conf = sh->raid_conf; if (bio->bi_iter.bi_sector >= sector) page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; @@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, len -= b_offset; } - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; + if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) + clen = RAID5_STRIPE_SIZE(conf) - page_offset; else clen = len; @@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, b_offset += bvl.bv_offset; bio_page = bvl.bv_page; if (frombio) { - if (sh->raid_conf->skip_copy && + if (conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE && + clen == RAID5_STRIPE_SIZE(conf) && !no_skipcopy) *page = bio_page; else @@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; int i; + struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref) rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - rbi2 = r5_next_bio(rbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + rbi2 = r5_next_bio(conf, rbi, dev->sector); bio_endio(rbi); rbi = rbi2; } @@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh) struct dma_async_tx_descriptor *tx = NULL; struct async_submit_ctl submit; int i; + struct r5conf *conf = sh->raid_conf; BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, @@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh) dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, dev->sector, tx, sh, 0); - rbi = r5_next_bio(rbi, dev->sector); + rbi = r5_next_bio(conf, rbi, dev->sector); } } } @@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ count = 0; @@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } return tx; @@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, syndrome_disks+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } else { struct page *dest; int data_target; @@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); @@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } } else { init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) if (failb == syndrome_disks) { /* We're missing D+P. */ return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, + blocks, &submit); } else { /* We're missing D+D. */ return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, failb, + blocks, &submit); } } } @@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1752,7 +1766,7 @@ again: WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_opf & REQ_SYNC) @@ -1770,7 +1784,7 @@ again: clear_bit(R5_OVERWRITE, &dev->flags); } } - wbi = r5_next_bio(wbi, dev->sector); + wbi = r5_next_bio(conf, wbi, dev->sector); } if (head_sh->batch_head) { @@ -1910,9 +1924,11 @@ again: } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -1972,7 +1988,8 @@ again: } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, j)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + tx = async_xor_val(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); @@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + async_syndrome_val(srcs, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, percpu->spare_page, &submit); } @@ -2217,9 +2236,9 @@ static int grow_stripes(struct r5conf *conf, int num) /** * scribble_alloc - allocate percpu scribble buffer for required size * of the scribble region - * @percpu - from for_each_present_cpu() of the caller - * @num - total number of disks in the array - * @cnt - scribble objs count for required size of the scribble region + * @percpu: from for_each_present_cpu() of the caller + * @num: total number of disks in the array + * @cnt: scribble objs count for required size of the scribble region * * The scribble buffer size must be enough to contain: * 1/ a struct page pointer for each device in the array +2 @@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) percpu = per_cpu_ptr(conf->percpu, cpu); err = scribble_alloc(percpu, new_disks, - new_sectors / STRIPE_SECTORS); + new_sectors / RAID5_STRIPE_SECTORS(conf)); if (err) break; } @@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi) */ pr_info_ratelimited( "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, + mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), (unsigned long long)s, bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) @@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi) if (!(set_bad && test_bit(In_sync, &rdev->flags) && rdev_set_badblocks( - rdev, sh->sector, STRIPE_SECTORS, 0))) + rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) md_error(conf->mddev, rdev); } } @@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi) if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { @@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) @@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && + sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && bi && bi->bi_iter.bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) + if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) sh->overwrite_disks++; } @@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3376,7 +3395,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!rdev_set_badblocks( rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } @@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *nextbi = - r5_next_bio(bi, sh->dev[i].sector); + r5_next_bio(conf, bi, sh->dev[i].sector); bio_io_error(bi); bi = nextbi; @@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } rcu_read_unlock(); @@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, conf->recovery_disabled = conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3538,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; int i; + bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); if (test_bit(R5_LOCKED, &dev->flags) || @@ -3596,17 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * devices must be read. */ return 1; + + if (s->failed >= 2 && + (fdev[i]->towrite || + s->failed_num[i] == sh->pd_idx || + s->failed_num[i] == sh->qd_idx) && + !test_bit(R5_UPTODATE, &fdev[i]->flags)) + /* In max degraded raid6, If the failed disk is P, Q, + * or we want to read the failed disk, we need to do + * reconstruct-write. + */ + force_rcw = true; } - /* If we are forced to do a reconstruct-write, either because - * the current RAID6 implementation only supports that, or - * because parity cannot be trusted and we are currently - * recovering it, there is extra need to be careful. + /* If we are forced to do a reconstruct-write, because parity + * cannot be trusted and we are currently recovering it, there + * is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) * is missing/faulty, then we need to read everything we can. */ - if (sh->raid_conf->level != 6 && + if (!force_rcw && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -3710,7 +3740,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } -/** +/* * handle_stripe_fill - read or compute data to satisfy pending requests. */ static void handle_stripe_fill(struct stripe_head *sh, @@ -3785,14 +3815,14 @@ returnbi: wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; } md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); if (head_sh->batch_head) { @@ -3976,10 +4006,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } } @@ -4004,10 +4032,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_Wantread, &dev->flags); s->locked++; qread++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } if (rcw && conf->mddev->queue) @@ -4099,7 +4125,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4107,7 +4133,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { sh->check_state = check_state_compute_run; set_bit(STRIPE_COMPUTE_RUN, &sh->state); @@ -4264,7 +4290,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4272,7 +4298,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { int *target = &sh->ops.target; @@ -4343,7 +4369,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, + sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -4442,8 +4468,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) */ rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && + !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { @@ -4457,7 +4483,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) @@ -4484,7 +4510,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) /* in sync if before recovery_offset */ set_bit(R5_Insync, &dev->flags); else if (test_bit(R5_UPTODATE, &dev->flags) && @@ -4573,12 +4599,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_unlock(); } +/* + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or + * a head which can now be handled. + */ static int clear_batch_ready(struct stripe_head *sh) { - /* Return '1' if this is a member of batch, or - * '0' if it is a lone stripe or a head which can now be - * handled. - */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) return (sh->batch_head && sh->batch_head != sh); @@ -4682,6 +4708,16 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *pdev, *qdev; clear_bit(STRIPE_HANDLE, &sh->state); + + /* + * handle_stripe should not continue handle the batched stripe, only + * the head of batch list or lone stripe can continue. Otherwise we + * could see break_stripe_batch_list warns about the STRIPE_ACTIVE + * is set for the batched stripe. + */ + if (clear_batch_ready(sh)) + return; + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { /* already being handled, ensure it gets handled * again when current action finishes */ @@ -4689,11 +4725,6 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (clear_batch_ready(sh) ) { - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); - return; - } - if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) break_stripe_batch_list(sh, 0); @@ -4842,7 +4873,7 @@ static void handle_stripe(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) + || (s.to_write && s.failed) || (s.syncing && (s.uptodate + s.compute < disks)) || s.replacing || s.expanding) @@ -4927,7 +4958,7 @@ static void handle_stripe(struct stripe_head *sh) if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -4946,14 +4977,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { + } else /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } @@ -4995,7 +5023,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } if (s.expanding && s.locked == 0 && @@ -5025,14 +5053,14 @@ finish: /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -5041,7 +5069,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5483,7 +5511,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5498,7 +5526,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector *= conf->chunk_sectors; for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS) { + logical_sector += RAID5_STRIPE_SECTORS(conf)) { DEFINE_WAIT(w); int d; again: @@ -5543,7 +5571,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) d++) md_bitmap_startwrite(mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), 0); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); @@ -5608,12 +5636,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { + for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; int seq; @@ -5711,8 +5739,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - if (!sh->batch_head || sh == sh->batch_head) - set_bit(STRIPE_HANDLE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && @@ -5777,7 +5804,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_div(sector_nr, new_data_disks); if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; retn = sector_nr; goto finish; @@ -5891,11 +5918,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } INIT_LIST_HEAD(&stripes); - for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); @@ -5916,7 +5943,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk skipped_disk = 1; continue; } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } @@ -5951,7 +5978,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); - first_sector += STRIPE_SECTORS; + first_sector += RAID5_STRIPE_SECTORS(conf); } /* Now that the sources are clearly marked, we can release * the destination stripes @@ -5998,7 +6025,7 @@ finish: conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: return retn; @@ -6057,11 +6084,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - sync_blocks >= STRIPE_SECTORS) { + sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; + do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + /* keep things rounded to whole stripes */ + return sync_blocks * RAID5_STRIPE_SECTORS(conf); } md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); @@ -6094,7 +6122,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n raid5_release_stripe(sh); - return STRIPE_SECTORS; + return RAID5_STRIPE_SECTORS(conf); } static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, @@ -6117,14 +6145,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & - ~((sector_t)STRIPE_SECTORS-1); + ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, - sector += STRIPE_SECTORS, + logical_sector += RAID5_STRIPE_SECTORS(conf), + sector += RAID5_STRIPE_SECTORS(conf), scnt++) { if (scnt < offset) @@ -6457,6 +6485,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, raid5_show_rmw_level, raid5_store_rmw_level); +static ssize_t +raid5_show_stripe_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); + spin_unlock(&mddev->lock); + return ret; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static ssize_t +raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + /* + * The value should not be bigger than PAGE_SIZE. It requires to + * be multiple of DEFAULT_STRIPE_SIZE. + */ + if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + goto out_unlock; + } + + if (new == conf->stripe_size) + goto out_unlock; + + pr_debug("md/raid: change stripe_size from %lu to %lu\n", + conf->stripe_size, new); + + mddev_suspend(mddev); + conf->stripe_size = new; + conf->stripe_shift = ilog2(new) - 9; + conf->stripe_sectors = new >> 9; + mddev_resume(mddev); + +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0644, + raid5_show_stripe_size, + raid5_store_stripe_size); +#else +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0444, + raid5_show_stripe_size, + NULL); +#endif static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) @@ -6645,6 +6744,7 @@ static struct attribute *raid5_attrs[] = { &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &raid5_stripe_size.attr, &r5c_journal_mode.attr, &ppl_write_hint.attr, NULL, @@ -6744,7 +6844,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu conf->previous_raid_disks), max(conf->chunk_sectors, conf->prev_chunk_sectors) - / STRIPE_SECTORS)) { + / RAID5_STRIPE_SECTORS(conf))) { free_scratch_buffer(conf, percpu); return -ENOMEM; } @@ -6896,6 +6996,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + conf->stripe_size = DEFAULT_STRIPE_SIZE; + conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; + conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; +#endif INIT_LIST_HEAD(&conf->free_list); INIT_LIST_HEAD(&conf->pending_list); conf->pending_data = kcalloc(PENDING_IO_MAX, @@ -7047,8 +7153,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->min_nr_stripes = NR_STRIPES; if (mddev->reshape_position != MaxSector) { int stripes = max_t(int, - ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) pr_info("md/raid:%s: force stripe size %d for reshape\n", @@ -7779,14 +7885,14 @@ static int check_stripe_cache(struct mddev *mddev) * stripe_heads first. */ struct r5conf *conf = mddev->private; - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes) { pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + / RAID5_STRIPE_SIZE(conf))*4); return 0; } return 1; @@ -7922,8 +8028,8 @@ static int raid5_start_reshape(struct mddev *mddev) else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->previous_raid_disks && !test_bit(Faulty, &rdev->flags)) { @@ -8118,7 +8224,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev) while (chunksect && (mddev->array_sectors & (chunksect-1))) chunksect >>= 1; - if ((chunksect<<9) < STRIPE_SIZE) + if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) /* array size does not allow a suitable chunk size */ return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f90e0704bed9..7fb3b26a181a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -472,32 +472,20 @@ struct disk_info { */ #define NR_STRIPES 256 +#define DEFAULT_STRIPE_SIZE 4096 + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE #define STRIPE_SIZE PAGE_SIZE #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + #define IO_THRESHOLD 1 #define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) #define MAX_STRIPE_BATCH 8 -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This function is used to determine the 'next' bio in the list, given the - * sector of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - if (bio_end_sector(bio) < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. * This is because we sometimes take all the spinlocks * and creating that much locking depth can cause @@ -574,6 +562,11 @@ struct r5conf { int raid_disks; int max_nr_stripes; int min_nr_stripes; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + unsigned long stripe_size; + unsigned int stripe_shift; + unsigned long stripe_sectors; +#endif /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -690,6 +683,32 @@ struct r5conf { struct r5pending_data *next_pending_data; }; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#else +#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size) +#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift) +#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors) +#endif + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) +{ + if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf)) + return bio->bi_next; + else + return NULL; +} /* * Our supported algorithms diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6bdcdd984394..767d62985bba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) * Revalidating a dead namespace sets capacity to 0. This will end * buffered writers dirtying pages that can't be synced. */ - if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_DEAD: switch (old_state) { case NVME_CTRL_DELETING: @@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl) case NVME_CTRL_CONNECTING: return false; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: case NVME_CTRL_DEAD: return true; default: @@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static void nvme_put_ns(struct nvme_ns *ns) +void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } +EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { @@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) goto out_disable_stream; } - ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; @@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } +static void nvme_setup_passthrough(struct request *req, + struct nvme_command *cmd) +{ + memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + nvme_setup_passthrough(req, cmd); break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -884,6 +929,120 @@ out: return ERR_PTR(ret); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + + return effects; +} +EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = nvme_command_effects(ctrl, ns, opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + if (_nvme_revalidate_disk(ns->disk)) + nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } + up_read(&ctrl->namespaces_rwsem); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl, &effects); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); + mutex_unlock(&ctrl->scan_lock); + } + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { + nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } +} + +void nvme_execute_passthru_rq(struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + u32 effects; + + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + blk_execute_rq(rq->q, disk, rq, 0); + nvme_passthru_end(ctrl, effects); +} +EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, @@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, } } - blk_execute_rq(req->q, disk, req, 0); + nvme_execute_passthru_rq(req); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; else @@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) return 0; @@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) -{ - u32 effects = 0; - - if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); - if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); - return 0; - } - - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); - - /* - * For simplicity, IO to all namespaces is quiesced even if the command - * effects say only one namespace is affected. - */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - mutex_lock(&ctrl->scan_lock); - mutex_lock(&ctrl->subsys->lock); - nvme_mpath_start_freeze(ctrl->subsys); - nvme_mpath_wait_freeze(ctrl->subsys); - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - } - return effects; -} - -static void nvme_update_formats(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - up_read(&ctrl->namespaces_rwsem); -} - -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -{ - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. - */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - nvme_unfreeze(ctrl); - nvme_mpath_unfreeze(ctrl->subsys); - mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); - mutex_unlock(&ctrl->scan_lock); - } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { - nvme_queue_scan(ctrl); - flush_work(&ctrl->scan_work); - } -} - static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; u64 result; int status; @@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1484,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -1503,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. */ -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1523,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, return disk->private_data; } -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if (head) srcu_read_unlock(&head->srcu, idx); @@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk, static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + int ret; u32 iob; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + case NVME_CSI_ZNS: + ret = nvme_update_zone_info(disk, ns, lbaf); + if (ret) { + dev_warn(ctrl->device, + "failed to add zoned namespace:%u ret:%d\n", + ns->head->ns_id, ret); + return ret; + } + break; + default: + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -1977,7 +2088,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return 0; } -static int nvme_revalidate_disk(struct gendisk *disk) +static int _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2025,6 +2136,28 @@ out: return ret; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + int ret; + + ret = _nvme_revalidate_disk(disk); + if (ret) + return ret; + +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(disk->queue)) { + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + + ret = blk_revalidate_disk_zones(disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(disk->queue, + ctrl->max_zone_append); + } +#endif + return ret; +} + static char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2155,6 +2288,7 @@ static const struct block_device_operations nvme_fops = { .release = nvme_release, .getgeo = nvme_getgeo, .revalidate_disk= nvme_revalidate_disk, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2181,6 +2315,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; #endif /* CONFIG_NVME_MULTIPATH */ @@ -2238,12 +2373,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { - /* - * Default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned dev_page_min, page_shift = 12; + unsigned dev_page_min; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2253,17 +2383,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) } dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; - if (page_shift < dev_page_min) { + if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << page_shift); + 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); return -ENODEV; } - ctrl->page_size = 1 << page_shift; - - ctrl->ctrl_config = NVME_CC_CSS_NVM; - ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; @@ -2313,13 +2444,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, if (ctrl->max_hw_sectors) { u32 max_segments = - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } - blk_queue_virt_boundary(q, ctrl->page_size - 1); + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 7); if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; @@ -2810,7 +2941,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2824,27 +2955,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) +{ + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) { + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2865,7 +3024,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); @@ -2877,7 +3036,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -2939,7 +3098,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->rtd3e) { /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, shutdown_timeout, 60); @@ -3345,6 +3504,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", [NVME_CTRL_DEAD] = "dead", }; @@ -3397,6 +3557,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev, } static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + else if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + +static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err) + return err; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3414,6 +3634,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_sqsize.attr, &dev_attr_hostnqn.attr, &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, NULL }; @@ -3510,6 +3732,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3591,7 +3820,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->head->ns_id - nsb->head->ns_id; } -static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3609,6 +3838,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) up_read(&ctrl->namespaces_rwsem); return ret; } +EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { @@ -3726,7 +3956,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk && ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -3757,7 +3987,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (ns->disk && revalidate_disk(ns->disk)) + if (revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else @@ -3850,8 +4080,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -3912,6 +4142,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) if (ctrl->state == NVME_CTRL_DEAD) nvme_kill_queues(ctrl); + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + down_write(&ctrl->namespaces_rwsem); list_splice_init(&ctrl->namespaces, &ns_list); up_write(&ctrl->namespaces_rwsem); @@ -3995,8 +4228,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4106,8 +4339,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl) { - if (ctrl->kato) - nvme_start_keep_alive(ctrl); + nvme_start_keep_alive(ctrl); nvme_enable_aen(ctrl); @@ -4133,11 +4365,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4168,6 +4405,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; @@ -4346,6 +4584,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +{ + struct nvme_ctrl *ctrl; + struct file *f; + + f = filp_open(path, O_RDWR, 0); + if (IS_ERR(f)) + return ERR_CAST(f); + + if (f->f_op != &nvme_dev_fops) { + ctrl = ERR_PTR(-EINVAL); + goto out_close; + } + + ctrl = f->private_data; + nvme_get_ctrl(ctrl); + +out_close: + filp_close(f, NULL); + return ctrl; +} +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); + /* * Check we didn't inadvertently grow the command structure sizes: */ @@ -4364,6 +4625,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2a6c8190eeb7..4ec4829d6233 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq) { - if (ctrl->state != NVME_CTRL_DELETING && + if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a0ec40ab62ee..a9c1e3b4585e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_DELETING)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6aa30bb5a762..eae43bb444e0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) break; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: default: /* no action to take - let it delete */ break; @@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; + ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << + (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..412a6c97c0d8 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? ret : -EIO; } @@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { - dev_warn(dev, "Failed to read smart log (error %d)\n", err); + dev_warn(ctrl->device, + "Failed to read smart log (error %d)\n", err); devm_kfree(dev, data); return; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..3ded54d2c9c6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static bool nvme_path_is_disabled(struct nvme_ns *ns) { - return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags); + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. + */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags)) + return true; + return false; } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) @@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, fallback = ns; } + /* No optimized path found, re-check the current path */ + if (!nvme_path_is_disabled(old) && + old->ana_state == NVME_ANA_OPTIMIZED) { + found = old; + goto out; + } if (!fallback) return NULL; found = fallback; @@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) - ns = nvme_round_robin_path(head, node, ns); - if (unlikely(!ns || !nvme_path_is_optimized(ns))) - ns = __nvme_find_path(head, node); + if (unlikely(!ns)) + return __nvme_find_path(head, node); + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) + return nvme_round_robin_path(head, node, ns); + if (unlikely(!nvme_path_is_optimized(ns))) + return __nvme_find_path(head, node); return ns; } @@ -527,7 +545,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); @@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + if (ctrl->state != NVME_CTRL_LIVE) + return; + nvme_read_ana_log(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9c5b82af7978..ebb8c3ed3885 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -37,6 +37,14 @@ extern unsigned int admin_timeout; #define NVME_INLINE_METADATA_SG_CNT 1 #endif +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; @@ -180,12 +188,32 @@ static inline u16 nvme_req_qid(struct request *req) */ #define NVME_QUIRK_DELAY_AMOUNT 2300 +/* + * enum nvme_ctrl_state: Controller state + * + * @NVME_CTRL_NEW: New controller just allocated, initial state + * @NVME_CTRL_LIVE: Controller is connected and I/O capable + * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) + * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the + * transport + * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not + * disabled/failed immediately. This state comes + * after all async event processing took place and + * before ns removal and the controller deletion + * progress + * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during + * shutdown or removal. In this case we forcibly + * kill all inflight I/O as they have no chance to + * complete + */ enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, NVME_CTRL_DEAD, }; @@ -198,6 +226,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -235,10 +269,12 @@ struct nvme_ctrl { u32 queue_count; u64 cap; - u32 page_size; u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif u16 crdt[3]; u16 oncs; u16 oacs; @@ -264,6 +300,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -346,6 +383,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* @@ -365,6 +403,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -402,6 +441,9 @@ struct nvme_ns { u16 sgs; u32 sws; u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -567,8 +609,11 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx); +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; @@ -704,6 +749,36 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf); + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +#define nvme_report_zones NULL + +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct gendisk *disk, + struct nvme_ns *ns, + unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); @@ -735,4 +810,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl); static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } #endif +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); +void nvme_execute_passthru_rq(struct request *rq); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +void nvme_put_ns(struct nvme_ns *ns); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0c85680984c1..ba725ae47305 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include <linux/acpi.h> #include <linux/aer.h> #include <linux/async.h> #include <linux/blkdev.h> @@ -61,10 +62,10 @@ MODULE_PARM_DESC(sgl_threshold, static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, - .get = param_get_int, + .get = param_get_uint, }; -static int io_queue_depth = 1024; +static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); @@ -94,6 +95,10 @@ static unsigned int poll_queues; module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static bool noacpi; +module_param(noacpi, bool, 0444); +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); + struct nvme_dev; struct nvme_queue; @@ -115,7 +120,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - int q_depth; + u16 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -151,13 +156,14 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int n = 0, ret; + int ret; + u16 n; - ret = kstrtoint(val, 10, &n); + ret = kstrtou16(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_int(val, kp); + return param_set_ushort(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -345,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, - dev->ctrl.page_size); + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -356,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) * Calculates the number of pages needed for the SGL segments. For example a 4k * page can accommodate 256 SGL descriptors. */ -static int nvme_pci_npages_sgl(unsigned int num_seg) +static int nvme_pci_npages_sgl(void) { - return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), + PAGE_SIZE); } -static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg, bool use_sgl) +static size_t nvme_pci_iod_alloc_size(void) { - size_t alloc_size; + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - if (use_sgl) - alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); - else - alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); - - return alloc_size + sizeof(struct scatterlist) * nseg; + return sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -500,9 +502,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; - if (nseg == 0) - return false; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) @@ -517,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; @@ -584,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->ctrl.page_size; - int offset = dma_addr & (page_size - 1); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; - length -= (page_size - offset); + length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { iod->first_dma = 0; goto done; } - dma_len -= (page_size - offset); + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); if (dma_len) { - dma_addr += (page_size - offset); + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= page_size) { + if (length <= NVME_CTRL_PAGE_SIZE) { iod->first_dma = dma_addr; goto done; } - nprps = DIV_ROUND_UP(length, page_size); + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -630,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == page_size >> 3) { + if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) @@ -641,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= page_size; - dma_addr += page_size; - length -= page_size; + dma_len -= NVME_CTRL_PAGE_SIZE; + dma_addr += NVME_CTRL_PAGE_SIZE; + length -= NVME_CTRL_PAGE_SIZE; if (length <= 0) break; if (dma_len > 0) @@ -753,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, struct bio_vec *bv) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); - unsigned int first_prp_len = dev->ctrl.page_size - offset; + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->first_dma)) @@ -764,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); if (bv->bv_len > first_prp_len) cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - return 0; + return BLK_STS_OK; } static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, @@ -782,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return 0; + return BLK_STS_OK; } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, @@ -796,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2) + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); @@ -846,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); - return 0; + return BLK_STS_OK; } /* @@ -1019,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; @@ -1154,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t error) static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) { - /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset. */ @@ -1261,9 +1259,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue. + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, @@ -1398,11 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, { int q_depth = dev->q_depth; unsigned q_size_aligned = roundup(q_depth * entry_size, - dev->ctrl.page_size); + NVME_CTRL_PAGE_SIZE); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1817,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) { + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; u64 dma_addr = dev->host_mem_descs_dma; struct nvme_command c; int ret; @@ -1825,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) c.features.opcode = nvme_admin_set_features; c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); c.features.dword11 = cpu_to_le32(bits); - c.features.dword12 = cpu_to_le32(dev->host_mem_size >> - ilog2(dev->ctrl.page_size)); + c.features.dword12 = cpu_to_le32(host_mem_size); c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); @@ -1846,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) for (i = 0; i < dev->nr_host_mem_descs; i++) { struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; - size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], le64_to_cpu(desc->addr), @@ -1898,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, break; descs[i].addr = cpu_to_le64(dma_addr); - descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); i++; } @@ -1914,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, out_free_bufs: while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, bufs[i], le64_to_cpu(descs[i].addr), @@ -1932,12 +1931,12 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - u32 chunk_size; + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - chunk_size /= 2) { + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; @@ -2003,7 +2002,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; /* - * If there is no interupt available for queues, ensure that + * If there is no interrupt available for queues, ensure that * the default queue is set to 1. The affinity set size is * also set to one, but the irq core ignores it for this case. * @@ -2261,8 +2260,8 @@ static void nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; @@ -2321,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); @@ -2760,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +#ifdef CONFIG_ACPI +static bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + struct acpi_device *adev; + struct pci_dev *root; + acpi_handle handle; + acpi_status status; + u8 val; + + /* + * Look for _DSD property specifying that the storage device on the port + * must use D3 to support deep platform power savings during + * suspend-to-idle. + */ + root = pcie_find_root_port(dev); + if (!root) + return false; + + adev = ACPI_COMPANION(&root->dev); + if (!adev) + return false; + + /* + * The property is defined in the PXSX device for South complex ports + * and in the PEGP device for North complex ports. + */ + status = acpi_get_handle(adev->handle, "PXSX", &handle); + if (ACPI_FAILURE(status)) { + status = acpi_get_handle(adev->handle, "PEGP", &handle); + if (ACPI_FAILURE(status)) + return false; + } + + if (acpi_bus_get_device(handle, &adev)) + return false; + + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", + &val)) + return false; + return val == 1; +} +#else +static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + return false; +} +#endif /* CONFIG_ACPI */ + static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; @@ -2809,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= check_vendor_combination_bug(pdev); + if (!noacpi && nvme_acpi_storage_d3(pdev)) { + /* + * Some systems use a bios work around to ask for D3 on + * platforms that support kernel managed suspend. + */ + dev_info(&pdev->dev, + "platform quirk: setting simple suspend\n"); + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; + } + /* * Double check that our mempool alloc size will cover the biggest * command we support. */ - alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, - NVME_MAX_SEGS, true); + alloc_size = nvme_pci_iod_alloc_size(); WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, @@ -2876,6 +2932,7 @@ static void nvme_reset_done(struct pci_dev *pdev) static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_disable_prepare_reset(dev, true); } @@ -3006,6 +3063,7 @@ unfreeze: static int nvme_simple_suspend(struct device *dev) { struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + return nvme_disable_prepare_reset(ndev, true); } @@ -3079,16 +3137,16 @@ static const struct pci_error_handlers nvme_err_handler = { }; static const struct pci_device_id nvme_id_table[] = { - { PCI_VDEVICE(INTEL, 0x0953), + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a53), + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a54), + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a55), + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e881f879ac63..44c76ffbb264 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -96,6 +96,7 @@ struct nvme_rdma_queue { int cm_error; struct completion cm_done; bool pi_support; + int cq_size; }; struct nvme_rdma_ctrl { @@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) init_attr.recv_cq = queue->ib_cq; if (queue->pi_support) init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + init_attr.qp_context = queue; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); @@ -409,6 +411,14 @@ out_err: return NULL; } +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) +{ + if (nvme_rdma_poll_queue(queue)) + ib_free_cq(queue->ib_cq); + else + ib_cq_pool_put(queue->ib_cq, queue->cq_size); +} + static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { struct nvme_rdma_device *dev; @@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) * the destruction of the QP shouldn't use rdma_cm API. */ ib_destroy_qp(queue->qp); - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); @@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } +static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) +{ + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { + poll_ctx = IB_POLL_DIRECT; + queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, + comp_vector, poll_ctx); + } else { + poll_ctx = IB_POLL_SOFTIRQ; + queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, + comp_vector, poll_ctx); + } + + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + return ret; + } + + return 0; +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ - int comp_vector, idx = nvme_rdma_queue_idx(queue); - enum ib_poll_context poll_ctx; int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) } ibdev = queue->device->dev; - /* - * Spread I/O queues completion vectors according their queue index. - * Admin queues can always go on completion vector 0. - */ - comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; - - /* Polling queues need direct cq polling context */ - if (nvme_rdma_poll_queue(queue)) - poll_ctx = IB_POLL_DIRECT; - else - poll_ctx = IB_POLL_SOFTIRQ; - /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(ibdev, queue, - cq_factor * queue->queue_size + 1, - comp_vector, poll_ctx); - if (IS_ERR(queue->ib_cq)) { - ret = PTR_ERR(queue->ib_cq); + queue->cq_size = cq_factor * queue->queue_size + 1; + + ret = nvme_rdma_create_cq(ibdev, queue); + if (ret) goto out_put_dev; - } ret = nvme_rdma_create_qp(queue, send_wr_factor); if (ret) @@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_ring; } @@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize PI MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_mr_pool; } } @@ -540,7 +565,7 @@ out_destroy_ring: out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); out_put_dev: nvme_rdma_dev_put(queue->device); return ret; @@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(&ctrl->ctrl); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + return 0; out_cleanup_connect_q: @@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { @@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); return; } @@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, const char *op) { - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct nvme_rdma_ctrl *ctrl = queue->ctrl; if (ctrl->ctrl.state == NVME_CTRL_LIVE) @@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 472f9001521d..62fbaecdc960 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -46,6 +46,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -75,9 +76,10 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; + bool more_requests; /* recv state */ void *pdu; @@ -261,15 +263,13 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync) + bool sync, bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; - spin_lock(&queue->lock); - empty = list_empty(&queue->send_list) && !queue->request; - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; /* * if we're the first on the send_list and we can try to send @@ -278,25 +278,42 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, */ if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { + queue->more_requests = !last; nvme_tcp_try_send(queue); + queue->more_requests = false; mutex_unlock(&queue->send_mutex); - } else { + } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } } +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + static inline struct nvme_tcp_request * nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del(&req->entry); return req; } @@ -596,7 +613,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req, false); + nvme_tcp_queue_request(req, false, true); return 0; } @@ -863,6 +880,12 @@ done: read_unlock(&sk->sk_callback_lock); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list) || queue->more_requests; +} + static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) { queue->request = NULL; @@ -884,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) bool last = nvme_tcp_pdu_last_send(req, len); int ret, flags = MSG_DONTWAIT; - if (last && !queue->data_digest) + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) flags |= MSG_EOR; else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; @@ -931,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) int flags = MSG_DONTWAIT; int ret; - if (inline_data) + if (inline_data || nvme_tcp_queue_more(queue)) flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; else flags |= MSG_EOR; @@ -996,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = &req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; @@ -1344,8 +1372,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int ret, rcv_pdu_size; queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; @@ -1746,15 +1774,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(ctrl->tagset, - ctrl->queue_count - 1); } ret = nvme_tcp_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(ctrl); + nvme_wait_freeze(ctrl); + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + nvme_unfreeze(ctrl); + } + return 0; out_cleanup_connect_q: @@ -1859,6 +1892,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; + nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); if (ctrl->tagset) { @@ -1925,11 +1959,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1985,8 +2020,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2021,8 +2057,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work) nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2109,7 +2146,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true); + nvme_tcp_queue_request(&ctrl->async_req, true, true); } static enum blk_eh_timer_return @@ -2221,6 +2258,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2240,7 +2285,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req, true); + nvme_tcp_queue_request(req, true, bd->last); return BLK_STS_OK; } @@ -2308,6 +2353,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c new file mode 100644 index 000000000000..57cfd78731fb --- /dev/null +++ b/drivers/nvme/host/zns.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include "nvme.h" + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = disk->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + dev_warn(ns->ctrl->device, + "append not supported for zoned namespace:%d\n", + ns->head->ns_id); + return -EINVAL; + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. + */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + struct nvme_zone_report *report, + size_t buflen) +{ + struct nvme_command c = { }; + int ret; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) + return ret; + + return le64_to_cpu(report->nr_zones); +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + ret = __nvme_ns_report_zones(ns, sector, report, buflen); + if (ret < 0) + goto out_free; + + nz = min_t(unsigned int, ret, nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = -EINVAL; +out_free: + kvfree(report); + return ret; +} + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + if (ns->head->ids.csi == NVME_CSI_ZNS) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + else + ret = -EINVAL; + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4474952d64c6..8056955e652c 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -16,6 +16,18 @@ config NVME_TARGET To configure the NVMe target you probably want to use the nvmetcli tool from http://git.infradead.org/users/hch/nvmetcli.git. +config NVME_TARGET_PASSTHRU + bool "NVMe Target Passthrough support" + depends on NVME_TARGET + depends on NVME_CORE=y || NVME_CORE=NVME_TARGET + help + This enables target side NVMe passthru controller support for the + NVMe Over Fabrics protocol. It allows for hosts to manage and + directly access an actual NVMe controller residing on the target + side, incuding executing Vendor Unique Commands. + + If unsure, say N. + config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 2b33836f3d3e..ebf91fc4c72e 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1db8c0498668..e9fe91786bbb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; + unsigned long idx; ctrl = req->sq->ctrl; - - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); data_units_written += DIV_ROUND_UP( part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); - } - rcu_read_unlock(); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); - rcu_read_unlock(); } desc->grpid = cpu_to_le32(grpid); @@ -427,7 +423,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awupf = 0; id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); @@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); __le32 *list; u16 status = 0; @@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) goto out; } - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; list[i++] = cpu_to_le32(ns->nsid); if (i == buf_size / sizeof(__le32)) break; } - rcu_read_unlock(); status = nvmet_copy_to_sgl(req, 0, list, buf_size); @@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) return 0; } -static void nvmet_execute_set_features(struct nvmet_req *req) +void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); } -static void nvmet_execute_get_features(struct nvmet_req *req) +void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_admin_cmd(req); + switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 419e0d4ce79b..74b2b61c773b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = { .ct_owner = THIS_MODULE, }; +#ifdef CONFIG_NVME_TARGET_PASSTHRU + +static ssize_t nvmet_passthru_device_path_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path); +} + +static ssize_t nvmet_passthru_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + size_t len; + int ret; + + mutex_lock(&subsys->lock); + + ret = -EBUSY; + if (subsys->passthru_ctrl) + goto out_unlock; + + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + + kfree(subsys->passthru_ctrl_path); + ret = -ENOMEM; + subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + + return count; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} +CONFIGFS_ATTR(nvmet_passthru_, device_path); + +static ssize_t nvmet_passthru_enable_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0); +} + +static ssize_t nvmet_passthru_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_passthru_ctrl_enable(subsys); + else + nvmet_passthru_ctrl_disable(subsys); + + return ret ? ret : count; +} +CONFIGFS_ATTR(nvmet_passthru_, enable); + +static struct configfs_attribute *nvmet_passthru_attrs[] = { + &nvmet_passthru_attr_device_path, + &nvmet_passthru_attr_enable, + NULL, +}; + +static const struct config_item_type nvmet_passthru_type = { + .ct_attrs = nvmet_passthru_attrs, + .ct_owner = THIS_MODULE, +}; + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ + config_group_init_type_name(&subsys->passthru_group, + "passthru", &nvmet_passthru_type); + configfs_add_default_group(&subsys->passthru_group, + &subsys->group); +} + +#else /* CONFIG_NVME_TARGET_PASSTHRU */ + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ +} + +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + static int nvmet_port_subsys_allow_link(struct config_item *parent, struct config_item *target) { @@ -862,14 +959,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); if (NVME_TERTIARY(subsys->ver)) - return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver), - (int)NVME_TERTIARY(subsys->ver)); + return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + return snprintf(page, PAGE_SIZE, "%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, @@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, int major, minor, tertiary = 0; int ret; + /* passthru subsystems use the underlying controller's version */ + if (nvmet_passthru_ctrl(subsys)) + return -EINVAL; + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); if (ret != 2 && ret != 3) return -EINVAL; @@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group, configfs_add_default_group(&subsys->allowed_hosts_group, &subsys->group); + nvmet_add_passthru_group(subsys); + return &subsys->group; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6816507fba58..b92f45f5cd5b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) { - struct nvmet_ns *ns; + unsigned long nsid = 0; + struct nvmet_ns *cur; + unsigned long idx; - if (list_empty(&subsys->namespaces)) - return 0; + xa_for_each(&subsys->namespaces, idx, cur) + nsid = cur->nsid; - ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); - return ns->nsid; + return nsid; } static u32 nvmet_async_event_result(struct nvmet_async_event *aen) @@ -336,7 +337,7 @@ int nvmet_enable_port(struct nvmet_port *port) * If the user requested PI support and the transport isn't pi capable, * don't enable the port. */ - if (port->pi_enable && !ops->metadata_support) { + if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) { pr_err("T10-PI is not supported by transport type %d\n", port->disc_addr.trtype); ret = -EINVAL; @@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) cancel_delayed_work_sync(&ctrl->ka_work); } -static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, - __le32 nsid) -{ - struct nvmet_ns *ns; - - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { - if (ns->nsid == le32_to_cpu(nsid)) - return ns; - } - - return NULL; -} - struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) { struct nvmet_ns *ns; - rcu_read_lock(); - ns = __nvmet_find_namespace(ctrl, nsid); + ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid)); if (ns) percpu_ref_get(&ns->ref); - rcu_read_unlock(); return ns; } @@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns) mutex_lock(&subsys->lock); ret = 0; + + if (nvmet_passthru_ctrl(subsys)) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + if (ns->enabled) goto out_unlock; @@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ns->nsid > subsys->max_nsid) subsys->max_nsid = ns->nsid; - /* - * The namespaces list needs to be sorted to simplify the implementation - * of the Identify Namepace List subcommand. - */ - if (list_empty(&subsys->namespaces)) { - list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); - } else { - struct nvmet_ns *old; - - list_for_each_entry_rcu(old, &subsys->namespaces, dev_link, - lockdep_is_held(&subsys->lock)) { - BUG_ON(ns->nsid == old->nsid); - if (ns->nsid < old->nsid) - break; - } + ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); + if (ret) + goto out_restore_subsys_maxnsid; - list_add_tail_rcu(&ns->dev_link, &old->dev_link); - } subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); @@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) out_unlock: mutex_unlock(&subsys->lock); return ret; + +out_restore_subsys_maxnsid: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - list_del_rcu(&ns->dev_link); + xa_erase(&ns->subsys->namespaces, ns->nsid); if (ns->nsid == subsys->max_nsid) subsys->max_nsid = nvmet_max_nsid(subsys); @@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) if (!ns) return NULL; - INIT_LIST_HEAD(&ns->dev_link); init_completion(&ns->disable_done); ns->nsid = nsid; @@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_io_cmd(req); + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) { req->error_loc = offsetof(struct nvme_common_command, nsid); @@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, struct nvmet_req *req) { struct nvmet_ns *ns; + unsigned long idx; if (!req->p2p_client) return; ctrl->p2p_client = get_device(req->p2p_client); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link, - lockdep_is_held(&ctrl->subsys->lock)) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } @@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return ERR_PTR(-ENOMEM); - subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ + subsys->ver = NVMET_DEFAULT_VS; /* generate a random serial number as our controllers are ephemeral: */ get_random_bytes(&subsys->serial, sizeof(subsys->serial)); @@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, kref_init(&subsys->ref); mutex_init(&subsys->lock); - INIT_LIST_HEAD(&subsys->namespaces); + xa_init(&subsys->namespaces); INIT_LIST_HEAD(&subsys->ctrls); INIT_LIST_HEAD(&subsys->hosts); @@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref) struct nvmet_subsys *subsys = container_of(ref, struct nvmet_subsys, ref); - WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); + + xa_destroy(&subsys->namespaces); + nvmet_passthru_subsys_free(subsys); kfree(subsys->subsysnqn); kfree_rcu(subsys->model, rcuhead); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 40cf0b6e6c9d..f40c05c33c3a 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 27fd3b5aa621..55bafd56166a 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc { struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; struct work_struct del_work; - atomic_t del_work_active; }; @@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work) container_of(work, struct nvmet_fc_tgt_assoc, del_work); nvmet_fc_delete_target_assoc(assoc); - atomic_set(&assoc->del_work_active, 0); nvmet_fc_tgt_a_put(assoc); } @@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); - atomic_set(&assoc->del_work_active, 0); atomic_set(&assoc->terminating, 0); while (needrandom) { @@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { ret = assoc; - nvmet_fc_tgt_a_get(assoc); + if (!nvmet_fc_tgt_a_get(assoc)) + ret = NULL; break; } } @@ -1477,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) { struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; bool noassoc = true; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, @@ -1545,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, continue; assoc->hostport->invalid = 1; noassoc = false; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); @@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) struct nvmet_fc_tgt_queue *queue; unsigned long flags; bool found_ctrl = false; - int ret; /* this is a bit ugly, but don't want to make locks layered */ spin_lock_irqsave(&nvmet_fc_tgtlock, flags); @@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } return; } diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 2ff1d1334a03..c97e60b71bbc 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -43,6 +43,17 @@ static const match_table_t opt_tokens = { { NVMF_OPT_ERR, NULL } }; +static int fcloop_verify_addr(substring_t *s) +{ + size_t blen = s->to - s->from + 1; + + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 || + strncmp(s->from, "0x", 2)) + return -EINVAL; + + return 0; +} + static int fcloop_parse_options(struct fcloop_ctrl_options *opts, const char *buf) @@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->mask |= token; switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->wwnn = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->fcaddr = token; break; case NVMF_OPT_LPWWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->lpwwnn = token64; break; case NVMF_OPT_LPWWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, token = match_token(p, opt_tokens, args); switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } *nname = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 8a0d4fe7bc18..4884ef1e46a2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -36,7 +36,6 @@ struct nvme_loop_ctrl { struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; - struct nvmet_ctrl *target_ctrl; struct nvmet_port *port; }; @@ -445,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) { struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); - bool changed; int ret; nvme_stop_ctrl(&ctrl->ctrl); @@ -472,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) blk_mq_update_nr_hw_queues(&ctrl->tag_set, ctrl->ctrl.queue_count - 1); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); nvme_start_ctrl(&ctrl->ctrl); @@ -568,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_loop_ctrl *ctrl; - bool changed; int ret; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); @@ -584,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, if (ret) goto out_put_ctrl; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) + WARN_ON_ONCE(1); + ret = -ENOMEM; ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -618,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); mutex_lock(&nvme_loop_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 809691291e73..47ee3fb193bd 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -21,6 +21,8 @@ #include <linux/radix-tree.h> #include <linux/t10-pi.h> +#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0) + #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) @@ -52,7 +54,6 @@ (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) struct nvmet_ns { - struct list_head dev_link; struct percpu_ref ref; struct block_device *bdev; struct file *file; @@ -219,7 +220,7 @@ struct nvmet_subsys { struct mutex lock; struct kref ref; - struct list_head namespaces; + struct xarray namespaces; unsigned int nr_namespaces; unsigned int max_nsid; u16 cntlid_min; @@ -243,6 +244,12 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; struct nvmet_subsys_model __rcu *model; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + struct nvme_ctrl *passthru_ctrl; + char *passthru_ctrl_path; + struct config_group passthru_group; +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; static inline struct nvmet_subsys *to_subsys(struct config_item *item) @@ -286,8 +293,9 @@ struct nvmet_fabrics_ops { struct module *owner; unsigned int type; unsigned int msdbd; - bool has_keyed_sgls : 1; - bool metadata_support : 1; + unsigned int flags; +#define NVMF_KEYED_SGLS (1 << 0) +#define NVMF_METADATA_SUPPORTED (1 << 1) void (*queue_response)(struct nvmet_req *req); int (*add_port)(struct nvmet_port *port); void (*remove_port)(struct nvmet_port *port); @@ -321,6 +329,11 @@ struct nvmet_req { struct bio_vec *bvec; struct work_struct work; } f; + struct { + struct request *rq; + struct work_struct work; + bool use_workqueue; + } p; }; int sg_cnt; int metadata_sg_cnt; @@ -400,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgls(struct nvmet_req *req); void nvmet_req_free_sgls(struct nvmet_req *req); +void nvmet_execute_set_features(struct nvmet_req *req); +void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, @@ -532,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req) sizeof(struct nvme_dsm_range); } +#ifdef CONFIG_NVME_TARGET_PASSTHRU +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys); +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return subsys->passthru_ctrl; +} +#else /* CONFIG_NVME_TARGET_PASSTHRU */ +static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ +} +static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ +} +static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return NULL; +} +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +static inline struct nvme_ctrl * +nvmet_req_passthru_ctrl(struct nvmet_req *req) +{ + return nvmet_passthru_ctrl(req->sq->ctrl->subsys); +} + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); /* Convert a 32-bit number to a 16-bit 0's based number */ diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c new file mode 100644 index 000000000000..89d91dc999a6 --- /dev/null +++ b/drivers/nvme/target/passthru.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target Passthrough command implementation. + * + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + * Copyright (c) 2019-2020, Eideticom Inc. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> + +#include "../host/nvme.h" +#include "nvmet.h" + +MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); + +/* + * xarray to maintain one passthru subsystem per nvme controller. + */ +static DEFINE_XARRAY(passthru_subsystems); + +static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + int page_shift; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id)); + if (status) + goto out_free; + + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* + * The passthru NVMe driver may have a limit on the number of segments + * which depends on the host's memory fragementation. To solve this, + * ensure mdts is limited to the pages equal to the number of segments. + */ + max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), + pctrl->max_hw_sectors); + + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + id->mdts = ilog2(max_hw_sectors) + 9 - page_shift; + + id->acl = 3; + /* + * We export aerl limit for the fabrics controller, update this when + * passthru based aerl support is added. + */ + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* emulate kas as most of the PCIe ctrl don't have a support for kas */ + id->kas = cpu_to_le16(NVMET_KAS); + + /* don't support host memory buffer */ + id->hmpre = 0; + id->hmmin = 0; + + id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); + id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + /* don't support fuse commands */ + id->fuses = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (req->port->inline_data_size) + id->sgls |= cpu_to_le32(1 << 20); + + /* + * When passsthru controller is setup using nvme-loop transport it will + * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in + * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() + * code path with duplicate ctr subsynqn. In order to prevent that we + * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. + */ + memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); + + /* use fabric id-ctrl values */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + req->port->inline_data_size) / 16); + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* Support multipath connections with fabrics */ + id->cmic |= 1 << 1; + + /* Disable reservations, see nvmet_parse_passthru_io_cmd() */ + id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl)); + +out_free: + kfree(id); + return status; +} + +static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ns *id; + int i; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns)); + if (status) + goto out_free; + + for (i = 0; i < (id->nlbaf + 1); i++) + if (id->lbaf[i].ms) + memset(&id->lbaf[i], 0, sizeof(id->lbaf[i])); + + id->flbas = id->flbas & ~(1 << 4); + + /* + * Presently the NVMEof target code does not support sending + * metadata, so we must disable it here. This should be updated + * once target starts supporting metadata. + */ + id->mc = 0; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + +out_free: + kfree(id); + return status; +} + +static void nvmet_passthru_execute_cmd_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); + struct request *rq = req->p.rq; + u16 status; + + nvme_execute_passthru_rq(rq); + + status = nvme_req(rq)->status; + if (status == NVME_SC_SUCCESS && + req->cmd->common.opcode == nvme_admin_identify) { + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + nvmet_passthru_override_id_ctrl(req); + break; + case NVME_ID_CNS_NS: + nvmet_passthru_override_id_ns(req); + break; + } + } + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +static void nvmet_passthru_req_done(struct request *rq, + blk_status_t blk_status) +{ + struct nvmet_req *req = rq->end_io_data; + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, nvme_req(rq)->status); + blk_put_request(rq); +} + +static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + int op_flags = 0; + struct bio *bio; + int i, ret; + + if (req->cmd->common.opcode == nvme_cmd_flush) + op_flags = REQ_FUA; + else if (nvme_is_write(req->cmd)) + op_flags = REQ_SYNC | REQ_IDLE; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + bio->bi_opf = req_op(rq) | op_flags; + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, + sg->offset) < sg->length) { + bio_put(bio); + return -EINVAL; + } + sg_cnt--; + } + + ret = blk_rq_append_bio(rq, &bio); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + return 0; +} + +static void nvmet_passthru_execute_cmd(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct request_queue *q = ctrl->admin_q; + struct nvme_ns *ns = NULL; + struct request *rq = NULL; + u32 effects; + u16 status; + int ret; + + if (likely(req->sq->qid != 0)) { + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + + ns = nvme_find_get_ns(ctrl, nsid); + if (unlikely(!ns)) { + pr_err("failed to get passthru ns nsid:%u\n", nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto fail_out; + } + + q = ns->queue; + } + + rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(rq)) { + rq = NULL; + status = NVME_SC_INTERNAL; + goto fail_out; + } + + if (req->sg_cnt) { + ret = nvmet_passthru_map_sg(req, rq); + if (unlikely(ret)) { + status = NVME_SC_INTERNAL; + goto fail_out; + } + } + + /* + * If there are effects for the command we are about to execute, or + * an end_req function we need to use nvme_execute_passthru_rq() + * synchronously in a work item seeing the end_req function and + * nvme_passthru_end() can't be called in the request done callback + * which is typically in interrupt context. + */ + effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); + if (req->p.use_workqueue || effects) { + INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); + req->p.rq = rq; + schedule_work(&req->p.work); + } else { + rq->end_io_data = req; + blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, + nvmet_passthru_req_done); + } + + if (ns) + nvme_put_ns(ns); + + return; + +fail_out: + if (ns) + nvme_put_ns(ns); + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +/* + * We need to emulate set host behaviour to ensure that any requested + * behaviour of the target's host matches the requested behaviour + * of the device's host and fail otherwise. + */ +static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_feat_host_behavior *host; + u16 status = NVME_SC_INTERNAL; + int ret; + + host = kzalloc(sizeof(*host) * 2, GFP_KERNEL); + if (!host) + goto out_complete_req; + + ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + if (ret) + goto out_free_host; + + status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host)); + if (status) + goto out_free_host; + + if (memcmp(&host[0], &host[1], sizeof(host[0]))) { + pr_warn("target host has requested different behaviour from the local host\n"); + status = NVME_SC_INTERNAL; + } + +out_free_host: + kfree(host); +out_complete_req: + nvmet_req_complete(req, status); +} + +static u16 nvmet_setup_passthru_command(struct nvmet_req *req) +{ + req->p.use_workqueue = false; + req->execute = nvmet_passthru_execute_cmd; + return NVME_SC_SUCCESS; +} + +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + switch (req->cmd->common.opcode) { + case nvme_cmd_resv_register: + case nvme_cmd_resv_report: + case nvme_cmd_resv_acquire: + case nvme_cmd_resv_release: + /* + * Reservations cannot be supported properly because the + * underlying device has no way of differentiating different + * hosts that connect via fabrics. This could potentially be + * emulated in the future if regular targets grow support for + * this feature. + */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return nvmet_setup_passthru_command(req); +} + +/* + * Only features that are emulated or specifically allowed in the list are + * passed down to the controller. This function implements the allow list for + * both get and set features. + */ +static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_LBA_RANGE: + case NVME_FEAT_TEMP_THRESH: + case NVME_FEAT_ERR_RECOVERY: + case NVME_FEAT_VOLATILE_WC: + case NVME_FEAT_WRITE_ATOMIC: + case NVME_FEAT_AUTO_PST: + case NVME_FEAT_TIMESTAMP: + case NVME_FEAT_HCTM: + case NVME_FEAT_NOPSC: + case NVME_FEAT_RRL: + case NVME_FEAT_PLM_CONFIG: + case NVME_FEAT_PLM_WINDOW: + case NVME_FEAT_HOST_BEHAVIOR: + case NVME_FEAT_SANITIZE: + case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END: + return nvmet_setup_passthru_command(req); + + case NVME_FEAT_ASYNC_EVENT: + /* There is no support for forwarding ASYNC events */ + case NVME_FEAT_IRQ_COALESCE: + case NVME_FEAT_IRQ_CONFIG: + /* The IRQ settings will not apply to the target controller */ + case NVME_FEAT_HOST_MEM_BUF: + /* + * Any HMB that's set will not be passed through and will + * not work as expected + */ + case NVME_FEAT_SW_PROGRESS: + /* + * The Pre-Boot Software Load Count doesn't make much + * sense for a target to export + */ + case NVME_FEAT_RESV_MASK: + case NVME_FEAT_RESV_PERSIST: + /* No reservations, see nvmet_parse_passthru_io_cmd() */ + default: + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} + +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + /* + * Passthru all vendor specific commands + */ + if (req->cmd->common.opcode >= nvme_admin_vendor_start) + return nvmet_setup_passthru_command(req); + + switch (req->cmd->common.opcode) { + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return NVME_SC_SUCCESS; + case nvme_admin_keep_alive: + /* + * Most PCIe ctrls don't support keep alive cmd, we route keep + * alive to the non-passthru mode. In future please change this + * code when PCIe ctrls with keep alive support available. + */ + req->execute = nvmet_execute_keep_alive; + return NVME_SC_SUCCESS; + case nvme_admin_set_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_set_features; + return NVME_SC_SUCCESS; + case NVME_FEAT_HOST_BEHAVIOR: + req->execute = nvmet_passthru_set_host_behaviour; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_get_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_get_features; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_identify: + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + case NVME_ID_CNS_NS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + default: + return nvmet_setup_passthru_command(req); + } + case nvme_admin_get_log_page: + return nvmet_setup_passthru_command(req); + default: + /* Reject commands not in the allowlist above */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} + +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) +{ + struct nvme_ctrl *ctrl; + int ret = -EINVAL; + void *old; + + mutex_lock(&subsys->lock); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + if (subsys->passthru_ctrl) + goto out_unlock; + + if (subsys->nr_namespaces) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + + ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + pr_err("failed to open nvme controller %s\n", + subsys->passthru_ctrl_path); + + goto out_unlock; + } + + old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL, + subsys, GFP_KERNEL); + if (xa_is_err(old)) { + ret = xa_err(old); + goto out_put_ctrl; + } + + if (old) + goto out_put_ctrl; + + subsys->passthru_ctrl = ctrl; + subsys->ver = ctrl->vs; + + if (subsys->ver < NVME_VS(1, 2, 1)) { + pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n", + NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + subsys->ver = NVME_VS(1, 2, 1); + } + + mutex_unlock(&subsys->lock); + return 0; + +out_put_ctrl: + nvme_put_ctrl(ctrl); +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + if (subsys->passthru_ctrl) { + xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid); + nvme_put_ctrl(subsys->passthru_ctrl); + } + subsys->passthru_ctrl = NULL; + subsys->ver = NVMET_DEFAULT_VS; +} + +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); +} + +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); + kfree(subsys->passthru_ctrl_path); +} diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76ea23a2c2be..3ccb59260b4a 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; u16 status = 0; WARN_ON(rsp->n_rdma <= 0); @@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct nvmet_rdma_rsp *rsp; if (unlikely(wc->status != IB_WC_SUCCESS)) { @@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) */ nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; - queue->cq = ib_alloc_cq(ndev->device, queue, - nr_cqe + 1, queue->comp_vector, - IB_POLL_WORKQUEUE); + queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); if (IS_ERR(queue->cq)) { ret = PTR_ERR(queue->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -1322,7 +1321,7 @@ out: err_destroy_qp: rdma_destroy_qp(queue->cm_id); err_destroy_cq: - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, nr_cqe + 1); goto out; } @@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) if (queue->cm_id) rdma_destroy_id(queue->cm_id); ib_destroy_qp(queue->qp); - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); } static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) @@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, - .has_keyed_sgls = 1, - .metadata_support = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index de9217cfd22d..9eda91162fe4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) { struct llist_node *node; + struct nvmet_tcp_cmd *cmd; - node = llist_del_all(&queue->resp_list); - if (!node) - return; - - while (node) { - struct nvmet_tcp_cmd *cmd = llist_entry(node, - struct nvmet_tcp_cmd, lentry); - + for (node = llist_del_all(&queue->resp_list); node; node = node->next) { + cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); list_add(&cmd->entry, &queue->resp_send_list); - node = node->next; queue->send_list_len++; } } @@ -1717,7 +1711,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, .msdbd = 1, - .has_keyed_sgls = 0, .add_port = nvmet_tcp_add_port, .remove_port = nvmet_tcp_remove_port, .queue_response = nvmet_tcp_queue_response, diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index facb588d09e4..1b9e1442e6a5 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -319,7 +319,7 @@ dasd_diag_check_device(struct dasd_device *device) struct dasd_diag_characteristics *rdc_data; struct vtoc_cms_label *label; struct dasd_block *block; - struct dasd_diag_bio bio; + struct dasd_diag_bio *bio; unsigned int sb, bsize; blocknum_t end_block; int rc; @@ -395,29 +395,36 @@ dasd_diag_check_device(struct dasd_device *device) rc = -ENOMEM; goto out; } + bio = kzalloc(sizeof(*bio), GFP_KERNEL); + if (bio == NULL) { + DBF_DEV_EVENT(DBF_WARNING, device, "%s", + "No memory to allocate initialization bio"); + rc = -ENOMEM; + goto out_label; + } rc = 0; end_block = 0; /* try all sizes - needed for ECKD devices */ for (bsize = 512; bsize <= PAGE_SIZE; bsize <<= 1) { mdsk_init_io(device, bsize, 0, &end_block); - memset(&bio, 0, sizeof (struct dasd_diag_bio)); - bio.type = MDSK_READ_REQ; - bio.block_number = private->pt_block + 1; - bio.buffer = label; + memset(bio, 0, sizeof(*bio)); + bio->type = MDSK_READ_REQ; + bio->block_number = private->pt_block + 1; + bio->buffer = label; memset(&private->iob, 0, sizeof (struct dasd_diag_rw_io)); private->iob.dev_nr = rdc_data->dev_nr; private->iob.key = 0; private->iob.flags = 0; /* do synchronous io */ private->iob.block_count = 1; private->iob.interrupt_params = 0; - private->iob.bio_list = &bio; + private->iob.bio_list = bio; private->iob.flaga = DASD_DIAG_FLAGA_DEFAULT; rc = dia250(&private->iob, RW_BIO); if (rc == 3) { pr_warn("%s: A 64-bit DIAG call failed\n", dev_name(&device->cdev->dev)); rc = -EOPNOTSUPP; - goto out_label; + goto out_bio; } mdsk_term_io(device); if (rc == 0) @@ -427,7 +434,7 @@ dasd_diag_check_device(struct dasd_device *device) pr_warn("%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n", dev_name(&device->cdev->dev), rc); rc = -EIO; - goto out_label; + goto out_bio; } /* check for label block */ if (memcmp(label->label_id, DASD_DIAG_CMS1, @@ -457,6 +464,8 @@ dasd_diag_check_device(struct dasd_device *device) (rc == 4) ? ", read-only device" : ""); rc = 0; } +out_bio: + kfree(bio); out_label: free_page((long) label); out: @@ -506,7 +515,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, struct req_iterator iter; struct bio_vec bv; char *dst; - unsigned int count, datasize; + unsigned int count; sector_t recid, first_rec, last_rec; unsigned int blksize, off; unsigned char rw_cmd; @@ -534,10 +543,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, if (count != last_rec - first_rec + 1) return ERR_PTR(-EINVAL); /* Build the request */ - datasize = sizeof(struct dasd_diag_req) + - count*sizeof(struct dasd_diag_bio); - cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev, - blk_mq_rq_to_pdu(req)); + cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, struct_size(dreq, bio, count), + memdev, blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 6f7eba66687e..d8b2c49d645b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, zone.non_seq = 1; zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.capacity = zone.len; zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); if (zone.type != ZBC_ZONE_TYPE_CONV && @@ -716,6 +717,11 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (sdkp->zones_max_open == U32_MAX) + blk_queue_max_open_zones(q, 0); + else + blk_queue_max_open_zones(q, sdkp->zones_max_open); + blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 06ecb2c1492f..9273d1126ed2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -513,6 +513,8 @@ struct request_queue { unsigned int nr_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int max_open_zones; + unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -722,6 +724,28 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return true; return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); } + +static inline void blk_queue_max_open_zones(struct request_queue *q, + unsigned int max_open_zones) +{ + q->max_open_zones = max_open_zones; +} + +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return q->max_open_zones; +} + +static inline void blk_queue_max_active_zones(struct request_queue *q, + unsigned int max_active_zones) +{ + q->max_active_zones = max_active_zones; +} + +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return q->max_active_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ -737,6 +761,14 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q, { return 0; } +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return 0; +} +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1520,6 +1552,24 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_open_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_open_zones(q); + return 0; +} + +static inline unsigned int bdev_max_active_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_active_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 41e7795a3ee4..2a38f2b477a5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -672,7 +672,7 @@ enum { * Values set by the LLDD indicating completion status of the FCP operation. * Must be set prior to calling the done() callback. * @transferred_length: amount of DATA_OUT payload data received by a - * a WRITEDATA operation. If not a WRITEDATA operation, value must + * WRITEDATA operation. If not a WRITEDATA operation, value must * be set to 0. Should equal transfer_length on success. * @fcp_error: status of the FCP operation. Must be 0 on success; on failure * must be a NVME_SC_FC_xxxx value. diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5ce51ab4c50e..d92535997687 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,6 +132,7 @@ enum { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -162,7 +163,6 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, @@ -170,6 +170,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, @@ -179,6 +182,8 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -307,6 +312,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_RESERVATIONS = 1 << 5, NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, @@ -369,11 +375,37 @@ struct nvme_id_ns { __u8 vs[3712]; }; +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_id_ns_zns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __u8 rsvd20[2796]; + struct nvme_zns_lbafe lbafe[16]; + __u8 rsvd3072[768]; + __u8 vs[256]; +}; + +struct nvme_id_ctrl_zns { + __u8 zasl; + __u8 rsvd1[4095]; +}; + enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -384,6 +416,11 @@ enum { }; enum { + NVME_CSI_NVM = 0, + NVME_CSI_ZNS = 2, +}; + +enum { NVME_DIR_IDENTIFY = 0x00, NVME_DIR_STREAMS = 0x01, NVME_DIR_SND_ID_OP_ENABLE = 0x01, @@ -435,11 +472,13 @@ struct nvme_ns_id_desc { #define NVME_NIDT_EUI64_LEN 8 #define NVME_NIDT_NGUID_LEN 16 #define NVME_NIDT_UUID_LEN 16 +#define NVME_NIDT_CSI_LEN 1 enum { NVME_NIDT_EUI64 = 0x01, NVME_NIDT_NGUID = 0x02, NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; struct nvme_smart_log { @@ -519,6 +558,27 @@ struct nvme_ana_rsp_hdr { __le16 rsvd10[3]; }; +struct nvme_zone_descriptor { + __u8 zt; + __u8 zs; + __u8 za; + __u8 rsvd3[5]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +enum { + NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2, +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 resv8[56]; + struct nvme_zone_descriptor entries[]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -613,6 +673,9 @@ enum nvme_opcode { nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, nvme_cmd_resv_release = 0x15, + nvme_cmd_zone_mgmt_send = 0x79, + nvme_cmd_zone_mgmt_recv = 0x7a, + nvme_cmd_zone_append = 0x7d, }; #define nvme_opcode_name(opcode) { opcode, #opcode } @@ -751,6 +814,7 @@ struct nvme_rw_command { enum { NVME_RW_LR = 1 << 15, NVME_RW_FUA = 1 << 14, + NVME_RW_APPEND_PIREMAP = 1 << 9, NVME_RW_DSM_FREQ_UNSPEC = 0, NVME_RW_DSM_FREQ_TYPICAL = 1, NVME_RW_DSM_FREQ_RARE = 2, @@ -816,6 +880,53 @@ struct nvme_write_zeroes_cmd { __le16 appmask; }; +enum nvme_zone_mgmt_action { + NVME_ZONE_CLOSE = 0x1, + NVME_ZONE_FINISH = 0x2, + NVME_ZONE_OPEN = 0x3, + NVME_ZONE_RESET = 0x4, + NVME_ZONE_OFFLINE = 0x5, + NVME_ZONE_SET_DESC_EXT = 0x10, +}; + +struct nvme_zone_mgmt_send_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le32 cdw12; + __u8 zsa; + __u8 select_all; + __u8 rsvd13[2]; + __le32 cdw14[2]; +}; + +struct nvme_zone_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __le64 slba; + __le32 numd; + __u8 zra; + __u8 zrasf; + __u8 pr; + __u8 rsvd13; + __le32 cdw14[2]; +}; + +enum { + NVME_ZRA_ZONE_REPORT = 0, + NVME_ZRASF_ZONE_REPORT_ALL = 0, + NVME_REPORT_ZONE_PARTIAL = 1, +}; + /* Features */ enum { @@ -872,6 +983,7 @@ enum nvme_admin_opcode { nvme_admin_security_recv = 0x82, nvme_admin_sanitize_nvm = 0x84, nvme_admin_get_lba_status = 0x86, + nvme_admin_vendor_start = 0xC0, }; #define nvme_admin_opcode_name(opcode) { opcode, #opcode } @@ -935,6 +1047,8 @@ enum { NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_FEAT_VENDOR_START = 0xC0, + NVME_FEAT_VENDOR_END = 0xFF, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, @@ -972,7 +1086,9 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u32 rsvd11[5]; + __u8 rsvd11[3]; + __u8 csi; + __u32 rsvd12[4]; }; #define NVME_IDENTIFY_DATA_SIZE 4096 @@ -1086,7 +1202,9 @@ struct nvme_get_log_page_command { }; __le64 lpo; }; - __u32 rsvd14[2]; + __u8 rsvd14[3]; + __u8 csi; + __u32 rsvd15; }; struct nvme_directive_cmd { @@ -1283,6 +1401,8 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_zone_mgmt_send_cmd zms; + struct nvme_zone_mgmt_recv_cmd zmr; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -1417,6 +1537,18 @@ enum { NVME_SC_AUTH_REQUIRED = 0x191, /* + * I/O Command Set Specific - Zoned commands: + */ + NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8, + NVME_SC_ZONE_FULL = 0x1b9, + NVME_SC_ZONE_READ_ONLY = 0x1ba, + NVME_SC_ZONE_OFFLINE = 0x1bb, + NVME_SC_ZONE_INVALID_WRITE = 0x1bc, + NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd, + NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be, + NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf, + + /* * Media and Data Integrity Errors: */ NVME_SC_WRITE_FAULT = 0x280, diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 9a1965c6c3d0..52e8bcb33981 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) * Version 3: Cache device with new UUID format * Version 4: Backing device with data offset */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 #define SB_SECTOR 8 #define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) @@ -173,7 +175,12 @@ struct cache_sb_disk { __le64 flags; __le64 seq; - __le64 pad[8]; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; union { struct { @@ -206,10 +213,16 @@ struct cache_sb_disk { __le16 keys; }; __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 bucket_size_hi; }; +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ struct cache_sb { - __u64 csum; __u64 offset; /* sector where this sb was written */ __u64 version; @@ -224,7 +237,10 @@ struct cache_sb { __u64 flags; __u64 seq; - __u64 pad[8]; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; union { struct { @@ -232,10 +248,9 @@ struct cache_sb { __u64 nbuckets; /* device size */ __u16 block_size; /* sectors */ - __u16 bucket_size; /* sectors */ - __u16 nr_in_set; __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ }; struct { /* Backing devices */ @@ -262,7 +277,8 @@ struct cache_sb { static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) { return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; } BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -74,6 +74,15 @@ enum blk_zone_cond { }; /** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + +/** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * * @start: Zone start in 512 B sector units @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 1f2d8c81f0e0..e5a98a16f9b0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s { /* * Notes: - * - if an array is being reshaped (restriped) in order to change the + * - if an array is being reshaped (restriped) in order to change * the number of active devices in the array, 'raid_disks' will be * the larger of the old and new numbers. 'delta_disks' will * be the "new - old". So if +ve, raid_disks is the new value, and |