diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 8 | ||||
-rw-r--r-- | block/Makefile | 3 | ||||
-rw-r--r-- | block/bdev.c | 6 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 16 | ||||
-rw-r--r-- | block/bfq-iosched.c | 41 | ||||
-rw-r--r-- | block/bfq-iosched.h | 1 | ||||
-rw-r--r-- | block/bfq-wf2q.c | 15 | ||||
-rw-r--r-- | block/bio-integrity.c | 2 | ||||
-rw-r--r-- | block/bio.c | 9 | ||||
-rw-r--r-- | block/blk-cgroup.c | 5 | ||||
-rw-r--r-- | block/blk-cgroup.h | 17 | ||||
-rw-r--r-- | block/blk-core.c | 46 | ||||
-rw-r--r-- | block/blk-crypto-internal.h | 12 | ||||
-rw-r--r-- | block/blk-crypto-sysfs.c | 172 | ||||
-rw-r--r-- | block/blk-crypto.c | 3 | ||||
-rw-r--r-- | block/blk-iolatency.c | 2 | ||||
-rw-r--r-- | block/blk-map.c | 2 | ||||
-rw-r--r-- | block/blk-merge.c | 31 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 6 | ||||
-rw-r--r-- | block/blk-mq-debugfs.h | 2 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 18 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 16 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 4 | ||||
-rw-r--r-- | block/blk-mq.c | 278 | ||||
-rw-r--r-- | block/blk-mq.h | 2 | ||||
-rw-r--r-- | block/blk-rq-qos.h | 20 | ||||
-rw-r--r-- | block/blk-sysfs.c | 42 | ||||
-rw-r--r-- | block/blk-throttle.c | 48 | ||||
-rw-r--r-- | block/blk-throttle.h | 3 | ||||
-rw-r--r-- | block/blk.h | 2 | ||||
-rw-r--r-- | block/elevator.c | 16 | ||||
-rw-r--r-- | block/fops.c | 38 | ||||
-rw-r--r-- | block/genhd.c | 69 |
33 files changed, 659 insertions, 296 deletions
diff --git a/block/Kconfig b/block/Kconfig index 168b873eb666..7eb5d6d53b3f 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -28,15 +28,13 @@ if BLOCK config BLOCK_LEGACY_AUTOLOAD bool "Legacy autoloading support" + default y help Enable loading modules and creating block device instances based on accesses through their device special file. This is a historic Linux feature and makes no sense in a udev world where device files are - created on demand. - - Say N here unless booting or other functionality broke without it, in - which case you should also send a report to your distribution and - linux-block@vger.kernel.org. + created on demand, but scripts that manually create device nodes and + then call losetup might rely on this behavior. config BLK_RQ_ALLOC_TIME bool diff --git a/block/Makefile b/block/Makefile index f38eaa612929..3950ecbc5c26 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c index c68772644566..13de871fa816 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; - return 0;; + return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) @@ -738,7 +738,7 @@ struct block_device *blkdev_get_no_open(dev_t dev) inode = ilookup(blockdev_superblock, dev); if (inode) pr_warn_ratelimited( -"block device autoloading is deprecated. It will be removed in Linux 5.19\n"); +"block device autoloading is deprecated and will be removed.\n"); } if (!inode) return NULL; diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 24a5c5329bcd..420eda2589c0 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; + struct bfq_group *old_parent = bfqq_group(bfqq); /* + * No point to move bfqq to the same group, which can happen when + * root group is offlined + */ + if (old_parent == bfqg) + return; + + /* + * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group + * until elevator exit. + */ + if (bfqq == &bfqd->oom_bfqq) + return; + /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ @@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_and_blkg_put(bfqq_group(bfqq)); + bfqg_and_blkg_put(old_parent); if (entity->parent && entity->parent->last_bfqq_created == bfqq) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0c612a911696..2e0dd68a3cbe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { @@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->waker_detection_started = now_ns; bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, MAX_BFQQ_NAME_LENGTH); - bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); + bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; @@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, struct bfq_queue *bfqq, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Check delayed stable merge for rotational or non-queueing * devs. For this branch to be executed, bfqq must not be @@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; @@ -5181,7 +5191,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled; + bool waiting_rq, idle_timer_disabled = false; spin_lock_irq(&bfqd->lock); @@ -5189,14 +5199,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } spin_unlock_irq(&bfqd->lock); - - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, - idle_timer_disabled); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, + idle_timer_disabled); return rq; } @@ -5448,7 +5459,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); } @@ -7018,6 +7029,8 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + wbt_enable_default(bfqd->queue); + kfree(bfqd); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 72255ec44f8f..3b83e3d1c2e5 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1050,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq; for (parent = NULL; entity ; entity = parent) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 709b901de3ca..f8eb340381cf 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, #ifdef CONFIG_BFQ_GROUP_IOSCHED -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - /* * Returns true if this budget changes may let next_in_service->parent * become the next_in_service entity for its parent entity. @@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) #else /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { return false; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bd5453220065..6996e7bd66e9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -373,7 +373,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } diff --git a/block/bio.c b/block/bio.c index b15f5466ce08..33979f306e9e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -807,12 +807,6 @@ int bio_init_clone(struct block_device *bdev, struct bio *bio, } EXPORT_SYMBOL(bio_init_clone); -const char *bio_devname(struct bio *bio, char *buf) -{ - return bdevname(bio->bi_bdev, buf); -} -EXPORT_SYMBOL(bio_devname); - /** * bio_full - check if the bio is full * @bio: bio to check @@ -1522,8 +1516,7 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa063c6c0338..d53b0d69dd73 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -82,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->q) + blk_put_queue(blkg->q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); @@ -167,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; + if (!blk_get_queue(q)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 3e91803c4a55..47e1e38390c9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,7 @@ */ #include <linux/blk-cgroup.h> +#include <linux/blk-mq.h> /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -428,6 +429,21 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } +/** + * blk_cgroup_mergeable - Determine whether to allow or disallow merges + * @rq: request to merge into + * @bio: bio to merge + * + * @bio and @rq should belong to the same cgroup and their issue_as_root should + * match. The latter is necessary as we don't want to throttle e.g. a metadata + * update because it happens to be next to a regular IO. + */ +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) +{ + return rq->bio->bi_blkg == bio->bi_blkg && + bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ @@ -467,6 +483,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { } static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) diff --git a/block/blk-core.c b/block/blk-core.c index ce08f0aa9dfc..d4ae6ac53ffc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -50,6 +50,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -285,13 +286,6 @@ void blk_queue_start_drain(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } -void blk_set_queue_dying(struct request_queue *q) -{ - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - blk_queue_start_drain(q); -} -EXPORT_SYMBOL_GPL(blk_set_queue_dying); - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -309,7 +303,8 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_set_queue_dying(q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); @@ -321,6 +316,9 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); + /* cleanup rq qos structures for queue without disk */ + rq_qos_exit(q); + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); blk_sync_queue(q); @@ -342,8 +340,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); - percpu_ref_exit(&q->q_usage_counter); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } @@ -496,17 +492,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_stats; - if (blkcg_init_queue(q)) - goto fail_ref; - blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; -fail_ref: - percpu_ref_exit(&q->q_usage_counter); fail_stats: blk_free_queue_stats(q->stats); fail_split: @@ -540,17 +531,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static void handle_bad_sector(struct bio *bio, sector_t maxsector) -{ - char b[BDEVNAME_SIZE]; - - pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%s: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio_devname(bio, b), bio->bi_opf, - bio_end_sector(bio), maxsector); -} - #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); @@ -580,14 +560,10 @@ late_initcall(fail_make_request_debugfs); static inline bool bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { - char b[BDEVNAME_SIZE]; - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; - - WARN_ONCE(1, - "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), bio->bi_bdev->bd_partno); + pr_warn("Trying to write to read-only block-device %pg\n", + bio->bi_bdev); /* Older lvm-tools actually trigger this */ return false; } @@ -616,7 +592,11 @@ static inline int bio_check_eod(struct bio *bio) if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { - handle_bad_sector(bio, maxsector); + pr_info_ratelimited("%s: attempt to access beyond end of device\n" + "%pg: rw=%d, want=%llu, limit=%llu\n", + current->comm, + bio->bi_bdev, bio->bi_opf, + bio_end_sector(bio), maxsector); return -EIO; } return 0; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 2fb0d65a464c..e6818ffaddbf 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,6 +11,7 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ @@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION +int blk_crypto_sysfs_register(struct request_queue *q); + +void blk_crypto_sysfs_unregister(struct request_queue *q); + void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) #else /* CONFIG_BLK_INLINE_ENCRYPTION */ +static inline int blk_crypto_sysfs_register(struct request_queue *q) +{ + return 0; +} + +static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } + static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c new file mode 100644 index 000000000000..fd93bd2f33b7 --- /dev/null +++ b/block/blk-crypto-sysfs.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + * + * sysfs support for blk-crypto. This file contains the code which exports the + * crypto capabilities of devices via /sys/block/$disk/queue/crypto/. + */ + +#include <linux/blk-crypto-profile.h> + +#include "blk-crypto-internal.h" + +struct blk_crypto_kobj { + struct kobject kobj; + struct blk_crypto_profile *profile; +}; + +struct blk_crypto_attr { + struct attribute attr; + ssize_t (*show)(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page); +}; + +static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) +{ + return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; +} + +static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +{ + return container_of(attr, struct blk_crypto_attr, attr); +} + +static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); +} + +static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", profile->num_slots); +} + +#define BLK_CRYPTO_RO_ATTR(_name) \ + static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + +BLK_CRYPTO_RO_ATTR(max_dun_bits); +BLK_CRYPTO_RO_ATTR(num_keyslots); + +static struct attribute *blk_crypto_attrs[] = { + &max_dun_bits_attr.attr, + &num_keyslots_attr.attr, + NULL, +}; + +static const struct attribute_group blk_crypto_attr_group = { + .attrs = blk_crypto_attrs, +}; + +/* + * The encryption mode attributes. To avoid hard-coding the list of encryption + * modes, these are initialized at boot time by blk_crypto_sysfs_init(). + */ +static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; +static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; + +static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + int mode_num = a - __blk_crypto_mode_attrs; + + if (profile->modes_supported[mode_num]) + return 0444; + return 0; +} + +static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + int mode_num = attr - __blk_crypto_mode_attrs; + + return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); +} + +static const struct attribute_group blk_crypto_modes_attr_group = { + .name = "modes", + .attrs = blk_crypto_mode_attrs, + .is_visible = blk_crypto_mode_is_visible, +}; + +static const struct attribute_group *blk_crypto_attr_groups[] = { + &blk_crypto_attr_group, + &blk_crypto_modes_attr_group, + NULL, +}; + +static ssize_t blk_crypto_attr_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + return a->show(profile, a, page); +} + +static const struct sysfs_ops blk_crypto_attr_ops = { + .show = blk_crypto_attr_show, +}; + +static void blk_crypto_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); +} + +static struct kobj_type blk_crypto_ktype = { + .default_groups = blk_crypto_attr_groups, + .sysfs_ops = &blk_crypto_attr_ops, + .release = blk_crypto_release, +}; + +/* + * If the request_queue has a blk_crypto_profile, create the "crypto" + * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). + */ +int blk_crypto_sysfs_register(struct request_queue *q) +{ + struct blk_crypto_kobj *obj; + int err; + + if (!q->crypto_profile) + return 0; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->profile = q->crypto_profile; + + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, + "crypto"); + if (err) { + kobject_put(&obj->kobj); + return err; + } + q->crypto_kobject = &obj->kobj; + return 0; +} + +void blk_crypto_sysfs_unregister(struct request_queue *q) +{ + kobject_put(q->crypto_kobject); +} + +static int __init blk_crypto_sysfs_init(void) +{ + int i; + + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) { + struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i]; + + attr->attr.name = blk_crypto_modes[i].name; + attr->attr.mode = 0444; + attr->show = blk_crypto_mode_show; + blk_crypto_mode_attrs[i - 1] = &attr->attr; + } + return 0; +} +subsys_initcall(blk_crypto_sysfs_init); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 773dae4c329b..a496aaef85ba 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -19,16 +19,19 @@ const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 010e658d44a8..2f33932e72e3 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); diff --git a/block/blk-map.c b/block/blk-map.c index 4526adde0156..c7f71d83eff1 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -446,7 +446,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(GFP_NOIO | gfp_mask); + page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; diff --git a/block/blk-merge.c b/block/blk-merge.c index f5255991b773..ea6968313b4a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,6 +9,7 @@ #include <linux/blk-integrity.h> #include <linux/scatterlist.h> #include <linux/part_stat.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> @@ -598,6 +599,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { + if (!blk_cgroup_mergeable(req, bio)) + goto no_merge; + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; @@ -694,6 +698,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; + if (!blk_cgroup_mergeable(req, next->bio)) + return 0; + if (blk_integrity_merge_rq(q, req, next) == false) return 0; @@ -902,6 +909,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) + return false; + /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; @@ -1087,12 +1098,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (!plug || rq_list_empty(plug->mq_list)) return false; - /* check the previously added entry for a quick merge attempt */ - rq = rq_list_peek(&plug->mq_list); - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q == q) { + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; + } + + /* + * Only keep iterating plug list for merges if we have multiple + * queues + */ + if (!plug->multiple_queues) + break; } return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3a790eb4995c..e2880f6deb34 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -707,7 +707,7 @@ static void debugfs_create_files(struct dentry *parent, void *data, void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); @@ -780,7 +780,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); @@ -789,7 +789,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a68aa6041a10..69918f4170d6 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -6,6 +6,8 @@ #include <linux/seq_file.h> +struct blk_mq_hw_ctx; + struct blk_mq_debugfs_attr { const char *name; umode_t mode; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55488ba97823..9e56a69422b6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { + unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); - } while (ret == 1); + if (ret != 1) + break; + if (need_resched() || time_is_before_jiffies(end)) { + blk_mq_delay_run_hw_queue(hctx, 0); + break; + } + } while (1); return ret; } @@ -515,7 +522,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { @@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { - unsigned int i, flags = q->tag_set->flags; + unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; + unsigned long i; int ret; if (!e) { @@ -618,7 +626,7 @@ err_free_map_and_rqs: void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, @@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q) void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 674786574075..c08426975856 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; lockdep_assert_held(&q->sysfs_dir_lock); @@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q) int __blk_mq_register_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int ret, i; + unsigned long i, j; + int ret; WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); @@ -278,8 +279,10 @@ out: return ret; unreg: - while (--i >= 0) - blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + queue_for_each_hw_ctx(q, hctx, j) { + if (j < i) + blk_mq_unregister_hctx(hctx); + } kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -290,7 +293,7 @@ unreg: void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) @@ -306,7 +309,8 @@ unlock: int blk_mq_sysfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0fd409b8e86e..68ac23d0b640 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ @@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index a05ce7725031..8e659dc5fcf3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; + return xa_load(&q->hctx_table, + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); } static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, @@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. */ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -736,6 +737,10 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); + + if (req_op(req) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = req->__sector; + if (!is_flush) bio_endio(bio); bio = next; @@ -883,10 +888,15 @@ static inline void blk_account_io_done(struct request *req, u64 now) static void __blk_account_io_start(struct request *rq) { - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (rq->bio) rq->part = rq->bio->bi_bdev; - else if (rq->q->disk) + else rq->part = rq->q->disk->part0; part_stat_lock(); @@ -1442,7 +1452,7 @@ static void blk_mq_timeout_work(struct work_struct *work) container_of(work, struct request_queue, timeout_work); unsigned long next = 0; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -2143,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2171,7 +2181,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2209,7 +2219,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); bool blk_mq_queue_stopped(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hctx_stopped(hctx)) @@ -2248,7 +2258,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -2266,7 +2276,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -2286,7 +2296,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async); @@ -2567,13 +2577,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q, q->mq_ops->queue_rqs(&plug->mq_list); } +static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +{ + struct blk_mq_hw_ctx *this_hctx = NULL; + struct blk_mq_ctx *this_ctx = NULL; + struct request *requeue_list = NULL; + unsigned int depth = 0; + LIST_HEAD(list); + + do { + struct request *rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + rq_list_add(&requeue_list, rq); + continue; + } + list_add_tail(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + plug->mq_list = requeue_list; + trace_block_unplug(this_hctx->queue, depth, !from_sched); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; struct request *rq; - unsigned int depth; - LIST_HEAD(list); if (rq_list_empty(plug->mq_list)) return; @@ -2609,35 +2642,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; } - this_hctx = NULL; - this_ctx = NULL; - depth = 0; do { - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; + blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2724,7 +2731,8 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2736,6 +2744,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (unlikely(bio_queue_enter(bio))) return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2748,12 +2761,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2763,12 +2777,19 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + } + + rq_qos_throttle(q, *bio); + + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -2803,14 +2824,11 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - - rq_qos_throttle(q, bio); - - rq = blk_mq_get_cached_request(q, plug, bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio); + if (!bio) + return; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; } @@ -3065,6 +3083,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page; + if (list_empty(&tags->page_list)) + return; + if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else @@ -3107,15 +3128,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3164,10 +3211,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3412,6 +3458,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3421,12 +3469,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3451,8 +3498,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -3612,7 +3666,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -3719,7 +3774,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { @@ -3819,7 +3874,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -3830,7 +3885,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -3919,52 +3974,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; - struct blk_mq_hw_ctx *hctx; + int old_node; + int node = blk_mq_get_hctx_node(set, i); + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. If allocation fails, fallback - * to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -3973,24 +4004,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4015,6 +4049,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4025,9 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); @@ -4046,7 +4080,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); err_poll: @@ -4334,7 +4368,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -4493,6 +4528,7 @@ fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4709,7 +4745,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q) { if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; cancel_delayed_work_sync(&q->requeue_work); diff --git a/block/blk-mq.h b/block/blk-mq.h index 948791ea2a3e..2615bd58bad3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 3cfbc8668cba..68267007da1c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ if (q->rq_qos) { - bio_set_flag(bio, BIO_TRACKED); + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } @@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4c6b7dff71e5..85c4ba006671 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -739,27 +739,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } -/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ -static void blk_exit_queue(struct request_queue *q) -{ - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q); - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); -} - /** * blk_release_queue - releases all allocated resources of the request_queue * @kobj: pointer to a kobject, whose container is a request_queue @@ -787,12 +766,12 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); + percpu_ref_exit(&q->q_usage_counter); + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - blk_exit_queue(q); - blk_free_queue_stats(q->stats); kfree(q->poll_stat); @@ -880,6 +859,10 @@ int blk_register_queue(struct gendisk *disk) goto put_dev; } + ret = blk_crypto_sysfs_register(q); + if (ret) + goto put_dev; + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -910,6 +893,7 @@ unlock: return ret; put_dev: + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); @@ -954,16 +938,18 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); + blk_crypto_sysfs_unregister(q); blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->elevator) - elv_unregister_queue(q); + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); + + /* Now that we've deleted all child objects, we can delete the queue. */ + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a3b3ebc72dd4..469c483719be 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -874,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { + if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || + tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; @@ -1137,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t) struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); - struct request_queue *q = td->queue; struct throtl_service_queue *parent_sq; + struct request_queue *q; bool dispatched; int ret; + /* throtl_data may be gone, so figure out request queue by blkg */ + if (tg) + q = tg->pd.blkg->q; + else + q = td->queue; + spin_lock_irq(&q->queue_lock); + + if (!q->root_blkg) + goto out_unlock; + if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1766,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } +void blk_throtl_cancel_bios(struct request_queue *q) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { diff --git a/block/blk-throttle.h b/block/blk-throttle.h index b23a9f3abb82..c1b602996127 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -56,6 +56,7 @@ enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; enum { @@ -162,11 +163,13 @@ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } +static inline void blk_throtl_cancel_bios(struct request_queue *q) { } #else /* CONFIG_BLK_DEV_THROTTLING */ int blk_throtl_init(struct request_queue *q); void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); bool __blk_throtl_bio(struct bio *bio); +void blk_throtl_cancel_bios(struct request_queue *q); static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); diff --git a/block/blk.h b/block/blk.h index ebaa59ca46ca..6f21859c7f0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -325,7 +325,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; + return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } void update_io_ticks(struct block_device *part, unsigned long now, bool end); diff --git a/block/elevator.c b/block/elevator.c index 6847ab6e7aa5..c319765892bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -516,17 +519,17 @@ int elv_register_queue(struct request_queue *q, bool uevent) void elv_unregister_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->sysfs_lock); - if (q) { + if (e && e->registered) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); e->registered = 0; - /* Re-enable throttling in case elevator disabled it */ - wbt_enable_default(q); } } @@ -593,11 +596,7 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) - elv_unregister_queue(q); - - ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); + elv_unregister_queue(q); elevator_exit(q); } @@ -608,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { - blk_mq_sched_free_rqs(q); elevator_exit(q); goto out; } diff --git a/block/fops.c b/block/fops.c index 3696665e586a..e49096354dcd 100644 --- a/block/fops.c +++ b/block/fops.c @@ -287,6 +287,8 @@ static void blkdev_bio_end_io_async(struct bio *bio) struct kiocb *iocb = dio->iocb; ssize_t ret; + WRITE_ONCE(iocb->private, NULL); + if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; @@ -426,7 +428,8 @@ static int blkdev_writepages(struct address_space *mapping, } const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, @@ -563,34 +566,37 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); - size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; + size_t count; - if (unlikely(pos + count > size)) { + if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; - if (count > size) { - shorted = count - size; - iov_iter_truncate(to, size); - } + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); } + count = iov_iter_count(to); + if (!count) + goto reexpand; /* skip atime */ + if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; + if (filemap_range_needs_writeback(mapping, pos, + pos + count - 1)) { + ret = -EAGAIN; + goto reexpand; + } } else { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); + ret = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); if (ret < 0) - return ret; + goto reexpand; } file_accessed(iocb->ki_filp); @@ -600,12 +606,14 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iocb->ki_pos += ret; count -= ret; } + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) - return ret; + goto reexpand; } ret = filemap_read(iocb, to, ret); +reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; diff --git a/block/genhd.c b/block/genhd.c index 1ed46a6f94f5..c9a4fc90d3e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,10 +25,12 @@ #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> +#include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" static struct kobject *block_depr; @@ -557,6 +559,20 @@ out_free_ext_minor: EXPORT_SYMBOL(device_add_disk); /** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + set_bit(GD_DEAD, &disk->state); + blk_queue_start_drain(disk->queue); +} +EXPORT_SYMBOL_GPL(blk_mark_disk_dead); + +/** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * @@ -630,7 +646,8 @@ void del_gendisk(struct gendisk *disk) blk_mq_freeze_queue_wait(q); - rq_qos_exit(q); + blk_throtl_cancel_bios(disk->queue); + blk_sync_queue(q); blk_flush_integrity(); /* @@ -923,12 +940,17 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) inflight = blk_mq_in_flight(q, bdev); else inflight = part_in_flight(bdev); + if (inflight) { + part_stat_lock(); + update_io_ticks(bdev, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(bdev, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1096,6 +1118,31 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; +static void disk_release_mq(struct request_queue *q) +{ + blk_mq_cancel_work_sync(q); + + /* + * There can't be any non non-passthrough bios in flight here, but + * requests stay around longer, including passthrough ones so we + * still need to freeze the queue here. + */ + blk_mq_freeze_queue(q); + + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + mutex_lock(&q->sysfs_lock); + elevator_exit(q); + mutex_unlock(&q->sysfs_lock); + } + rq_qos_exit(q); + __blk_mq_unfreeze_queue(q, true); +} + /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1117,11 +1164,15 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); - blk_mq_cancel_work_sync(disk->queue); + if (queue_is_mq(disk->queue)) + disk_release_mq(disk->queue); + + blkcg_exit_queue(disk->queue); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); @@ -1188,12 +1239,17 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else inflight = part_in_flight(hd); + if (inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(hd, &stat); seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1322,6 +1378,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; + if (blkcg_init_queue(q)) + goto out_erase_part0; + rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1334,6 +1393,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #endif return disk; +out_erase_part0: + xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; |