From 19871b5c7a003946d3cd4209a348ab7c0df5dbad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:10 +0100 Subject: iomap: pass the length of the dirty region to ->map_blocks Let the file system know how much dirty data exists at the passed in offset. This allows file systems to allocate the right amount of space that actually is written back if they can't eagerly convert (e.g. because they don't support unwritten extents). Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-15-hch@lst.de Signed-off-by: Christian Brauner --- block/fops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa..93bae17ce660 100644 --- a/block/fops.c +++ b/block/fops.c @@ -482,7 +482,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); -- cgit v1.2.3-58-ga151 From 0f299da55ac3d28bc9de23a84fae01a85c4e253a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jan 2024 10:26:56 +0100 Subject: blk-mq: move blk_mq_attempt_bio_merge out blk_mq_get_new_requests blk_mq_attempt_bio_merge has nothing to do with allocating a new request, it avoids allocating a new request. Move the call out of blk_mq_get_new_requests and into the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20240124092658.2258309-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2dc01551e27c..bc032d06858e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2891,9 +2891,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - return NULL; - rq_qos_throttle(q, bio); if (plug) { @@ -2992,18 +2989,18 @@ void blk_mq_submit_bio(struct bio *bio) if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) - goto fail; + goto queue_exit; } if (!bio_integrity_prep(bio)) - goto fail; + goto queue_exit; } + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + goto queue_exit; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) { -fail: - blk_queue_exit(q); - return; - } + if (unlikely(!rq)) + goto queue_exit; done: trace_block_getrq(bio); @@ -3036,6 +3033,10 @@ done: } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } + return; + +queue_exit: + blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING -- cgit v1.2.3-58-ga151 From 337e89feb7c29043dacd851b6ac28542a9a8aacf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jan 2024 10:26:57 +0100 Subject: blk-mq: introduce a blk_mq_peek_cached_request helper Add a new helper to check if there is suitable cached request in blk_mq_submit_bio. This removes open coded logic in blk_mq_submit_bio and moves some checks that so far are in blk_mq_use_cached_rq to be performed earlier. This avoids the case where we first do check with the cached request but then later end up allocating a new one anyway and need to grab a queue reference. 
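In outline, the peek-then-commit split this series converges on looks like the following (names as in the diff below; an illustrative sketch, not the patch text):

	/* cheap, side-effect-free check against plug->cached_rq */
	rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
	...
	if (rq)
		blk_mq_use_cached_rq(rq, plug, bio);	/* commit to the cached request */
	else
		rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);

Because the peek has no side effects, the bio can still be split, merged, or rejected before any request state is touched or a queue reference is taken.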
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20240124092658.2258309-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 63 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bc032d06858e..ad57a259e975 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2909,22 +2909,31 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } /* - * Check if we can use the passed on request for submitting the passed in bio, - * and remove it from the request list if it can be used. + * Check if there is a suitable cached request and return it. */ -static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, - struct bio *bio) +static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, + struct request_queue *q, blk_opf_t opf) { - enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); - enum hctx_type hctx_type = rq->mq_hctx->type; + enum hctx_type type = blk_mq_get_hctx_type(opf); + struct request *rq; - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + if (type != rq->mq_hctx->type && + (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + return rq; +} - if (type != hctx_type && - !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return false; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; +static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) +{ + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2937,7 +2946,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, blk_mq_rq_time_init(rq, 0); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return true; } /** @@ -2965,11 +2973,7 @@ void blk_mq_submit_bio(struct bio *bio) bio = blk_queue_bounce(bio, q); - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q != q) - rq = NULL; - } + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); if (rq) { if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); @@ -2980,20 +2984,19 @@ void blk_mq_submit_bio(struct bio *bio) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_use_cached_rq(rq, plug, bio)) - goto done; - percpu_ref_get(&q->q_usage_counter); - } else { - if (unlikely(bio_queue_enter(bio))) - return; - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; - } - if (!bio_integrity_prep(bio)) + blk_mq_use_cached_rq(rq, plug, bio); + goto done; + } + + if (unlikely(bio_queue_enter(bio))) + return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) goto queue_exit; } + if (!bio_integrity_prep(bio)) + goto queue_exit; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; -- cgit v1.2.3-58-ga151 From 72e84e909eb5354e1e405c968dfdc4dcc23d41cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jan 2024 10:26:58 +0100 Subject: blk-mq: special case cached requests less 
Share the main merge / split / integrity preparation code between the cached request vs newly allocated request cases, and add comments explaining the cached request handling. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20240124092658.2258309-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ad57a259e975..c53a196f579e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2967,29 +2967,24 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq = NULL; unsigned int nr_segs = 1; + struct request *rq; blk_status_t ret; bio = blk_queue_bounce(bio, q); + /* + * If the plug has a cached request for this queue, try use it. + * + * The cached request already holds a q_usage_counter reference and we + * don't have to acquire a new one if we use it. + */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); - if (rq) { - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - if (!bio_integrity_prep(bio)) - return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + if (!rq) { + if (unlikely(bio_queue_enter(bio))) return; - blk_mq_use_cached_rq(rq, plug, bio); - goto done; } - if (unlikely(bio_queue_enter(bio))) - return; if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) @@ -3001,11 +2996,14 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) - goto queue_exit; + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + goto queue_exit; + } else { + blk_mq_use_cached_rq(rq, plug, bio); + } -done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -3039,7 +3037,12 @@ done: return; queue_exit: - blk_queue_exit(q); + /* + * Don't drop the queue reference if we were trying to use a cached + * request and thus didn't acquire one. + */ + if (!rq) + blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING -- cgit v1.2.3-58-ga151 From c4e47bbb00dad9240f4c054859950e962042ecb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 14:14:59 -0700 Subject: block: move cgroup time handling code into blk.h In preparation for moving time keeping into blk.h, move the cgroup related code for timestamps in here too. This will help avoid a circular dependency, and also moves it into a more appropriate header as this one is private to the block layer code. Leave struct bio_issue in blk_types.h as it's a proper time definition. 
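As a worked example of the packing these helpers implement (bit widths taken from the BIO_ISSUE_* macros in the diff below): a struct bio_issue is a single u64 that holds, from the most significant bit down, 1 reserved bit, 12 bits of original bio size, and 51 bits of issue time, roughly

	value = (reserved << 63) | ((u64)(size & 0xfff) << 51) | (ktime_get_ns() & ((1ULL << 51) - 1));

so bio_issue_time() and bio_issue_size() below are plain mask-and-shift accessors.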
Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 1 + block/blk.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blk_types.h | 42 ------------------------------------------ 3 files changed, 43 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b927a4a0ad03..78b74106bf10 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include "blk.h" struct blkcg_gq; struct blkg_policy_data; diff --git a/block/blk.h b/block/blk.h index 1ef920f72e0f..620e3a035da1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -516,4 +516,46 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..1c07848dea7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & 
BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; -- cgit v1.2.3-58-ga151 From 08420cf70cfb32eed2a0abfeb5c54c5651bd0c99 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:45:07 -0700 Subject: block: add blk_time_get_ns() and blk_time_get() helpers Convert any user of ktime_get_ns() to use blk_time_get_ns(), and ktime_get() to blk_time_get(), so we have a unified API for querying the current time in nanoseconds or as ktime. No functional changes intended, this patch just wraps ktime_get_ns() and ktime_get() with a block helper. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 14 +++++++------- block/bfq-iosched.c | 28 ++++++++++++++-------------- block/blk-cgroup.c | 2 +- block/blk-flush.c | 2 +- block/blk-iocost.c | 8 ++++---- block/blk-iolatency.c | 6 +++--- block/blk-mq.c | 16 ++++++++-------- block/blk-throttle.c | 6 +++--- block/blk-wbt.c | 6 +++--- block/blk.h | 13 ++++++++++++- 10 files changed, 56 insertions(+), 45 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2c90e5de0acd..d442ee358fc2 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } @@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } @@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, @@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } @@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3cce6de464a7..4b88a54a9b76 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, rq = rq_entry_fifo(bfqq->fifo.next); - if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL; 
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); @@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq) bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); + bfqd->last_empty_occupied_ns = blk_time_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting @@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get(); bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); - bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); bfqd->last_idling_start_jiffies = jiffies; hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), @@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -3590,7 +3590,7 @@ reset_computation: */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); @@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); @@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, */ if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; @@ -6194,7 +6194,7 @@ static bool 
__bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqq); } - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); bfqq->ttime.last_end_request = now_ns; @@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff93c385ba5a..bdbb557feb5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; diff --git a/block/blk-flush.c b/block/blk-flush.c index 3f4d41952ef2..b0f314f4bc14 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq) part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], - ktime_get_ns() - rq->start_time_ns); + blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c8beec6d7df0..4b0b483a9693 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) unsigned seq; u64 vrate; - now->now_ns = ktime_get(); + now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); @@ -2810,7 +2810,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) return; } - on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); @@ -2893,7 +2893,7 @@ static int blk_iocost_init(struct gendisk *disk) ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); - ioc->period_at = ktime_to_us(ktime_get()); + ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c1a6aba1d59e..ebb522788d97 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat->blkiolat->enabled) return; - 
now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu; if (blk_queue_nonrot(blkg->q)) diff --git a/block/blk-mq.c b/block/blk-mq.c index c53a196f579e..6d2f7b5caa01 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -322,7 +322,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } @@ -332,7 +332,7 @@ EXPORT_SYMBOL(blk_rq_init); static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); else rq->start_time_ns = 0; @@ -443,7 +443,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -628,7 +628,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a @@ -1041,7 +1041,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, ktime_get_ns()); + __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); @@ -1084,7 +1084,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) u64 now = 0; if (iob->need_ts) - now = ktime_get_ns(); + now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); @@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -3104,7 +3104,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) - blk_account_io_done(rq, ktime_get_ns()); + blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16f5766620a4..da9dc1f793c3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned 
long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) if (last_finish_time == 0) return; - now = ktime_get_ns() >> 10; + now = blk_time_get_ns() >> 10; if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) if (!tg->td->limit_valid[LIMIT_LOW]) return; - finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 5ba3cd574eac..8cb53bf4c7b1 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h" #define CREATE_TRACE_POINTS #include @@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) diff --git a/block/blk.h b/block/blk.h index 620e3a035da1..79ae533cdf02 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include /* for max_pfn/max_low_pfn */ +#include #include #include "blk-crypto-internal.h" @@ -516,6 +517,16 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +static inline u64 blk_time_get_ns(void) +{ + return ktime_get_ns(); +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} + /* * From most significant bit: * 1 bit: reserved for other usage, see below @@ -554,7 +565,7 @@ static inline void bio_issue_init(struct bio_issue *issue, { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } -- cgit v1.2.3-58-ga151 From da4c8c3d0975f031ef82d39927102e39fa6ddfac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:46:03 -0700 Subject: block: cache current nsec time in struct blk_plug Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO. None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter. If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock. 
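Condensed, the caching boils down to (simplified from the blk.h hunk below):

	static inline u64 blk_time_get_ns(void)
	{
		struct blk_plug *plug = current->plug;

		if (!plug)
			return ktime_get_ns();
		if (!plug->cur_ktime)	/* 0 doubles as "no cached value" */
			plug->cur_ktime = ktime_get_ns();
		return plug->cur_ktime;
	}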
On a basic peak IOPS test case with iostats enabled, this changes the performance from: IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31 to IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32 which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead: 10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp patch, as long as the driver supports it. It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk.h | 14 +++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index de771093b526..13b449df5ba0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1083,6 +1083,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; + plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); diff --git a/block/blk.h b/block/blk.h index 79ae533cdf02..14bbc4b780f2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -519,7 +519,19 @@ static inline int req_ref_read(struct request *req) static inline u64 blk_time_get_ns(void) { - return ktime_get_ns(); + struct blk_plug *plug = current->plug; + + if (!plug) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!plug->cur_ktime) + plug->cur_ktime = ktime_get_ns(); + return plug->cur_ktime; } static inline ktime_t blk_time_get(void) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..996d2ad756ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -942,6 +942,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; -- cgit v1.2.3-58-ga151 From 06b23f92af87a84d70881b2ecaa72e00f7838264 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 09:18:39 -0700 Subject: block: update cached timestamp post schedule/preemption Mark the task as having a cached timestamp when set assign it, so we can efficiently check if it needs updating post being scheduled back in. 
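In outline (the full diff follows): blk_time_get_ns() sets PF_BLOCK_TS in current->flags whenever it caches a timestamp in the plug, and the scheduler checks that flag when the task runs again:

	if (tsk->flags & PF_BLOCK_TS)
		blk_plug_invalidate_ts(tsk);	/* zero plug->cur_ktime, clear the flag */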
This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being then we'd need to have preemption disabled around plug state manipulation). Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk.h | 4 +++- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- kernel/sched/core.c | 6 ++++-- 5 files changed, 26 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 13b449df5ba0..314c3065891a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1183,6 +1183,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + current->flags &= ~PF_BLOCK_TS; } /** diff --git a/block/blk.h b/block/blk.h index 14bbc4b780f2..913c93838a01 100644 --- a/block/blk.h +++ b/block/blk.h @@ -529,8 +529,10 @@ static inline u64 blk_time_get_ns(void) * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ - if (!plug->cur_ktime) + if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } return plug->cur_ktime; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996d2ad756ff..d7cac3de65b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..15b7cb478d16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. 
*/ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..083f2258182d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6787,10 +6787,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } -- cgit v1.2.3-58-ga151 From 3bca7640b4c50621b94365a1746f4b86116fec56 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 23 Jan 2024 16:12:48 +0800 Subject: blk-throttle: Eliminate redundant checks for data direction After calling throtl_peek_queued(), the data direction can be determined so there is no need to call bio_data_dir() to check the direction again. Signed-off-by: Tang Yizhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240123081248.3752878-1-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index da9dc1f793c3..f4850a6f860b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) @@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) -- cgit v1.2.3-58-ga151 From 449813515d3e5efec85206bb91588a6249a421a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Feb 2024 12:39:25 -0800 Subject: block, fs: Restore the per-bio/request data lifetime fields Restore support for passing data lifetime information from filesystems to block drivers. This patch reverts commit b179c98f7697 ("block: Remove request.write_hint") and commit c75e707fe1aa ("block: remove the per-bio/request write hint"). This patch does not modify the size of struct bio because the new bi_write_hint member fills a hole in struct bio. 
pahole reports the following for struct bio on an x86_64 system with this patch applied: /* size: 112, cachelines: 2, members: 20 */ /* sum members: 110, holes: 1, sum holes: 2 */ /* last cacheline: 48 bytes */ Reviewed-by: Kanchan Joshi Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240202203926.2478590-7-bvanassche@acm.org Signed-off-by: Christian Brauner --- block/bio.c | 2 ++ block/blk-crypto-fallback.c | 1 + block/blk-merge.c | 8 ++++++++ block/blk-mq.c | 2 ++ block/bounce.c | 1 + block/fops.c | 3 +++ fs/buffer.c | 12 ++++++++---- fs/direct-io.c | 2 ++ fs/iomap/buffered-io.c | 1 + fs/iomap/direct-io.c | 1 + fs/mpage.c | 1 + include/linux/blk-mq.h | 2 ++ include/linux/blk_types.h | 2 ++ 13 files changed, 34 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index b9642a41f286..c9223e9d31da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; + bio->bi_write_hint = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -813,6 +814,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index e6468eab2681..b1e7415f8439 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-merge.c b/block/blk-merge.c index 2d470cf2173e..2a06fd33039d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; + /* Don't merge requests with different write hints. */ + if (req->write_hint != next->write_hint) + return NULL; + if (req->ioprio != next->ioprio) return NULL; @@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; + /* Don't merge requests with different write hints. */ + if (rq->write_hint != bio->bi_write_hint) + return false; + if (rq->ioprio != bio_prio(bio)) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index aa87fcfda1ec..34ceb15d2ea4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2585,6 +2585,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. 
*/ @@ -3185,6 +3186,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + rq->write_hint = rq_src->write_hint; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; diff --git a/block/bounce.c b/block/bounce.c index 7cfcb242f9a1..d6a5219f29dd 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa..ab0e37d1dc48 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5..eb7d3ded2c33 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1944,7 +1945,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; @@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_write_hint = write_hint; __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - 
submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/direct-io.c b/fs/direct-io.c index 60456263a338..62c97ff9e852 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_io; if (dio->is_pinned) bio_set_flag(bio, BIO_PAGE_PINNED); + bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 093c4515b22a..18e1fef53fbc 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1667,6 +1667,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = sector; + bio->bi_write_hint = inode->i_write_hint; wbc_init_bio(wbc, bio); ioend = container_of(bio, struct iomap_ioend, io_inline_bio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea4..f3b43d223a46 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_write_hint = inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/mpage.c b/fs/mpage.c index 738882e0766d..fa8b99a199fa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,7 @@ alloc_new: GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..492b0128b5d9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,6 +8,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -135,6 +136,7 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif + enum rw_hint write_hint; unsigned short ioprio; enum mq_rq_state state; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..12d87cef2c03 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bio_set; struct bio; @@ -269,6 +270,7 @@ struct bio { */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; + enum rw_hint bi_write_hint; blk_status_t bi_status; atomic_t __bi_remaining; -- cgit v1.2.3-58-ga151 From c9f5f3aa19c617fe85085b19abbf7a9a077336d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Feb 2024 14:14:28 +0000 Subject: block: extend bio caching to task context bio_put_percpu_cache() puts all non-iopoll bios into the irq-safe list, which entails disabling irqs. The overhead of that is not that bad when interrupts are already off but getting worse otherwise. We can optimise it when we're in the task context by using ->free_list directly just as the IOPOLL path does. 
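Schematically, the fast path this adds is (simplified; the full bio.c hunk follows):

	if (in_task()) {
		/* task context: the plain free_list needs no irq protection */
		bio->bi_next = cache->free_list;
		cache->free_list = bio;
		cache->nr++;
	} else {
		/* otherwise keep using the irq-safe free_list_irq */
	}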
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4774e1a0f905f96c63174b0f3e4f79f0d9b63246.1707314970.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index b9642a41f286..8da941974f88 100644 --- a/block/bio.c +++ b/block/bio.c @@ -770,8 +770,9 @@ static inline void bio_put_percpu_cache(struct bio *bio) bio_uninit(bio); - if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + if (in_task()) { bio->bi_next = cache->free_list; + /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; -- cgit v1.2.3-58-ga151 From e516c3fc6c182736aec5418a73f15199640491e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Feb 2024 14:14:29 +0000 Subject: block: optimise in irq bio put caching When enlisting a bio into ->free_list_irq we protect the list by disabling irqs. It's likely they're already disabled and performance of local_irq_{save,restore}() is decent, but it's not zero cost. Let's only use the irq cache when when we're serving a hard irq, which allows to remove local_irq_{save,restore}(), and fall back to bio_free() in all left cases. Profiles indicate that the bio_put() cost is reduced by ~3.5 times (1.76% -> 0.49%), and total throughput of a CPU bound benchmark improve by around 1% (t/io_uring with high QD and several drives). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/36d207540b7046c653cc16e5ff08fe7234b19f81.1707314970.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8da941974f88..00847ff1415c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -762,30 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio) struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { - put_cpu(); - bio_free(bio); - return; - } - - bio_uninit(bio); + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) + goto out_free; if (in_task()) { + bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; - } else { - unsigned long flags; + } else if (in_hardirq()) { + lockdep_assert_irqs_disabled(); - local_irq_save(flags); + bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; - local_irq_restore(flags); + } else { + goto out_free; } put_cpu(); + return; +out_free: + put_cpu(); + bio_free(bio); } /** -- cgit v1.2.3-58-ga151 From 48ff13a618b54aabc447659a9016068cf0cae322 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 17:43:23 +0800 Subject: block: Simplify the allocation of slab caches Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. 
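For reference, KMEM_CACHE() is defined in include/linux/slab.h as roughly

	#define KMEM_CACHE(__struct, __flags)					\
		kmem_cache_create(#__struct, sizeof(struct __struct),		\
				  __alignof__(struct __struct), (__flags), NULL)

so the conversion below is equivalent to the open-coded call, except that it also passes the structure's natural alignment instead of 0.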
Signed-off-by: Kunwu Chan Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20240131094323.146659-1-chentao@kylinos.cn Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 314c3065891a..2b11d8325fde 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1232,8 +1232,7 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); -- cgit v1.2.3-58-ga151 From 71f4ecdbb42addf82b01b734b122a02707fed521 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Sun, 28 Jan 2024 23:52:20 -0800 Subject: block: remove gfp_flags from blkdev_zone_mgmt Now that all callers pass in GFP_KERNEL to blkdev_zone_mgmt() and use memalloc_no{io,fs}_{save,restore}() to define the allocation scope, we can drop the gfp_mask parameter from blkdev_zone_mgmt() as well as blkdev_zone_reset_all() and blkdev_zone_reset_all_emulated(). Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240128-zonefs_nofs-v3-5-ae3b7c8def61@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 19 ++++++++----------- drivers/md/dm-zoned-metadata.c | 2 +- drivers/nvme/target/zns.c | 5 ++--- fs/btrfs/zoned.c | 14 +++++--------- fs/f2fs/segment.c | 4 ++-- fs/zonefs/super.c | 2 +- include/linux/blkdev.h | 2 +- 7 files changed, 20 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d343e5756a9c..d4f4f8325eff 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -177,8 +177,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, } } -static int blkdev_zone_reset_all_emulated(struct block_device *bdev, - gfp_t gfp_mask) +static int blkdev_zone_reset_all_emulated(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); @@ -205,7 +204,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, } bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - gfp_mask); + GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -223,7 +222,7 @@ out_free_need_reset: return ret; } -static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -238,7 +237,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Perform the specified operation on the range of zones specified by @@ -248,7 +246,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * or finish request. 
*/ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) + sector_t sector, sector_t nr_sectors) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); @@ -285,12 +283,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev, gfp_mask); - return blkdev_zone_reset_all(bdev, gfp_mask); + return blkdev_zone_reset_all_emulated(bdev); + return blkdev_zone_reset_all(bdev); } while (sector < end_sector) { - bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -419,8 +417,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: if (cmd == BLKRESETZONE) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 165996cc966c..8156881a31de 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1660,7 +1660,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) noio_flag = memalloc_noio_save(); ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), - zmd->zone_nr_sectors, GFP_KERNEL); + zmd->zone_nr_sectors); memalloc_noio_restore(noio_flag); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 5b5c1e481722..3148d9f1bde6 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -456,8 +456,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) switch (zsa_req_op(req->cmd->zms.zsa)) { case REQ_OP_ZONE_RESET: ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, - get_capacity(req->ns->bdev->bd_disk), - GFP_KERNEL); + get_capacity(req->ns->bdev->bd_disk)); if (ret < 0) return blkdev_zone_mgmt_errno_to_nvme_status(ret); break; @@ -508,7 +507,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) goto out; } - ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors); if (ret < 0) status = blkdev_zone_mgmt_errno_to_nvme_status(ret); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 05640d61e435..cf2e779d8ef4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -830,8 +830,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_KERNEL); + reset->start, reset->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -984,7 +983,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_KERNEL); + zone->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1023,8 +1022,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, - GFP_KERNEL); + zone_sectors * 
BTRFS_NR_SB_LOG_ZONES); memalloc_nofs_restore(nofs_flags); return ret; } @@ -1143,8 +1141,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, *bytes = 0; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_KERNEL); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -2258,8 +2255,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_KERNEL); + zinfo->zone_size >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0094fe491364..e1065ba70207 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1977,7 +1977,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_KERNEL); + sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } @@ -4921,7 +4921,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, - zone->start, zone->len, GFP_KERNEL); + zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 63fbac018c04..cadb1364f951 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb, trace_zonefs_zone_mgmt(sb, z, op); ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, - z->z_size >> SECTOR_SHIFT, GFP_KERNEL); + z->z_size >> SECTOR_SHIFT); if (ret) { zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7cac3de65b3..fac580976e3a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,7 +325,7 @@ void disk_set_zoned(struct gendisk *disk); int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); + sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); -- cgit v1.2.3-58-ga151 From 6b5c132a3f0d3b7c024ae98f0ace07c04d32cf73 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 1 Feb 2024 18:31:24 +0530 Subject: block: refactor guard helpers Allow computation using the existing guard value. This is a prep patch. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240201130126.211402-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- block/t10-pi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/t10-pi.c b/block/t10-pi.c index 914d8cddd43a..251a7b188963 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,14 +12,14 @@ #include #include -typedef __be16 (csum_fn) (void *, unsigned int); +typedef __be16 (csum_fn) (__be16, void *, unsigned int); -static __be16 t10_pi_crc_fn(void *data, unsigned int len) +static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) { - return cpu_to_be16(crc_t10dif(data, len)); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); } -static __be16 t10_pi_ip_fn(void *data, unsigned int len) +static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) { return (__force __be16)ip_compute_csum(data, len); } @@ -37,7 +37,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf; - pi->guard_tag = fn(iter->data_buf, iter->interval); + pi->guard_tag = fn(0, iter->data_buf, iter->interval); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -83,7 +83,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, goto next; } - csum = fn(iter->data_buf, iter->interval); + csum = fn(0, iter->data_buf, iter->interval); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -280,9 +280,9 @@ const struct blk_integrity_profile t10_pi_type3_ip = { }; EXPORT_SYMBOL(t10_pi_type3_ip); -static __be64 ext_pi_crc64(void *data, unsigned int len) +static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { - return cpu_to_be64(crc64_rocksoft(data, len)); + return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); } static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, @@ -293,7 +293,7 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct crc64_pi_tuple *pi = iter->prot_buf; - pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval); + pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -343,7 +343,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, goto next; } - csum = ext_pi_crc64(iter->data_buf, iter->interval); + csum = ext_pi_crc64(0, iter->data_buf, iter->interval); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ "(rcvd %016llx, want %016llx)\n", -- cgit v1.2.3-58-ga151 From 60d21aac52e26531affdadb7543fe5b93f58b450 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 1 Feb 2024 18:31:25 +0530 Subject: block: support PI at non-zero offset within metadata Block layer integrity processing assumes that protection information (PI) is placed in the first bytes of each metadata block. Remove this limitation and include the metadata before the PI in the calculation of the guard tag. Signed-off-by: Kanchan Joshi Signed-off-by: Chinmay Gameti Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20240201130126.211402-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 + block/blk-integrity.c | 1 + block/t10-pi.c | 52 +++++++++++++++++++++++++++++++------------ include/linux/blk-integrity.h | 1 + include/linux/blkdev.h | 1 + 5 files changed, 42 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c9a16fba58b9..2e3e8e04961e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); + iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d4e9b4556d14..ccbeb6dfa87a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->profile = template->profile ? template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; + bi->pi_offset = template->pi_offset; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); diff --git a/block/t10-pi.c b/block/t10-pi.c index 251a7b188963..d90892fd6f2a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; pi->guard_tag = fn(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, + offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; BUG_ON(type == T10_PI_TYPE0_PROTECTION); for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; if (type == T10_PI_TYPE1_PROTECTION || @@ -84,6 +89,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, } csum = fn(0, iter->data_buf, iter->interval); + if (offset) + csum = fn(csum, iter->prot_buf, offset); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) */ static void t10_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == virt) pi->ref_tag = cpu_to_be32(ref_tag); @@ -183,9 +192,11 @@ static void 
t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == ref_tag) pi->ref_tag = cpu_to_be32(virt); @@ -288,12 +299,16 @@ static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), + iter->prot_buf, offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag) static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; u64 ref, seed; __be64 csum; @@ -344,6 +360,10 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, } csum = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, + offset); + if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ "(rcvd %016llx, want %016llx)\n", @@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) static void ext_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == virt) @@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { 
- struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == ref_tag) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 378b2459efe2..e253e7bd0d17 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -20,6 +20,7 @@ struct blk_integrity_iter { unsigned int data_size; unsigned short interval; unsigned char tuple_size; + unsigned char pi_offset; const char *disk_name; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fac580976e3a..0058783a4c43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; unsigned char tuple_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; -- cgit v1.2.3-58-ga151 From b9947297d00b9c75b271db88165e39c0208bec2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:12 +0100 Subject: block: refactor disk_update_readahead Factor out a blk_apply_bdi_limits limits helper that can be used with an explicit queue_limits argument, which will be useful later. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 06ea91e51b8b..f16d3fec6658 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -85,6 +85,17 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); +static void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim) +{ + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; +} + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device @@ -393,15 +404,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); void disk_update_readahead(struct gendisk *disk) { - struct request_queue *q = disk->queue; - - /* - * For read-ahead of large files to be effective, we need to read ahead - * at least twice the optimal I/O size. - */ - disk->bdi->ra_pages = - max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); } EXPORT_SYMBOL_GPL(disk_update_readahead); -- cgit v1.2.3-58-ga151 From c490f226a0ea22639e81ac34edc884e238ea955a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:13 +0100 Subject: block: decouple blk_set_stacking_limits from blk_set_default_limits blk_set_stacking_limits uses very little from blk_set_default_limits. Open code these initializations in preparation for rewriting blk_set_default_limits. 
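For illustration, the open-coded seed is what stacking drivers keep building on: they reset the limits with blk_set_stacking_limits() and then fold in every underlying device with blk_stack_limits(). A minimal sketch, not taken from this patch (the device list and its members are hypothetical):

	struct queue_limits lim;

	blk_set_stacking_limits(&lim);	/* permissive stacking defaults */

	/* tighten the limits to what all component devices support */
	list_for_each_entry(dev, &devices, list)	/* hypothetical list */
		blk_stack_limits(&lim, &bdev_get_queue(dev->bdev)->limits,
				 get_start_sect(dev->bdev));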
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f16d3fec6658..24042f6b33d5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -65,13 +65,18 @@ void blk_set_default_limits(struct queue_limits *lim) * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * - * Description: - * Returns a queue_limit struct to its default state. Should be used - * by stacking drivers like DM that have no internal limits. + * Prepare queue limits for applying limits from underlying devices using + * blk_stack_limits(). */ void blk_set_stacking_limits(struct queue_limits *lim) { - blk_set_default_limits(lim); + memset(lim, 0, sizeof(*lim)); + lim->logical_block_size = SECTOR_SIZE; + lim->physical_block_size = SECTOR_SIZE; + lim->io_min = SECTOR_SIZE; + lim->discard_granularity = SECTOR_SIZE; + lim->dma_alignment = SECTOR_SIZE - 1; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; -- cgit v1.2.3-58-ga151 From d690cb8ae14bd377d422b7905b6959c7e7a45b95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:14 +0100 Subject: block: add an API to atomically update queue limits Add a new queue_limits_{start,commit}_update pair of functions that allows taking an atomic snapshot of queue limits, update it, and commit it if it passes validity checking. Also use the low-level validation helper to implement blk_set_default_limits instead of duplicating the initialization. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-settings.c | 228 +++++++++++++++++++++++++++++++++++++++++-------- block/blk.h | 2 +- include/linux/blkdev.h | 23 +++++ 4 files changed, 217 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2b11d8325fde..cb56724a8dfb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -425,6 +425,7 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); diff --git a/block/blk-settings.c b/block/blk-settings.c index 24042f6b33d5..786b369ca599 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -25,42 +25,6 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); -/** - * blk_set_default_limits - reset limits to default values - * @lim: the queue_limits structure to reset - * - * Description: - * Returns a queue_limit struct to its default state. 
- */ -void blk_set_default_limits(struct queue_limits *lim) -{ - lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_segments = 1; - lim->max_integrity_segments = 0; - lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->virt_boundary_mask = 0; - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_user_sectors = lim->max_dev_sectors = 0; - lim->chunk_sectors = 0; - lim->max_write_zeroes_sectors = 0; - lim->max_zone_append_sectors = 0; - lim->max_discard_sectors = 0; - lim->max_hw_discard_sectors = 0; - lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 512; - lim->discard_alignment = 0; - lim->discard_misaligned = 0; - lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce = BLK_BOUNCE_NONE; - lim->alignment_offset = 0; - lim->io_opt = 0; - lim->misaligned = 0; - lim->zoned = false; - lim->zone_write_granularity = 0; - lim->dma_alignment = 511; -} - /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -101,6 +65,198 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi, bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } +static int blk_validate_zoned_limits(struct queue_limits *lim) +{ + if (!lim->zoned) { + if (WARN_ON_ONCE(lim->max_open_zones) || + WARN_ON_ONCE(lim->max_active_zones) || + WARN_ON_ONCE(lim->zone_write_granularity) || + WARN_ON_ONCE(lim->max_zone_append_sectors)) + return -EINVAL; + return 0; + } + + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) + return -EINVAL; + + if (lim->zone_write_granularity < lim->logical_block_size) + lim->zone_write_granularity = lim->logical_block_size; + + if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size + * and the zone size given that it can't span zones. + */ + lim->max_zone_append_sectors = + min3(lim->max_hw_sectors, + lim->max_zone_append_sectors, + lim->chunk_sectors); + } + + return 0; +} + +/* + * Check that the limits in lim are valid, initialize defaults for unset + * values, and cap values based on others where needed. + */ +static int blk_validate_limits(struct queue_limits *lim) +{ + unsigned int max_hw_sectors; + + /* + * Unless otherwise specified, default to 512 byte logical blocks and a + * physical block size equal to the logical block size. + */ + if (!lim->logical_block_size) + lim->logical_block_size = SECTOR_SIZE; + if (lim->physical_block_size < lim->logical_block_size) + lim->physical_block_size = lim->logical_block_size; + + /* + * The minimum I/O size defaults to the physical block size unless + * explicitly overridden. + */ + if (lim->io_min < lim->physical_block_size) + lim->io_min = lim->physical_block_size; + + /* + * max_hw_sectors has a somewhat weird default for historical reasons, + * but drivers really should set their own instead of relying on this + * value. + * + * The block layer relies on the fact that every driver can + * handle at least a page worth of data per I/O, and needs the value + * aligned to the logical block size. + */ + if (!lim->max_hw_sectors) + lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) + return -EINVAL; + lim->max_hw_sectors = round_down(lim->max_hw_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * The actual max_sectors value is a complex beast and also takes the + * max_dev_sectors value (set by SCSI ULPs) and a user configurable + * value into account.
The ->max_sectors value is always calculated + * from these, so directly setting it won't have any effect. + */ + max_hw_sectors = min_not_zero(lim->max_hw_sectors, + lim->max_dev_sectors); + if (lim->max_user_sectors) { + if (lim->max_user_sectors > max_hw_sectors || + lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + return -EINVAL; + lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else { + lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); + } + lim->max_sectors = round_down(lim->max_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * Random default for the maximum number of segments. Drivers should not + * rely on this and should set their own. + */ + if (!lim->max_segments) + lim->max_segments = BLK_MAX_SEGMENTS; + + lim->max_discard_sectors = lim->max_hw_discard_sectors; + if (!lim->max_discard_segments) + lim->max_discard_segments = 1; + + if (lim->discard_granularity < lim->physical_block_size) + lim->discard_granularity = lim->physical_block_size; + + /* + * By default there is no limit on the segment boundary alignment, + * but if there is one it can't be smaller than the page size as + * that would break all the normal I/O patterns. + */ + if (!lim->seg_boundary_mask) + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + return -EINVAL; + + /* + * The maximum segment size has an odd historic 64k default that + * drivers probably should override. Just like the I/O size we + * require drivers to at least handle a full page per segment. + */ + if (!lim->max_segment_size) + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + return -EINVAL; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be identical to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */ + if (lim->virt_boundary_mask) { + if (WARN_ON_ONCE(lim->max_segment_size && + lim->max_segment_size != UINT_MAX)) + return -EINVAL; + lim->max_segment_size = UINT_MAX; + } + + /* + * We require drivers to at least do logical block aligned I/O, but + * historically could not check for that due to the separate calls + * to set the limits. Once the transition is finished the check + * below should be narrowed down to check the logical block size. + */ + if (!lim->dma_alignment) + lim->dma_alignment = SECTOR_SIZE - 1; + if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) + return -EINVAL; + + if (lim->alignment_offset) { + lim->alignment_offset &= (lim->physical_block_size - 1); + lim->misaligned = 0; + } + + return blk_validate_zoned_limits(lim); +} + +/* + * Set the default limits for a newly allocated queue. @lim contains the + * initial limits set by the driver, which could be no limit, in which case + * all fields are cleared to zero. + */ +int blk_set_default_limits(struct queue_limits *lim) +{ + return blk_validate_limits(lim); +} + +/** + * queue_limits_commit_update - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated by the caller to @q. + * + * Returns 0 if successful, else a negative error code.
+ */ +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim) + __releases(q->limits_lock) +{ + int error = blk_validate_limits(lim); + + if (!error) { + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); + } + mutex_unlock(&q->limits_lock); + return error; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index 913c93838a01..43c7e9180b30 100644 --- a/block/blk.h +++ b/block/blk.h @@ -330,7 +330,7 @@ void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_set_default_limits(struct queue_limits *lim); +int blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251a11d2d2ae..d41d7fe93457 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -474,6 +474,7 @@ struct request_queue { struct mutex sysfs_lock; struct mutex sysfs_dir_lock; + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -862,6 +863,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, return chunk_sectors - (offset & (chunk_sectors - 1)); } +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. The caller must have frozen the queue or ensured + * that there is outstanding I/O by other means. + */ +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) + __acquires(q->limits_lock) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); + /* * Access functions for manipulating queue properties */ -- cgit v1.2.3-58-ga151 From 0327ca9d53bfbb0918867313049bba7046900f73 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:15 +0100 Subject: block: use queue_limits_commit_update in queue_max_sectors_store Convert queue_max_sectors_store to use queue_limits_commit_update to check and update the max_sectors limit and freeze the queue before doing so to ensure we don't have requests in flight while changing the limits. Note that this removes the previously held queue_lock that doesn't protect against any other reader or writer. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6b2429cad81a..26607f9825cb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -226,35 +226,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { - unsigned long var; - unsigned int max_sectors_kb, - max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, - page_kb = 1 << (PAGE_SHIFT - 10); - ssize_t ret = queue_var_store(&var, page, count); + unsigned long max_sectors_kb; + struct queue_limits lim; + ssize_t ret; + int err; + ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - max_sectors_kb = (unsigned int)var; - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, - q->limits.max_dev_sectors >> 1); - if (max_sectors_kb == 0) { - q->limits.max_user_sectors = 0; - max_sectors_kb = min(max_hw_sectors_kb, - BLK_DEF_MAX_SECTORS_CAP >> 1); - } else { - if (max_sectors_kb > max_hw_sectors_kb || - max_sectors_kb < page_kb) - return -EINVAL; - q->limits.max_user_sectors = max_sectors_kb << 1; - } - - spin_lock_irq(&q->queue_lock); - q->limits.max_sectors = max_sectors_kb << 1; - if (q->disk) - q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); - spin_unlock_irq(&q->queue_lock); - + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_sectors = max_sectors_kb << 1; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; return ret; } -- cgit v1.2.3-58-ga151 From 4f563a64732dabb2677c7d1232a8f714a18b41b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:16 +0100 Subject: block: add a max_user_discard_sectors queue limit Add a new max_user_discard_sectors limit that mirrors max_user_sectors and stores the value that the user manually set. This now allows updates of the max_hw_discard_sectors to not worry about the user limit. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 18 +++++++++++++++--- block/blk-sysfs.c | 17 ++++++++--------- include/linux/blkdev.h | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 786b369ca599..c4406aacc0ef 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -51,6 +51,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; + lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -162,7 +163,9 @@ static int blk_validate_limits(struct queue_limits *lim) if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_sectors = lim->max_hw_discard_sectors; + lim->max_discard_sectors = + min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + if (!lim->max_discard_segments) lim->max_discard_segments = 1; @@ -228,6 +231,12 @@ static int blk_validate_limits(struct queue_limits *lim) */ int blk_set_default_limits(struct queue_limits *lim) { + /* + * Most defaults are set by capping the bounds in blk_validate_limits, + * but max_user_discard_sectors is special and needs an explicit + * initialization to the max value here. + */ + lim->max_user_discard_sectors = UINT_MAX; return blk_validate_limits(lim); } @@ -349,8 +358,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { - q->limits.max_hw_discard_sectors = max_discard_sectors; - q->limits.max_discard_sectors = max_discard_sectors; + struct queue_limits *lim = &q->limits; + + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_discard_sectors = + min(max_discard_sectors, lim->max_user_discard_sectors); } EXPORT_SYMBOL(blk_queue_max_discard_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 26607f9825cb..a1ec27f0ba41 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -174,23 +174,22 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) static ssize_t queue_discard_max_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_discard; - ssize_t ret = queue_var_store(&max_discard, page, count); + unsigned long max_discard_bytes; + ssize_t ret; + ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; - if (max_discard & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (q->limits.discard_granularity - 1)) return -EINVAL; - max_discard >>= 9; - if (max_discard > UINT_MAX) + if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - if (max_discard > q->limits.max_hw_discard_sectors) - max_discard = q->limits.max_hw_discard_sectors; - - q->limits.max_discard_sectors = max_discard; + q->limits.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + q->limits.max_discard_sectors = min(q->limits.max_hw_discard_sectors, + q->limits.max_user_discard_sectors); return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d41d7fe93457..45746ba73670 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -291,6 +291,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; 
unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; -- cgit v1.2.3-58-ga151 From ff956a3be95b45b2a823693a8c9db740939ca35e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:17 +0100 Subject: block: use queue_limits_commit_update in queue_discard_max_store Convert queue_discard_max_store to use queue_limits_commit_update to check and update the max_discard_sectors limit and freeze the queue before doing so to ensure we don't have requests in flight while changing the limits. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a1ec27f0ba41..8c8f69d8ba48 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -175,7 +175,9 @@ static ssize_t queue_discard_max_store(struct request_queue *q, const char *page, size_t count) { unsigned long max_discard_bytes; + struct queue_limits lim; ssize_t ret; + int err; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) @@ -187,9 +189,14 @@ static ssize_t queue_discard_max_store(struct request_queue *q, if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - q->limits.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - q->limits.max_discard_sectors = min(q->limits.max_hw_discard_sectors, - q->limits.max_user_discard_sectors); + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + + if (err) + return err; return ret; } -- cgit v1.2.3-58-ga151 From ad751ba1f8d5d4f4f4b429b552a154e888524a93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:18 +0100 Subject: block: pass a queue_limits argument to blk_alloc_queue Pass a queue_limits to blk_alloc_queue and apply it after validating and capping the values using blk_validate_limits. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. 
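As an illustrative sketch of the resulting calling convention (hypothetical caller; the limit values below are made up, and unset fields are filled in by blk_validate_limits()):

	struct queue_limits lim = {
		.logical_block_size	= 4096,
		.max_hw_sectors		= 2048,
	};
	struct request_queue *q;

	q = blk_alloc_queue(&lim, NUMA_NO_NODE);
	if (IS_ERR(q))
		return PTR_ERR(q);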
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 26 ++++++++++++++++++-------- block/blk-mq.c | 7 ++++--- block/blk.h | 2 +- block/genhd.c | 5 +++-- 4 files changed, 26 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index cb56724a8dfb..a16b5abdbbf5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; + int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) - return NULL; + return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); - if (q->id < 0) + if (q->id < 0) { + error = q->id; goto fail_q; + } q->stats = blk_alloc_queue_stats(); - if (!q->stats) + if (!q->stats) { + error = -ENOMEM; goto fail_id; + } + + error = blk_set_default_limits(lim); + if (error) + goto fail_stats; + q->limits = *lim; q->node = node_id; @@ -436,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id) * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ - if (percpu_ref_init(&q->q_usage_counter, + error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); + if (error) goto fail_stats; - blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -452,7 +462,7 @@ fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); - return NULL; + return ERR_PTR(error); } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d2f7b5caa01..9dd8055cc524 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4086,12 +4086,13 @@ void blk_mq_release(struct request_queue *q) static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { + struct queue_limits lim = { }; struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); - if (!q) - return ERR_PTR(-ENOMEM); + q = blk_alloc_queue(&lim, set->numa_node); + if (IS_ERR(q)) + return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { diff --git a/block/blk.h b/block/blk.h index 43c7e9180b30..7c30e2ac8ebc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -448,7 +448,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page) unpin_user_page(page); } -struct request_queue *blk_alloc_queue(int node_id); +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); diff --git a/block/genhd.c b/block/genhd.c index d74fb5b4ae68..7a8fd57c51f7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1393,11 +1393,12 @@ out_free_disk: struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { + struct queue_limits lim = { }; struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); - if (!q) + q = blk_alloc_queue(&lim, node); + if (IS_ERR(q)) return NULL; disk = __alloc_disk_node(q, node, lkclass); -- cgit v1.2.3-58-ga151 From 9ac4dd8c47d533eb420af6a679e66ec74771125c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:19 
+0100 Subject: block: pass a queue_limits argument to blk_mq_init_queue Pass a queue_limits to blk_mq_init_queue and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also rename the function to blk_mq_alloc_queue as that is a much better name for a function that allocates a queue and always pass the queuedata argument instead of having a separate version for the extra argument. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 ++++++++------------- block/bsg-lib.c | 2 +- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/core.c | 6 +++--- drivers/scsi/scsi_scan.c | 2 +- drivers/ufs/core/ufshcd.c | 2 +- include/linux/blk-mq.h | 3 ++- 7 files changed, 17 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9dd8055cc524..f6499bbd89be 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4083,14 +4083,14 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, - void *queuedata) +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata) { - struct queue_limits lim = { }; + struct queue_limits default_lim = { }; struct request_queue *q; int ret; - q = blk_alloc_queue(&lim, set->numa_node); + q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; @@ -4101,20 +4101,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, } return q; } - -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) -{ - return blk_mq_init_queue_data(set, NULL); -} -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping - * the reference from blk_mq_init_queue() by calling blk_put_queue(). + * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). 
* * Context: can sleep */ @@ -4141,7 +4136,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct request_queue *q; struct gendisk *disk; - q = blk_mq_init_queue_data(set, queuedata); + q = blk_mq_alloc_queue(set, NULL, queuedata); if (IS_ERR(q)) return ERR_CAST(q); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index b3acdbdb6e7e..bcc7dee6abce 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, if (blk_mq_alloc_tag_set(set)) goto out_tag_set; - q = blk_mq_init_queue(set); + q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out_queue; diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index c727cd1f264b..a480cdeac288 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset); + anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL); if (IS_ERR(anv->ctrl.admin_q)) { ret = -ENOMEM; goto put_dev; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6e7f9b13fba2..5bcdf3654598 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4372,14 +4372,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ret) return ret; - ctrl->admin_q = blk_mq_init_queue(set); + ctrl->admin_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->admin_q)) { ret = PTR_ERR(ctrl->admin_q); goto out_free_tagset; } if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->fabrics_q = blk_mq_init_queue(set); + ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->fabrics_q)) { ret = PTR_ERR(ctrl->fabrics_q); goto out_cleanup_admin_q; @@ -4443,7 +4443,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return ret; if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->connect_q = blk_mq_init_queue(set); + ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->connect_q)) { ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 44680f65ea14..9969f4e2f1c3 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -332,7 +332,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, sdev->sg_reserved_size = INT_MAX; - q = blk_mq_init_queue(&sdev->host->tag_set); + q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL); if (IS_ERR(q)) { /* release fn is set up in scsi_sysfs_device_initialise, so * have to free and put manually here */ diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 029d017fc1b6..c502a86db16b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10592,7 +10592,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) goto out_remove_scsi_host; - hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set); + hba->tmf_queue = blk_mq_alloc_queue(&hba->tmf_tag_set, NULL, NULL); if (IS_ERR(hba->tmf_queue)) { err = PTR_ERR(hba->tmf_queue); goto free_tmf_tag_set; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..7d42c359e2ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -692,7 +692,8 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void 
*queuedata, }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); -- cgit v1.2.3-58-ga151 From 27e32cd23fed1ab88098897897dcb9ec2bdba4de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:20 +0100 Subject: block: pass a queue_limits argument to blk_mq_alloc_disk Pass a queue_limits to blk_mq_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-11-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 2 +- block/blk-mq.c | 5 +++-- drivers/block/amiflop.c | 2 +- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/ataflop.c | 2 +- drivers/block/floppy.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/null_blk/main.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/block/swim.c | 2 +- drivers/block/swim3.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/block/z2ram.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/memstick/core/ms_block.c | 2 +- drivers/memstick/core/mspro_block.c | 2 +- drivers/mmc/core/queue.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/s390/block/scm_blk.c | 2 +- include/linux/blk-mq.h | 7 ++++--- 30 files changed, 35 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 92ee2697ff39..25f1b18ce7d4 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -906,7 +906,7 @@ static int ubd_add(int n, char **error_out) if (err) goto out; - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, NULL, ubd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index f6499bbd89be..6abb4ce46baa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4130,13 +4130,14 @@ void blk_mq_destroy_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_destroy_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; - q = blk_mq_alloc_queue(set, NULL, queuedata); + q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 2b98114a9fe0..a25414228e47 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system) struct gendisk *disk; 
int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b1b47d88f5db..2ff6e2da8cc4 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -371,7 +371,7 @@ aoeblk_gdalloc(void *vp) goto err_mempool; } - gd = blk_mq_alloc_disk(set, d); + gd = blk_mq_alloc_disk(set, NULL, d); if (IS_ERR(gd)) { pr_err("aoe: cannot allocate block queue for %ld.%d\n", d->aoemajor, d->aoeminor); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 50949207798d..cacc4ba942a8 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 2ba0ba135951..582cf50c6bf6 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4518,7 +4518,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); + disk = blk_mq_alloc_disk(&tag_sets[drive], NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8145499da38..3f855cc79c29 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2025,7 +2025,7 @@ static int loop_add(int i) if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, NULL, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b200950e8fb5..ac08dea73552 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3431,7 +3431,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto block_queue_alloc_tag_error; } - dd->disk = blk_mq_alloc_disk(&dd->tags, dd); + dd->disk = blk_mq_alloc_disk(&dd->tags, NULL, dd); if (IS_ERR(dd->disk)) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 33a8f37bb6a1..30ae3cc12e77 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1823,7 +1823,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err < 0) goto out_free_tags; - disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); + disk = blk_mq_alloc_disk(&nbd->tag_set, NULL, NULL); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_idr; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4281371c81fe..eeb895ec6f34 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2147,7 +2147,7 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_queues; nullb->tag_set->timeout = 5 * HZ; - nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, NULL, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); goto out_cleanup_tags; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 36d7b36c60c7..dfd3860df4f8 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -431,7 +431,7 @@ static int ps3disk_probe(struct ps3_system_bus_device 
*_dev) if (error) goto fail_teardown; - gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); + gendisk = blk_mq_alloc_disk(&priv->tag_set, NULL, dev); if (IS_ERR(gendisk)) { dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", __func__, __LINE__); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 00ca8a1d8c46..6b4f1898a722 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4966,7 +4966,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (err) return err; - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, NULL, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4044c369d22a..d51be4f2df61 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1408,7 +1408,7 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, dev->size = le64_to_cpu(rsp->nsectors) * le16_to_cpu(rsp->logical_block_size); - dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, NULL, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); dev->queue = dev->gd->queue; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7bf4b48e2282..a1f74dd1eae5 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -824,7 +824,7 @@ static int probe_disk(struct vdc_port *port) if (err) return err; - g = blk_mq_alloc_disk(&port->tag_set, port); + g = blk_mq_alloc_disk(&port->tag_set, NULL, port); if (IS_ERR(g)) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index f85b6af414b4..16bdf62067d8 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c2bc85826358..a04756ac778e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev, if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1dfb2e77898b..c5b655270798 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2222,7 +2222,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, NULL); + disk = blk_mq_alloc_disk(&ub->tag_set, NULL, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5bf98fd6a651..a23fce4eca44 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1330,7 +1330,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_vq; - vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, NULL, vblk); if (IS_ERR(vblk->disk)) { err = PTR_ERR(vblk->disk); goto out_free_tags; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 
434fab306777..4cc2884e7484 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1136,7 +1136,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (err) goto out_release_minors; - gd = blk_mq_alloc_disk(&info->tag_set, info); + gd = blk_mq_alloc_disk(&info->tag_set, NULL, info); if (IS_ERR(gd)) { err = PTR_ERR(gd); goto out_free_tag_set; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 11493167b0a8..7c5f4e4d9b50 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&tag_set, NULL); + disk = blk_mq_alloc_disk(&tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d668b174ace9..1d044779f5e4 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -778,7 +778,7 @@ static int probe_gdrom(struct platform_device *devptr) if (err) goto probe_fail_free_cd_info; - gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); + gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL, NULL); if (IS_ERR(gd.disk)) { err = PTR_ERR(gd.disk); goto probe_fail_free_tag_set; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 04115cd92433..d3277c901d16 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2093,7 +2093,7 @@ static int msb_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, NULL, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 5a69ed33999b..db0e2a42ca3c 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1138,7 +1138,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, NULL, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index a0a2412f62a7..67ad186d132a 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -447,7 +447,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) return ERR_PTR(ret); - disk = blk_mq_alloc_disk(&mq->tag_set, mq); + disk = blk_mq_alloc_disk(&mq->tag_set, NULL, mq); if (IS_ERR(disk)) { blk_mq_free_tag_set(&mq->tag_set); return disk; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f0526dcc2162..b8878a2457af 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -333,7 +333,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto out_kfree_tag_set; /* Create gendisk */ - gd = blk_mq_alloc_disk(new->tag_set, new); + gd = blk_mq_alloc_disk(new->tag_set, NULL, new); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tag_set; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 654bd7372cd8..9be87c231a2e 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -393,7 +393,7 @@ int ubiblock_create(struct ubi_volume_info *vi) /* Initialize the gendisk of this ubiblock device */ - gd = blk_mq_alloc_disk(&dev->tag_set, dev); + gd = blk_mq_alloc_disk(&dev->tag_set, NULL, dev); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tags; diff 
--git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5bcdf3654598..eed3e22e24d9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3694,7 +3694,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (!ns) return; - disk = blk_mq_alloc_disk(ctrl->tagset, ns); + disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); if (IS_ERR(disk)) goto out_free_ns; disk->fops = &nvme_bdev_ops; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 30e8ee583e98..0465b706745f 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -53,7 +53,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (rc) return rc; - gdp = blk_mq_alloc_disk(&block->tag_set, block); + gdp = blk_mq_alloc_disk(&block->tag_set, NULL, block); if (IS_ERR(gdp)) { blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index ade95e91b3c8..d05b2e2799a4 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -462,7 +462,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) if (ret) goto out; - bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev); + bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, NULL, scmdev); if (IS_ERR(bdev->gendisk)) { ret = PTR_ERR(bdev->gendisk); goto out_tag; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7d42c359e2ab..390d35fa0032 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -682,13 +682,14 @@ enum { #define BLK_MQ_NO_HCTX_IDX (-1U) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -#define blk_mq_alloc_disk(set, queuedata) \ +#define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ - __blk_mq_alloc_disk(set, queuedata, &__key); \ + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -- cgit v1.2.3-58-ga151 From 74fa8f9c553f7b5ccab7d103acae63cc2e080465 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 08:10:47 +0100 Subject: block: pass a queue_limits argument to blk_alloc_disk Pass a queue_limits to blk_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also change blk_alloc_disk to return an ERR_PTR instead of just NULL which can't distinguish errors. 
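For illustration, the resulting calling convention for a BIO-based driver looks roughly like this minimal sketch (the logical_block_size value is a made-up example; passing a NULL limits pointer keeps the previous behavior of default limits):

	struct queue_limits lim = {
		.logical_block_size	= 4096,		/* example value */
	};
	struct gendisk *disk;

	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk))
		return PTR_ERR(disk);	/* distinct errno instead of NULL */
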
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20240215071055.2201424-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 6 ++++-- arch/xtensa/platforms/iss/simdisk.c | 8 +++++--- block/genhd.c | 11 ++++++----- drivers/block/brd.c | 7 ++++--- drivers/block/drbd/drbd_main.c | 6 ++++-- drivers/block/n64cart.c | 6 ++++-- drivers/block/null_blk/main.c | 7 ++++--- drivers/block/pktcdvd.c | 7 ++++--- drivers/block/ps3vram.c | 6 +++--- drivers/block/zram/zram_drv.c | 6 +++--- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 7 ++++--- drivers/nvdimm/btt.c | 8 ++++---- drivers/nvdimm/pmem.c | 6 +++--- drivers/nvme/host/multipath.c | 6 +++--- drivers/s390/block/dcssblk.c | 6 +++--- include/linux/blkdev.h | 10 +++++++--- 18 files changed, 69 insertions(+), 52 deletions(-) (limited to 'block') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index a708fbd5a844..539ff56b6968 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -117,9 +117,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->disk) + dev->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev->disk)) { + err = PTR_ERR(dev->disk); goto free_dev; + } dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 178cf96ca10a..defc67909a9c 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { char tmp[2] = { '0' + which, 0 }; - int err = -ENOMEM; + int err; dev->fd = -1; dev->filename = NULL; spin_lock_init(&dev->lock); dev->users = 0; - dev->gd = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->gd) + dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev->gd)) { + err = PTR_ERR(dev->gd); goto out; + } dev->gd->major = simdisk_major; dev->gd->first_minor = which; dev->gd->minors = SIMDISK_MINORS; diff --git a/block/genhd.c b/block/genhd.c index 7a8fd57c51f7..84c822d989da 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1391,20 +1391,21 @@ out_free_disk: return NULL; } -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass) { - struct queue_limits lim = { }; + struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(&lim, node); + q = blk_alloc_queue(lim ? 
lim : &default_lim, node); if (IS_ERR(q)) - return NULL; + return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); - return NULL; + return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 970bd6ff38c4..689a3c0c31f8 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -335,10 +335,11 @@ static int brd_alloc(int i) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = brd->brd_disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_free_dev; - + } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6bc86106c7b2..cea1e537fd56 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2708,9 +2708,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_no_disk; + } device->vdisk = disk; device->rq_queue = disk->queue; diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index d914156db2d8..c64d7ee7a44d 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -131,9 +131,11 @@ static int __init n64cart_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out; + } disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eeb895ec6f34..baf2b228d008 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2154,10 +2154,11 @@ static int null_add_dev(struct nullb_device *dev) } nullb->q = nullb->disk->queue; } else if (dev->queue_mode == NULL_Q_BIO) { - rv = -ENOMEM; - nullb->disk = blk_alloc_disk(nullb->dev->home_node); - if (!nullb->disk) + nullb->disk = blk_alloc_disk(NULL, nullb->dev->home_node); + if (IS_ERR(nullb->disk)) { + rv = PTR_ERR(nullb->disk); goto out_cleanup_queues; + } nullb->q = nullb->disk->queue; rv = init_driver_queues(nullb); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d56d972aadb3..abb82926b1c9 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2673,10 +2673,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_on = write_congestion_on; pd->write_congestion_off = write_congestion_off; - ret = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); goto out_mem; + } pd->disk = disk; disk->major = pktdev_major; disk->first_minor = idx; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 38d42af01b25..bdcf083b45e2 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - gendisk = blk_alloc_disk(NUMA_NO_NODE); - if (!gendisk) { + gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(gendisk)) { dev_err(&dev->core, "blk_alloc_disk failed\n"); - error = -ENOMEM; + error = 
PTR_ERR(gendisk); goto out_cache_cleanup; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa..84982221fc66 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2195,11 +2195,11 @@ static int zram_add(void) #endif /* gendisk structure */ - zram->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!zram->disk) { + zram->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(zram->disk)) { pr_err("Error allocating disk structure for device %d\n", device_id); - ret = -ENOMEM; + ret = PTR_ERR(zram->disk); goto out_free_idr; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dc3f50f69714..9955ecff3839 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -935,8 +935,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove; - d->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!d->disk) + d->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(d->disk)) goto out_bioset_exit; set_capacity(d->disk, sectors); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866..b5e6a10b9cfd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2098,8 +2098,8 @@ static struct mapped_device *alloc_dev(int minor) * established. If request-based table is loaded: blk-mq will * override accordingly. */ - md->disk = blk_alloc_disk(md->numa_node_id); - if (!md->disk) + md->disk = blk_alloc_disk(NULL, md->numa_node_id); + if (IS_ERR(md->disk)) goto bad; md->queue = md->disk->queue; diff --git a/drivers/md/md.c b/drivers/md/md.c index e2a5f513dbb7..75266c34b1f9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5763,10 +5763,11 @@ struct mddev *md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - error = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + error = PTR_ERR(disk); goto out_free_mddev; + } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index bb3726b622ad..9a0eae01d598 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1496,11 +1496,11 @@ static int btt_blk_init(struct btt *btt) { struct nd_btt *nd_btt = btt->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - int rc = -ENOMEM; + int rc; - btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!btt->btt_disk) - return -ENOMEM; + btt->btt_disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(btt->btt_disk)) + return PTR_ERR(btt->btt_disk); nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); btt->btt_disk->first_minor = 0; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4e8fdcb3f1c8..3a5df8d467c5 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -497,9 +497,9 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - disk = blk_alloc_disk(nid); - if (!disk) - return -ENOMEM; + disk = blk_alloc_disk(NULL, nid); + if (IS_ERR(disk)) + return PTR_ERR(disk); q = disk->queue; pmem->disk = disk; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74de1e64aeea..dc5d0d0a82d0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -532,9 +532,9 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) !nvme_is_unique_nsid(ctrl, head) || !multipath) return 0; - head->disk = blk_alloc_disk(ctrl->numa_node); - if (!head->disk) - return -ENOMEM; + 
head->disk = blk_alloc_disk(NULL, ctrl->numa_node); + if (IS_ERR(head->disk)) + return PTR_ERR(head->disk); head->disk->fops = &nvme_ns_head_ops; head->disk->private_data = head; sprintf(head->disk->disk_name, "nvme%dn%d", diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4b7ecd4fd431..0903b432ea97 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -629,9 +629,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dev.release = dcssblk_release_segment; dev_info->dev.groups = dcssblk_dev_attr_groups; INIT_LIST_HEAD(&dev_info->lh); - dev_info->gd = blk_alloc_disk(NUMA_NO_NODE); - if (dev_info->gd == NULL) { - rc = -ENOMEM; + dev_info->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev_info->gd)) { + rc = PTR_ERR(dev_info->gd); goto seg_list_del; } dev_info->gd->major = dcssblk_major; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 45746ba73670..a14ea9344138 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -766,22 +766,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, -- cgit v1.2.3-58-ga151 From a3911966bd51b05488203e6e685c4a09fdc52b77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Feb 2024 13:50:10 +0100 Subject: block: fix virt_boundary handling in blk_validate_limits Don't set the default max_segment_size value when a virt_boundary is used. Fixes: d690cb8ae14b ("block: add an API to atomically update queue limits") Reported-by: Geert Uytterhoeven Signed-off-by: Christoph Hellwig Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240221125010.3609444-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index c4406aacc0ef..2120b6f9fef8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -182,16 +182,6 @@ static int blk_validate_limits(struct queue_limits *lim) if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) return -EINVAL; - /* - * The maximum segment size has an odd historic 64k default that - * drivers probably should override. Just like the I/O size we - * require drivers to at least handle a full page per segment. 
- */ - if (!lim->max_segment_size) - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) - return -EINVAL; - /* * Devices that require a virtual boundary do not support scatter/gather * I/O natively, but instead require a descriptor list entry for each @@ -203,6 +193,16 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_segment_size != UINT_MAX)) return -EINVAL; lim->max_segment_size = UINT_MAX; + } else { + /* + * The maximum segment size has an odd historic 64k default that + * drivers probably should override. Just like the I/O size we + * require drivers to at least handle a full page per segment. + */ + if (!lim->max_segment_size) + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + return -EINVAL; } /* -- cgit v1.2.3-58-ga151 From c8f6f88d25929ad2f290b428efcae3b526f3eab0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 22 Feb 2024 22:17:23 +0900 Subject: block: Clear zone limits for a non-zoned stacked queue Device mapper may create a non-zoned mapped device out of a zoned device (e.g., the dm-zoned target). In such a case, some queue limits, such as max_zone_append_sectors and zone_write_granularity, end up being non-zero for a block device that is not zoned. Avoid this by clearing these limits in blk_stack_limits() when the stacked zoned limit is false. Fixes: 3093a479727b ("block: inherit the zoned characteristics in blk_stack_limits") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240222131724.1803520-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 2120b6f9fef8..b6bbe683d218 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -865,6 +865,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); + if (!t->zoned) { + t->zone_write_granularity = 0; + t->max_zone_append_sectors = 0; + } return ret; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.3-58-ga151 From 522d73526f8d4519eefc693204bb474391e60f2b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 22 Feb 2024 22:17:24 +0900 Subject: block: Do not include rbtree.h in blk-zoned.c The block zone code does not use RB-trees, so remove the include of linux/rbtree.h as it is not needed. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240222131724.1803520-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d4f4f8325eff..da0f4b2a8fa0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3-58-ga151 From 03f12122b20b6e6028e9ed69030a49f9cffcbb75 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 21 Feb 2024 17:01:22 +0800 Subject: block: fix deadlock between bd_link_disk_holder and partition scan 'open_mutex' of gendisk is used to protect opening/closing of block devices. But in bd_link_disk_holder(), it is used to protect the creation of the symlink between the holding disk and the slave bdev, which introduces some issues.
When bd_link_disk_holder() is called, the driver is usually in the process of initialization/modification and may suspend submitting io. At this time, any io holding 'open_mutex', such as scanning partitions, can cause deadlocks. For example, in raid:

  T1                                T2
  bdev_open_by_dev
   lock open_mutex [1]
   ...
    efi_partition
    ...
     md_submit_bio
                                    md_ioctl mddev_suspend
                                     -> suspend all io
                                    md_add_new_disk
                                     bind_rdev_to_array
                                      bd_link_disk_holder
                                       try lock open_mutex [2]
    md_handle_request
     -> wait mddev_resume

T1 scans partitions, T2 adds a new device to the raid. T1 waits for T2 to resume mddev, but T2 waits for open_mutex held by T1. Deadlock occurs. Fix it by introducing a local mutex 'blk_holder_mutex' to replace 'open_mutex'. Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls involed array reconfiguration") Reported-by: mgperkow@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218459 Signed-off-by: Li Nan Reviewed-by: Christoph Hellwig Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240221090122.1281868-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/holder.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index 37d18c13d958..791091a7eac2 100644 --- a/block/holder.c +++ b/block/holder.c @@ -8,6 +8,8 @@ struct bd_holder_disk { int refcnt; }; +static DEFINE_MUTEX(blk_holder_mutex); + static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct gendisk *disk) { @@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) kobject_get(bdev->bd_holder_dir); mutex_unlock(&bdev->bd_disk->open_mutex); - mutex_lock(&disk->open_mutex); + mutex_lock(&blk_holder_mutex); WARN_ON_ONCE(!bdev->bd_holder); holder = bd_find_holder_disk(bdev, disk); @@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) goto out_del_symlink; list_add(&holder->list, &disk->slave_bdevs); - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); return 0; out_del_symlink: @@ -116,7 +118,7 @@ out_del_symlink: out_free_holder: kfree(holder); out_unlock: - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); if (ret) kobject_put(bdev->bd_holder_dir); return ret; @@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (WARN_ON_ONCE(!disk->slave_dir)) return; - mutex_lock(&disk->open_mutex); + mutex_lock(&blk_holder_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) list_del_init(&holder->list); kfree(holder); } - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -- cgit v1.2.3-58-ga151 From 5affe497c346343ecc42e6095b60dafe15e1453e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Feb 2024 07:59:07 -0800 Subject: block: blkdev_issue_secure_erase loop style Use consistent coding style in this file. All the other loops for the same purpose use "while (nr_sects)", so they win.
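For reference, the loop shape that blk-lib.c converges on after this cleanup is roughly the following sketch (condensed from the diff below; error handling is elided):

	while (nr_sects) {
		unsigned int len = min_t(sector_t, nr_sects, max_sectors);

		bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
		bio->bi_iter.bi_sector = sector;
		bio->bi_iter.bi_size = len << SECTOR_SHIFT;
		sector += len;
		nr_sects -= len;
		cond_resched();
	}
	if (bio) {
		ret = submit_bio_wait(bio);	/* waits for the whole chain */
		bio_put(bio);
	}
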
Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240223155910.3622666-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-lib.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index e59c3069e835..91770da2239f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -322,7 +322,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, return -EPERM; blk_start_plug(&plug); - for (;;) { + while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, max_sectors); bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); @@ -331,13 +331,12 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector += len; nr_sects -= len; - if (!nr_sects) { - ret = submit_bio_wait(bio); - bio_put(bio); - break; - } cond_resched(); } + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } blk_finish_plug(&plug); return ret; -- cgit v1.2.3-58-ga151 From 76a27e1b53b94b5a23c221434146fda3e9d8d8e0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Feb 2024 07:59:08 -0800 Subject: block: cleanup __blkdev_issue_write_zeroes Use min to calculate the next number of sectors like everyone else. Reviewed-by: Ming Lei Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240223155910.3622666-3-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-lib.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 91770da2239f..a6954eafb8c8 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -120,31 +120,28 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, struct bio **biop, unsigned flags) { struct bio *bio = *biop; - unsigned int max_write_zeroes_sectors; + unsigned int max_sectors; if (bdev_read_only(bdev)) return -EPERM; - /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ - max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + /* Ensure that max_sectors doesn't overflow bi_size */ + max_sectors = bdev_write_zeroes_sectors(bdev); - if (max_write_zeroes_sectors == 0) + if (max_sectors == 0) return -EOPNOTSUPP; while (nr_sects) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; - if (nr_sects > max_write_zeroes_sectors) { - bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; - nr_sects -= max_write_zeroes_sectors; - sector += max_write_zeroes_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } + bio->bi_iter.bi_size = len << SECTOR_SHIFT; + nr_sects -= len; + sector += len; cond_resched(); } -- cgit v1.2.3-58-ga151 From 0eb4db4706603db09644ec3bc9bb0d63ea5d326c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Feb 2024 07:59:09 -0800 Subject: block: io wait hang check helper This is the same in two places, and another will be added soon. Create a helper for it. 
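To make the hang-check arithmetic concrete, the centralized wait behaves like this annotated sketch (the logic mirrors the helper added in the diff below; the 120s timeout and HZ = 1000 are example configuration values):

	/*
	 * Sleep in chunks of half the hung-task watchdog period so a
	 * legitimately slow I/O is never flagged as a hung task: with
	 * the default sysctl_hung_task_timeout_secs = 120 and HZ = 1000,
	 * each wait below is 120 * 1000 / 2 = 60000 jiffies, i.e. 60s.
	 */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	if (timeout)
		while (!wait_for_completion_io_timeout(done, timeout))
			;	/* not done yet, wait another interval */
	else
		wait_for_completion_io(done);	/* hang check disabled */
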
Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240223155910.3622666-4-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio.c | 12 +----------- block/blk-mq.c | 19 +++---------------- block/blk.h | 13 +++++++++++++ 3 files changed, 17 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 00847ff1415c..496867b51609 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -1371,21 +1370,12 @@ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); - unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&done); + blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6abb4ce46baa..45f994c10044 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -1409,22 +1408,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); - if (blk_rq_is_poll(rq)) { + if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); - } else { - /* - * Prevent hang_check timer from firing at us during very long - * I/O - */ - unsigned long hang_check = sysctl_hung_task_timeout_secs; - - if (hang_check) - while (!wait_for_completion_io_timeout(&wait.done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&wait.done); - } + else + blk_wait_io(&wait.done); return wait.ret; } diff --git a/block/blk.h b/block/blk.h index 7c30e2ac8ebc..6c2749d122ab 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include /* for max_pfn/max_low_pfn */ +#include #include #include #include "blk-crypto-internal.h" @@ -71,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio) return __bio_queue_enter(q, bio); } +static inline void blk_wait_io(struct completion *done) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + while (!wait_for_completion_io_timeout(done, timeout)) + ; + else + wait_for_completion_io(done); +} + #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); -- cgit v1.2.3-58-ga151 From 8a08c5fd89b447a7de7eb293a7a274c46b932ba2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Feb 2024 07:59:10 -0800 Subject: blk-lib: check for kill signal Some of these block operations can access a significant capacity and take longer than the user expected. A user may change their mind about wanting to run that command and attempt to kill the process and do something else with their device. But since the task is uninterruptible, they have to wait for it to finish, which could be many hours. Check for a fatal signal at each iteration so the user doesn't have to wait for their regretted operation to complete naturally.
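The check added to each loop has the following shape (a sketch extracted from the diff below; await_bio_chain() is the new helper that waits for the bios already submitted):

	cond_resched();
	if (fatal_signal_pending(current)) {
		await_bio_chain(bio);	/* flush what was already queued */
		return -EINTR;
	}
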
Reported-by: Conrad Meyer Tested-by: Nilay Shroff Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240223155910.3622666-5-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-lib.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index a6954eafb8c8..dc8e35d0a51d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } +static void await_bio_endio(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * await_bio_chain - ends @bio and waits for every chained bio to complete + */ +static void await_bio_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = await_bio_endio; + bio_endio(bio); + blk_wait_io(&done); +} + int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { @@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, * is disabled. */ cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -143,6 +167,10 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, nr_sects -= len; sector += len; cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -187,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, break; } cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -277,7 +309,7 @@ retry: bio_put(bio); } blk_finish_plug(&plug); - if (ret && try_write_zeroes) { + if (ret && ret != -EINTR && try_write_zeroes) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { try_write_zeroes = false; goto retry; @@ -329,6 +361,12 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector += len; nr_sects -= len; cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + ret = -EINTR; + bio = NULL; + break; + } } if (bio) { ret = submit_bio_wait(bio); -- cgit v1.2.3-58-ga151 From af550e4c968294398fc76b075f12d51c76caf753 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 23 Feb 2024 15:57:49 +0000 Subject: block/blk-mq: Don't complete locally if capacities are different The logic in blk_mq_complete_need_ipi() assumes SMP systems where all CPUs have equal compute capacities and only the LLC cache can make a difference to perceived performance. But this assumption falls apart on HMP systems where LLC is shared, but the CPUs have different capacities. Staying local then can have a big performance impact if the IO request was done from a CPU with higher capacity but the interrupt is serviced on a lower capacity CPU. Use the new cpus_equal_capacity() function to check if we need to send an IPI. Without the patch I see the BLOCK softirq always running on little cores (where the hardirq is serviced). With it I can see it running on all cores. This was noticed after the topology change [1] where now on a big.LITTLE we truly get that the LLC is shared between all cores, whereas in the past it was being misrepresented for historical reasons.
The logic exposed a missing dependency on capacities for such systems where there can be a big performance difference between the CPUs. This of course introduced a noticeable change in behavior depending on how the topology is presented, leading to regressions in some workloads, as the performance of the BLOCK softirq on littles can be noticeably worse on some platforms. Worth noting that we could have checked for capacities being greater than or equal instead of equality. That would lead to always favouring higher performance. But we opted for equality to match the performance of the requester without making an assumption that can lead to power trade-offs which these systems tend to be sensitive about. If the requester would like to run faster, it's better to rely on the scheduler to give the IO requester, via some facility, a faster core to run on; and then if the interrupt triggered on a CPU with different capacity we'll make sure to match the performance the requester is supposed to run at. [1] https://lpc.events/event/16/contributions/1342/attachments/962/1883/LPC-2022-Android-MC-Phantom-Domains.pdf Signed-off-by: Qais Yousef Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240223155749.2958009-3-qyousef@layalina.io Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 45f994c10044..7111bd4180e7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1166,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (force_irqthreads()) return false; - /* same CPU or cache domain? Complete locally */ + /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && - cpus_share_cache(cpu, rq->mq_ctx->cpu))) + cpus_share_cache(cpu, rq->mq_ctx->cpu) && + cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ -- cgit v1.2.3-58-ga151 From 82c6515d8a970f471eeb8a5ceeaa04c3e5e1b45c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:46:46 +0000 Subject: bdev: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag is already a no-op as of 6.8-rc1, remove its usage so we can delete it from slab. No functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20240224134646.829105-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index e9f1b12bd75c..d5ed1a70d13a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -377,7 +377,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), + SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) -- cgit v1.2.3-58-ga151 From f3a608827d1f8de0dd12813e8d9c6803fe64e119 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 18:47:35 +0100 Subject: bdev: open block device as files Add two new helpers to allow opening block devices as files. This is not the final infrastructure. This still opens the block device before opening a struct file. Until we have removed all references to struct bdev_handle we can't switch the order: * Introduce blk_to_file_flags() to translate block-specific flags to flags usable to open a new file.
* Introduce bdev_file_open_by_{dev,path}(). * Introduce temporary sb_bdev_handle() helper to retrieve a struct bdev_handle from a block device file and update places that directly reference struct bdev_handle to rely on it. * Don't count block device opens against the number of open files. A bdev_file_open_by_{dev,path}() file is never installed into any file descriptor table. One idea that came to mind was to use kernel_tmpfile_open() which would require us to pass a path and it would then call do_dentry_open() going through the regular fops->open::blkdev_open() path. But then we're back to the problem of routing block-specific flags such as BLK_OPEN_RESTRICT_WRITES through the open path and would have to waste FMODE_* flags every time we add a new one. With this we can avoid using a flag bit and we have more leeway in how we open block devices from bdev_open_by_{dev,path}(). Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- block/bdev.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/cramfs/inode.c | 2 +- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 18 ++++----- fs/xfs/xfs_super.c | 2 +- include/linux/blkdev.h | 7 ++++ include/linux/fs.h | 10 ++++- 9 files changed, 126 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index e9f1b12bd75c..e1149652c532 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -49,6 +49,13 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +struct block_device *file_bdev(struct file *bdev_file) +{ + struct bdev_handle *handle = bdev_file->private_data; + return handle->bdev; +} +EXPORT_SYMBOL(file_bdev); + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -368,12 +375,12 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; +struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -382,10 +389,10 @@ void __init bdev_cache_init(void) err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) + blockdev_mnt = kern_mount(&bd_type); + if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) @@ -911,6 +918,92 @@ free_handle: } EXPORT_SYMBOL(bdev_open_by_dev); +/* + * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk + * associated with the floppy driver where it has allowed ioctls if the + * file was opened for writing, but does not allow reads or writes. + * Make sure that this quirk is reflected in @f_flags. + * + * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
+ */ +static unsigned blk_to_file_flags(blk_mode_t mode) +{ + unsigned int flags = 0; + + if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == + (BLK_OPEN_READ | BLK_OPEN_WRITE)) + flags |= O_RDWR; + else if (mode & BLK_OPEN_WRITE_IOCTL) + flags |= O_RDWR | O_WRONLY; + else if (mode & BLK_OPEN_WRITE) + flags |= O_WRONLY; + else if (mode & BLK_OPEN_READ) + flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ + else + WARN_ON_ONCE(true); + + if (mode & BLK_OPEN_NDELAY) + flags |= O_NDELAY; + + return flags; +} + +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ + struct file *bdev_file; + struct bdev_handle *handle; + unsigned int flags; + + handle = bdev_open_by_dev(dev, mode, holder, hops); + if (IS_ERR(handle)) + return ERR_CAST(handle); + + flags = blk_to_file_flags(mode); + bdev_file = alloc_file_pseudo_noaccount(handle->bdev->bd_inode, + blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); + if (IS_ERR(bdev_file)) { + bdev_release(handle); + return bdev_file; + } + ihold(handle->bdev->bd_inode); + + bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + if (bdev_nowait(handle->bdev)) + bdev_file->f_mode |= FMODE_NOWAIT; + + bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); + bdev_file->private_data = handle; + return bdev_file; +} +EXPORT_SYMBOL(bdev_file_open_by_dev); + +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, + const struct blk_holder_ops *hops) +{ + struct file *bdev_file; + dev_t dev; + int error; + + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); + + bdev_file = bdev_file_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(bdev_file) && (mode & BLK_OPEN_WRITE)) { + struct bdev_handle *handle = bdev_file->private_data; + if (bdev_read_only(handle->bdev)) { + fput(bdev_file); + bdev_file = ERR_PTR(-EACCES); + } + } + + return bdev_file; +} +EXPORT_SYMBOL(bdev_file_open_by_path); + /** * bdev_open_by_path - open a block device by name * @path: path to the block device to open diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 60dbfa0f8805..39e75131fd5a 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d45ab0992ae5..ea94c148fee5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4247,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; + FDEV(0).bdev_handle = sb_bdev_handle(sbi->sb); else if (!RDEV(i).path[0]) break; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cb6d1fda66a7..8691463956d1 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev_handle = sb->s_bdev_handle; + log->bdev_handle = sb_bdev_handle(sb); log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 545ad44f96b8..1ed468c03557 100644 --- a/fs/romfs/super.c +++ 
b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } #endif } diff --git a/fs/super.c b/fs/super.c index d35e85295489..08dcc3371aa0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1532,16 +1532,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; - bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev_handle)) { + bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1549,7 +1549,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - bdev_release(bdev_handle); + fput(bdev_file); return -EACCES; } @@ -1560,11 +1560,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - bdev_release(bdev_handle); + fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); - sb->s_bdev_handle = bdev_handle; + sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1680,7 +1680,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - bdev_release(sb->s_bdev_handle); + fput(sb->s_bdev_file); } } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f..e5ac0e59ede9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -467,7 +467,7 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb_bdev_handle(sb)); if (!mp->m_ddev_targp) goto out_close_rtdev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..76706aa47316 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -1474,6 +1475,7 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) +/* @bdev_handle will be removed soon. 
*/ struct bdev_handle { struct block_device *bdev; void *holder; @@ -1484,6 +1486,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); @@ -1494,6 +1500,7 @@ struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); +struct block_device *file_bdev(struct file *bdev_file); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..e9291e27cc47 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1228,8 +1228,8 @@ struct super_block { #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; - struct bdev_handle *s_bdev_handle; + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; @@ -1327,6 +1327,12 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; +/* Temporary helper that will go away. */ +static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) +{ + return sb->s_bdev_file->private_data; +} + static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3-58-ga151 From e5ca9d391615269b05a6ed871fec66d9db650520 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:19 +0100 Subject: block/ioctl: port blkdev_bszset() to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-2-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 9c73a763ef88..5d0619e02e4c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -471,7 +471,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; - struct bdev_handle *handle; + struct file *file; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -483,12 +483,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); - handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); - if (IS_ERR(handle)) + file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); + if (IS_ERR(file)) return -EBUSY; ret = set_blocksize(bdev, n); - bdev_release(handle); - + fput(file); return ret; } -- cgit v1.2.3-58-ga151 From 190f676afa00d07082db170400aaa2cd4de0933f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:20 +0100 Subject: block/genhd: port disk_scan_partitions() to file This may run from a kernel thread via device_add_disk(). 
So this could also use __fput_sync() if we were worried about EBUSY. But when it is called from a kernel thread it's always BLK_OPEN_READ so EBUSY can't really happen even if we do BLK_OPEN_RESTRICT_WRITES or BLK_OPEN_EXCL. Otherwise it's called from an ioctl on the block device which is only called from userspace and can rely on task work. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-3-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/genhd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index d74fb5b4ae68..a911d2969c07 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { - struct bdev_handle *handle; + struct file *file; int ret = 0; if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) @@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, - NULL); - if (IS_ERR(handle)) - ret = PTR_ERR(handle); + file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, + NULL, NULL); + if (IS_ERR(file)) + ret = PTR_ERR(file); else - bdev_release(handle); + fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, -- cgit v1.2.3-58-ga151 From e97d06a46526d9392cbdbd7eda193091e1af2723 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:44 +0100 Subject: bdev: remove bdev_open_by_path() Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-27-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 40 ---------------------------------------- include/linux/blkdev.h | 2 -- 2 files changed, 42 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index e1149652c532..4003f8e1782a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1004,46 +1004,6 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_file_open_by_path); -/** - * bdev_open_by_path - open a block device by name - * @path: path to the block device to open - * @mode: open mode (BLK_OPEN_*) - * @holder: exclusive holder identifier - * @hops: holder operations - * - * Open the block device described by the device file at @path. If @holder is - * not %NULL, the block device is opened with exclusive access. Exclusive opens - * may nest for the same @holder. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Handle with a reference to the block_device on success, ERR_PTR(-errno) on - * failure. 
- */ -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) -{ - struct bdev_handle *handle; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - handle = bdev_open_by_dev(dev, mode, holder, hops); - if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && - bdev_read_only(handle->bdev)) { - bdev_release(handle); - return ERR_PTR(-EACCES); - } - - return handle; -} -EXPORT_SYMBOL(bdev_open_by_path); - void bdev_release(struct bdev_handle *handle) { struct block_device *bdev = handle->bdev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76706aa47316..5880d5abfebe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1484,8 +1484,6 @@ struct bdev_handle { struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, -- cgit v1.2.3-58-ga151 From b1211a25c4fe3443cfef4ed7c39251502a663776 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:45 +0100 Subject: bdev: make bdev_{release, open_by_dev}() private to block layer Move both of them to the private block header. There's no caller in the tree anymore that uses them directly. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-28-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 2 -- block/blk.h | 4 ++++ include/linux/blkdev.h | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 4003f8e1782a..e6e46f24a89a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -916,7 +916,6 @@ free_handle: kfree(handle); return ERR_PTR(ret); } -EXPORT_SYMBOL(bdev_open_by_dev); /* * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk @@ -1042,7 +1041,6 @@ void bdev_release(struct bdev_handle *handle) blkdev_put_no_open(bdev); kfree(handle); } -EXPORT_SYMBOL(bdev_release); /** * lookup_bdev() - Look up a struct block_device by name. 
diff --git a/block/blk.h b/block/blk.h index 1ef920f72e0f..c9630774767d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -516,4 +516,8 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +void bdev_release(struct bdev_handle *handle); +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5880d5abfebe..495f55587207 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1482,8 +1482,6 @@ struct bdev_handle { blk_mode_t mode; }; -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, @@ -1491,7 +1489,6 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); -- cgit v1.2.3-58-ga151 From a56aefca8d386181415a1fb7cfec2f72b0404797 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:46 +0100 Subject: bdev: make struct bdev_handle private to the block layer Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-29-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/bdev.c | 119 +++++++++++++++++++++++++------------------------ block/blk.h | 12 +++-- block/fops.c | 37 +++++++-------- include/linux/blkdev.h | 7 --- include/linux/fs.h | 6 --- 5 files changed, 86 insertions(+), 95 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index e6e46f24a89a..8f33f160e923 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -703,6 +703,24 @@ out_blkdev_put: return ret; } +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) +{ + int ret; + + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + MAJOR(dev), MINOR(dev), + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); + if (ret) + return ret; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) + return -EINVAL; + + return 0; +} + static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); @@ -795,69 +813,43 @@ static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) } /** - * bdev_open_by_dev - open a block device by device number - * @dev: device number of block device to open + * bdev_open - open a block device + * @bdev: block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations + * @bdev_file: file for the block device * - * Open the block device described by device number @dev. If @holder is not - * %NULL, the block device is opened with exclusive access. Exclusive opens may - * nest for the same @holder. - * - * Use this interface ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use bdev_open_by_path(). + * Open the block device. 
If @holder is not %NULL, the block device is opened + * with exclusive access. Exclusive opens may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Handle with a reference to the block_device on success, ERR_PTR(-errno) on - * failure. + * zero on success, -errno on failure. */ -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file) { - struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), - GFP_KERNEL); - struct block_device *bdev; + struct bdev_handle *handle; bool unblock_events = true; - struct gendisk *disk; + struct gendisk *disk = bdev->bd_disk; int ret; + handle = kmalloc(sizeof(struct bdev_handle), GFP_KERNEL); if (!handle) - return ERR_PTR(-ENOMEM); - - ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - MAJOR(dev), MINOR(dev), - ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | - ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); - if (ret) - goto free_handle; - - /* Blocking writes requires exclusive opener */ - if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { - ret = -EINVAL; - goto free_handle; - } - - bdev = blkdev_get_no_open(dev); - if (!bdev) { - ret = -ENXIO; - goto free_handle; - } - disk = bdev->bd_disk; + return -ENOMEM; if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) - goto put_blkdev; + goto free_handle; } else { if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { ret = -EIO; - goto put_blkdev; + goto free_handle; } } @@ -902,7 +894,16 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, handle->bdev = bdev; handle->holder = holder; handle->mode = mode; - return handle; + + bdev_file->f_flags |= O_LARGEFILE; + bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + if (bdev_nowait(bdev)) + bdev_file->f_mode |= FMODE_NOWAIT; + bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); + bdev_file->private_data = handle; + + return 0; put_module: module_put(disk->fops->owner); abort_claiming: @@ -910,11 +911,9 @@ abort_claiming: bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); -put_blkdev: - blkdev_put_no_open(bdev); free_handle: kfree(handle); - return ERR_PTR(ret); + return ret; } /* @@ -951,29 +950,33 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct file *bdev_file; - struct bdev_handle *handle; + struct block_device *bdev; unsigned int flags; + int ret; - handle = bdev_open_by_dev(dev, mode, holder, hops); - if (IS_ERR(handle)) - return ERR_CAST(handle); + ret = bdev_permission(dev, mode, holder); + if (ret) + return ERR_PTR(ret); + + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); flags = blk_to_file_flags(mode); - bdev_file = alloc_file_pseudo_noaccount(handle->bdev->bd_inode, + bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode, blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); if (IS_ERR(bdev_file)) { - bdev_release(handle); + blkdev_put_no_open(bdev); return bdev_file; } - ihold(handle->bdev->bd_inode); + ihold(bdev->bd_inode); - bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; - if (bdev_nowait(handle->bdev)) - bdev_file->f_mode |= FMODE_NOWAIT; - - bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; - bdev_file->f_wb_err = 
filemap_sample_wb_err(bdev_file->f_mapping); - bdev_file->private_data = handle; + ret = bdev_open(bdev, mode, holder, hops, bdev_file); + if (ret) { + blkdev_put_no_open(bdev); + fput(bdev_file); + return ERR_PTR(ret); + } return bdev_file; } EXPORT_SYMBOL(bdev_file_open_by_dev); diff --git a/block/blk.h b/block/blk.h index c9630774767d..19b15870284f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,6 +25,12 @@ struct blk_flush_queue { struct request *flush_rq; }; +struct bdev_handle { + struct block_device *bdev; + void *holder; + blk_mode_t mode; +}; + bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, @@ -517,7 +523,7 @@ static inline int req_ref_read(struct request *req) } void bdev_release(struct bdev_handle *handle); -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); - +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file); +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); #endif /* BLK_INTERNAL_H */ diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa..a1ba1a50ae77 100644 --- a/block/fops.c +++ b/block/fops.c @@ -599,36 +599,31 @@ blk_mode_t file_to_blk_mode(struct file *file) static int blkdev_open(struct inode *inode, struct file *filp) { - struct bdev_handle *handle; + struct block_device *bdev; blk_mode_t mode; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + void *holder; + int ret; mode = file_to_blk_mode(filp); - handle = bdev_open_by_dev(inode->i_rdev, mode, - mode & BLK_OPEN_EXCL ? filp : NULL, NULL); - if (IS_ERR(handle)) - return PTR_ERR(handle); + holder = mode & BLK_OPEN_EXCL ? filp : NULL; + ret = bdev_permission(inode->i_rdev, mode, holder); + if (ret) + return ret; - if (bdev_nowait(handle->bdev)) - filp->f_mode |= FMODE_NOWAIT; + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return -ENXIO; - filp->f_mapping = handle->bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - filp->private_data = handle; - return 0; + ret = bdev_open(bdev, mode, holder, NULL, filp); + if (ret) + blkdev_put_no_open(bdev); + return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { - bdev_release(filp->private_data); + if (filp->private_data) + bdev_release(filp->private_data); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 495f55587207..2f5dbde23094 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,13 +1475,6 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) -/* @bdev_handle will be removed soon. 
*/ -struct bdev_handle { - struct block_device *bdev; - void *holder; - blk_mode_t mode; -}; - struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, diff --git a/include/linux/fs.h b/include/linux/fs.h index e9291e27cc47..6e0714d35d9b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,12 +1327,6 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; -/* Temporary helper that will go away. */ -static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) -{ - return sb->s_bdev_file->private_data; -} - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3-58-ga151 From 7c09a4ed6156c6cab6b951e027ca6ea24af454ba Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:47 +0100 Subject: bdev: remove bdev pointer from struct bdev_handle We can always go directly via: * I_BDEV(bdev_file->f_inode) * I_BDEV(bdev_file->f_mapping->host) So keeping struct bdev in struct bdev_handle is redundant. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-30-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 26 ++++++++++++-------------- block/blk.h | 3 +-- block/fops.c | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 8f33f160e923..4e4527c5df00 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -51,8 +51,7 @@ EXPORT_SYMBOL(I_BDEV); struct block_device *file_bdev(struct file *bdev_file) { - struct bdev_handle *handle = bdev_file->private_data; - return handle->bdev; + return I_BDEV(bdev_file->f_mapping->host); } EXPORT_SYMBOL(file_bdev); @@ -891,7 +890,6 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - handle->bdev = bdev; handle->holder = holder; handle->mode = mode; @@ -899,7 +897,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; - bdev_file->f_mapping = handle->bdev->bd_inode->i_mapping; + bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = handle; @@ -985,7 +983,7 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { - struct file *bdev_file; + struct file *file; dev_t dev; int error; @@ -993,22 +991,22 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, if (error) return ERR_PTR(error); - bdev_file = bdev_file_open_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev_file) && (mode & BLK_OPEN_WRITE)) { - struct bdev_handle *handle = bdev_file->private_data; - if (bdev_read_only(handle->bdev)) { - fput(bdev_file); - bdev_file = ERR_PTR(-EACCES); + file = bdev_file_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { + if (bdev_read_only(file_bdev(file))) { + fput(file); + file = ERR_PTR(-EACCES); } } - return bdev_file; + return file; } EXPORT_SYMBOL(bdev_file_open_by_path); -void bdev_release(struct bdev_handle *handle) +void bdev_release(struct file *bdev_file) { - struct block_device *bdev = handle->bdev; + struct block_device *bdev = 
file_bdev(bdev_file); + struct bdev_handle *handle = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; /* diff --git a/block/blk.h b/block/blk.h index 19b15870284f..7ca24814f3a0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -26,7 +26,6 @@ struct blk_flush_queue { }; struct bdev_handle { - struct block_device *bdev; void *holder; blk_mode_t mode; }; @@ -522,7 +521,7 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } -void bdev_release(struct bdev_handle *handle); +void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); diff --git a/block/fops.c b/block/fops.c index a1ba1a50ae77..aab9b89e4c77 100644 --- a/block/fops.c +++ b/block/fops.c @@ -623,7 +623,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { if (filp->private_data) - bdev_release(filp->private_data); + bdev_release(filp); return 0; } -- cgit v1.2.3-58-ga151 From 321de651fa565dcf76c017b257bdf15ec7fff45d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:48 +0100 Subject: block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access Make it possible to detect a block device that was opened with restricted write access based only on BLK_OPEN_WRITE and bdev->bd_writers < 0, so we won't have to claim another FMODE_* flag. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-31-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- block/bdev.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index 4e4527c5df00..efecc9b97e1e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -799,16 +799,21 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } -static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) { + struct block_device *bdev; + if (bdev_allow_write_mounted) return; + bdev = file_bdev(bdev_file); /* Yield exclusive or shared write access. */ - if (mode & BLK_OPEN_RESTRICT_WRITES) - bdev_unblock_writes(bdev); - else if (mode & BLK_OPEN_WRITE) - bdev->bd_writers--; + if (mode & BLK_OPEN_WRITE) { + if (bdev_writes_blocked(bdev)) + bdev_unblock_writes(bdev); + else + bdev->bd_writers--; + } } /** @@ -1020,7 +1025,7 @@ void bdev_release(struct file *bdev_file) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - bdev_yield_write_access(bdev, handle->mode); + bdev_yield_write_access(bdev_file, handle->mode); if (handle->holder) bd_end_claim(bdev, handle->holder); -- cgit v1.2.3-58-ga151 From ab838b3fd9a442a62f36ea7eeb93e77259f787ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:49 +0100 Subject: block: remove bdev_handle completely We just need to use the holder to indicate whether a block device open was exclusive or not. We used to do that before, but had to give it up once we switched to struct bdev_handle. Before struct bdev_handle we only stashed stuff in file->private_data if this was an exclusive open; after struct bdev_handle we always set file->private_data to a struct bdev_handle, and so we had to use bdev_handle->mode or bdev_handle->holder.
Now that we don't use struct bdev_handle anymore we can revert back to the old behavior. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-32-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/bdev.c | 47 ++++++++++++++++++++++++----------------------- block/blk.h | 5 ----- block/fops.c | 21 ++++++++++----------- 3 files changed, 34 insertions(+), 39 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index efecc9b97e1e..140093c99bdc 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -717,6 +717,13 @@ int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) return -EINVAL; + /* + * We're using error pointers to indicate to ->release() when we + * failed to open that block device. Also this doesn't make sense. + */ + if (WARN_ON_ONCE(IS_ERR(holder))) + return -EINVAL; + return 0; } @@ -799,7 +806,7 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } -static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) +static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; @@ -808,7 +815,7 @@ static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) bdev = file_bdev(bdev_file); /* Yield exclusive or shared write access. */ - if (mode & BLK_OPEN_WRITE) { + if (bdev_file->f_mode & FMODE_WRITE) { if (bdev_writes_blocked(bdev)) bdev_unblock_writes(bdev); else @@ -836,25 +843,18 @@ static void bdev_yield_write_access(struct file *bdev_file, blk_mode_t mode) int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file) { - struct bdev_handle *handle; bool unblock_events = true; struct gendisk *disk = bdev->bd_disk; int ret; - handle = kmalloc(sizeof(struct bdev_handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) - goto free_handle; + return ret; } else { - if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { - ret = -EIO; - goto free_handle; - } + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) + return -EIO; } disk_block_events(disk); @@ -895,8 +895,6 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - handle->holder = holder; - handle->mode = mode; bdev_file->f_flags |= O_LARGEFILE; bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; @@ -904,7 +902,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_NOWAIT; bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); - bdev_file->private_data = handle; + bdev_file->private_data = holder; return 0; put_module: @@ -914,8 +912,6 @@ abort_claiming: bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); -free_handle: - kfree(handle); return ret; } @@ -976,7 +972,8 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, ret = bdev_open(bdev, mode, holder, hops, bdev_file); if (ret) { - blkdev_put_no_open(bdev); + /* We failed to open the block device. Let ->release() know. 
*/ + bdev_file->private_data = ERR_PTR(ret); fput(bdev_file); return ERR_PTR(ret); } @@ -1011,9 +1008,13 @@ EXPORT_SYMBOL(bdev_file_open_by_path); void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); - struct bdev_handle *handle = bdev_file->private_data; + void *holder = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; + /* We failed to open that block device. */ + if (IS_ERR(holder)) + goto put_no_open; + /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers @@ -1025,10 +1026,10 @@ void bdev_release(struct file *bdev_file) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - bdev_yield_write_access(bdev_file, handle->mode); + bdev_yield_write_access(bdev_file); - if (handle->holder) - bd_end_claim(bdev, handle->holder); + if (holder) + bd_end_claim(bdev, holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -1044,8 +1045,8 @@ void bdev_release(struct file *bdev_file) mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); +put_no_open: blkdev_put_no_open(bdev); - kfree(handle); } /** diff --git a/block/blk.h b/block/blk.h index 7ca24814f3a0..f02b25f22e8b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,11 +25,6 @@ struct blk_flush_queue { struct request *flush_rq; }; -struct bdev_handle { - void *holder; - blk_mode_t mode; -}; - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, diff --git a/block/fops.c b/block/fops.c index aab9b89e4c77..029e787f0119 100644 --- a/block/fops.c +++ b/block/fops.c @@ -569,18 +569,17 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; - struct bdev_handle *handle = file->private_data; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* - * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to - * determine whether the open was exclusive for already open files. + * do_dentry_open() clears O_EXCL from f_flags, use file->private_data + * to determine whether the open was exclusive for already open files. */ - if (handle) - mode |= handle->mode & BLK_OPEN_EXCL; + if (file->private_data) + mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) @@ -601,12 +600,13 @@ static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; - void *holder; int ret; mode = file_to_blk_mode(filp); - holder = mode & BLK_OPEN_EXCL ? filp : NULL; - ret = bdev_permission(inode->i_rdev, mode, holder); + /* Use the file as the holder. 
 */ if (mode & BLK_OPEN_EXCL) filp->private_data = filp; ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; @@ -614,7 +614,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; - ret = bdev_open(bdev, mode, holder, NULL, filp); + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; @@ -622,8 +622,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { - if (filp->private_data) - bdev_release(filp); + bdev_release(filp); return 0; } -- cgit v1.2.3-58-ga151 From ec30b461f3d067bd322a6c3a6c5105746ed9bf14 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 28 Feb 2024 12:08:57 +0800 Subject: blk-mq: don't change nr_hw_queues and nr_maps for kdump kernel For most architectures, 'nr_cpus=1' is passed for the kdump kernel, so nr_hw_queues for each mapping is supposed to be 1 already. More importantly, overriding these values may cause trouble for the driver, because blk-mq and the driver would see different queue mappings, since the driver has to set up its hardware queues before allocating the blk-mq tagset. So do not override nr_hw_queues and nr_maps for the kdump kernel. Cc: Wen Xiong Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228040857.306483-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7111bd4180e7..b7ba91d53c41 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4379,7 +4379,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; - if (set->ops->map_queues && !is_kdump_kernel()) { + if (set->ops->map_queues) { int i; /* @@ -4478,14 +4478,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) /* * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. + * memory constrained environment. Limit us to 64 tags to prevent + * using too much memory. */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->nr_maps = 1; + if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); - } + /* * There is no use for more h/w queues than cpus if we just have * a single map @@ -4515,7 +4513,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; - set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); -- cgit v1.2.3-58-ga151 From 631d4efb8009df64deae8c1382b8cf43879a4e22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:40 -0800 Subject: block: add a queue_limits_set helper Add a small wrapper around queue_limits_commit_update for stacking drivers that don't want to update existing limits, but set an entirely new set.
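For illustration, a rough usage sketch from a hypothetical stacking driver (the limit values are made up; blk_set_stacking_limits() and queue_limits_set() are the interfaces involved):

	struct queue_limits lim;
	int error;

	/* build an entirely new set of limits from scratch ... */
	blk_set_stacking_limits(&lim);
	lim.max_hw_sectors = 2048;		/* hypothetical value */
	lim.logical_block_size = 4096;		/* hypothetical value */

	/* ... and apply it wholesale; queue_limits_set() takes
	 * limits_lock and delegates validation to
	 * queue_limits_commit_update()
	 */
	error = queue_limits_set(q, &lim);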
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 18 ++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 19 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index b6bbe683d218..1989a177be20 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -266,6 +266,24 @@ int queue_limits_commit_update(struct request_queue *q, } EXPORT_SYMBOL_GPL(queue_limits_commit_update); +/** + * queue_limits_commit_set - apply queue limits to queue + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were freshly initialized to @q. + * To update existing limits use queue_limits_start_update() and + * queue_limits_commit_update() instead. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_set(struct request_queue *q, struct queue_limits *lim) +{ + mutex_lock(&q->limits_lock); + return queue_limits_commit_update(q, lim); +} +EXPORT_SYMBOL_GPL(queue_limits_set); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a14ea9344138..dd510ad7ce4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,6 +889,7 @@ queue_limits_start_update(struct request_queue *q) } int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); /* * Access functions for manipulating queue properties -- cgit v1.2.3-58-ga151 From c1373f1cf452e4c7553a9d3bc05d87ec15c4f85f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:41 -0800 Subject: block: add a queue_limits_stack_bdev helper Add a small wrapper around blk_stack_limits that allows passing a bdev for the bottom device and prints an error in case of misaligned device. The name fits into the new queue limits API and the intent is to eventually replace disk_stack_limits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 27 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 1989a177be20..865fe4ebbf9b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -891,6 +891,31 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * queue_limits_stack_bdev - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top device) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * @pfx: prefix to use for warnings logged + * + * Description: + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. 
+ */ +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx) +{ + if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + offset)) + pr_notice("%s: Warning: Device %pg is misaligned\n", + pfx, bdev); +} +EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dd510ad7ce4b..285e82723d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -- cgit v1.2.3-58-ga151 From 8e0ef412869430d114158fc3b9b1fb111e247bd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:42 -0800 Subject: dm: use queue_limits_set Use queue_limits_set which validates the limits and takes care of updating the readahead settings instead of directly assigning them to the queue. For that make sure all limits are actually updated before the assignment. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240228225653.947152-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- drivers/md/dm-table.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 865fe4ebbf9b..13865a9f8972 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -267,7 +267,7 @@ int queue_limits_commit_update(struct request_queue *q, EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** - * queue_limits_commit_set - apply queue limits to queue + * queue_limits_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply * diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..88114719fe18 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, bool wc = false, fua = false; int r; - /* - * Copy table's limits to the DM device's request_queue - */ - q->limits = *limits; - if (dm_table_supports_nowait(t)) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); else blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); if (!dm_table_supports_discards(t)) { - q->limits.max_discard_sectors = 0; - q->limits.max_hw_discard_sectors = 0; - q->limits.discard_granularity = 0; - q->limits.discard_alignment = 0; - q->limits.discard_misaligned = 0; + limits->max_hw_discard_sectors = 0; + limits->discard_granularity = 0; + limits->discard_alignment = 0; + limits->discard_misaligned = 0; } + if (!dm_table_supports_write_zeroes(t)) + limits->max_write_zeroes_sectors = 0; + if (!dm_table_supports_secure_erase(t)) - q->limits.max_secure_erase_sectors = 0; + limits->max_secure_erase_sectors = 0; + + r = queue_limits_set(q, limits); + if (r) + return r; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; @@ -2007,9 +2008,6 @@ int 
dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (!dm_table_supports_write_zeroes(t)) - q->limits.max_write_zeroes_sectors = 0; - dm_table_verify_integrity(t); /* @@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_crypto_profile(q, t); - disk_update_readahead(t->md->disk); /* * Check for request-based device is left to -- cgit v1.2.3-58-ga151 From 38b43539d64b2fa020b3b9a752a986769f87f7a6 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Thu, 29 Feb 2024 13:08:09 -0500 Subject: block: Fix page refcounts for unaligned buffers in __bio_release_pages() Fix an incorrect number of pages being released for buffers that do not start at the beginning of a page. Fixes: 1b151e2435fc ("block: Remove special-casing of compound pages") Cc: stable@vger.kernel.org Signed-off-by: Tony Battersby Tested-by: Greg Edwards Link: https://lore.kernel.org/r/86e592a9-98d4-4cff-a646-0c0084328356@cybernetics.com Signed-off-by: Jens Axboe --- block/bio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 496867b51609..a8b691940027 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1153,7 +1153,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_folio_all(fi, bio) { struct page *page; - size_t done = 0; + size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); @@ -1161,10 +1161,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) folio_unlock(fi.folio); } page = folio_page(fi.folio, fi.offset / PAGE_SIZE); + nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - + fi.offset / PAGE_SIZE + 1; do { bio_release_page(bio, page++); - done += PAGE_SIZE; - } while (done < fi.length); + } while (--nr_pages != 0); } } EXPORT_SYMBOL_GPL(__bio_release_pages); -- cgit v1.2.3-58-ga151 From f8c7511db009d42e2c24e48eeb04e3f1b67ab209 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:32:16 -0300 Subject: block: make block_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so declare the block_class structure at build time, placing it into read-only memory instead of having it dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B.
Marliere Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305-class_cleanup-block-v1-1-130bb27b9c72@marliere.net Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- drivers/base/base.h | 2 +- include/linux/blkdev.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 84c822d989da..a214f9cf3a35 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } -struct class block_class = { +const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; diff --git a/drivers/base/base.h b/drivers/base/base.h index eb4c0ace9242..0738ccad08b2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK -extern struct class block_class; +extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..19c7596f4ebf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. -- cgit v1.2.3-58-ga151 From 147fe6133477b9654f0ede339ef9267425955e88 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 6 Mar 2024 17:56:08 +0800 Subject: sed-opal: Remove unnecessary ‘0’ values from error error is assigned before it is read, so it does not need an initializer.
Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20240306095608.26839-1-zeming@nfschina.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index 3d9e9cd250bd..e5b069dde905 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1208,7 +1208,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method) static int start_opal_session_cont(struct opal_dev *dev) { u32 hsn, tsn; - int error = 0; + int error; error = parse_and_check_status(dev); if (error) @@ -1350,7 +1350,7 @@ static int get_active_key_cont(struct opal_dev *dev) { const char *activekey; size_t keylen; - int error = 0; + int error; error = parse_and_check_status(dev); if (error) -- cgit v1.2.3-58-ga151 From 217fcc48074be4f526190675140a478ee795d90d Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 6 Mar 2024 18:02:16 +0800 Subject: sed-opal: Remove unnecessary ‘0’ values from err err is assigned before it is read, so it does not need an initializer. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20240306100216.69340-1-zeming@nfschina.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index e5b069dde905..b6887920a84e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2153,7 +2153,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) u8 lr_buffer[OPAL_UID_LENGTH]; struct opal_lock_unlock *lkul = data; u8 read_locked = 1, write_locked = 1; - int err = 0; + int err; if (build_locking_range(lr_buffer, sizeof(lr_buffer), lkul->session.opal_key.lr) < 0) -- cgit v1.2.3-58-ga151 From 2449be8c8cfcbb24b3cd15d8b55c2c91041c847b Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 6 Mar 2024 18:06:59 +0800 Subject: sed-opal: Remove unnecessary ‘0’ values from ret ret is assigned before it is read, so it does not need an initializer. Signed-off-by: Li kunyu Link: https://lore.kernel.org/r/20240306100659.106521-1-kunyu@nfschina.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index b6887920a84e..f62b7ee933f7 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2576,7 +2576,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) const struct opal_step discovery0_step = { opal_discovery0, discv }; - int ret = 0; + int ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -3065,7 +3065,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) { struct opal_suspend_data *suspend; bool was_failure = false; - int ret = 0; + int ret; if (!dev) return false; -- cgit v1.2.3-58-ga151 From 5f2ad31fbb18565645f121b413837f260748e963 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 6 Mar 2024 18:14:44 +0800 Subject: sed-opal: Remove the ret variable from the function The ret variable is only ever returned as its initial value of 0, so it can be removed. Signed-off-by: Li kunyu Link: https://lore.kernel.org/r/20240306101444.1244-1-kunyu@nfschina.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index f62b7ee933f7..25fdbb7e14ae 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3108,10 +3108,9 @@ static int opal_read_table(struct opal_dev *dev, { read_table_data, rw_tbl }, { end_opal_session, } }; - int ret = 0; if (!rw_tbl->size) - return ret; + return 0; return execute_steps(dev, read_table_steps, ARRAY_SIZE(read_table_steps)); @@ -3125,10 +3124,9 @@ static int opal_write_table(struct opal_dev *dev, { write_table_data, rw_tbl }, { end_opal_session, } }; - int ret = 0; if (!rw_tbl->size) - return ret; + return 0; return execute_steps(dev, write_table_steps, ARRAY_SIZE(write_table_steps)); -- cgit v1.2.3-58-ga151 From 93f52fbeaf4b676b21acfe42a5152620e6770d02 Mon Sep 17 00:00:00 2001 From: Roman Smirnov Date: Tue, 5 Mar 2024 16:45:09 +0300 Subject: block: prevent division by zero in blk_rq_stat_sum() The expression dst->nr_samples + src->nr_samples can wrap around to zero on overflow. Add a check to avoid a division by zero. Found by Linux Verification Center (linuxtesting.org) with Svace.
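To see why the new guard works, a worked sketch (assuming u32 counters, as in struct blk_rq_stat): unsigned addition wraps modulo 2^32, so dst + src <= dst holds exactly when src is 0 or the sum overflowed:

	u32 dst_samples = 4294967295U;		/* U32_MAX */
	u32 src_samples = 1U;
	u32 sum = dst_samples + src_samples;	/* wraps around to 0 */

	/* catches both the src == 0 case and the wraparound, so the
	 * mean calculation further down never divides by a zero sum
	 */
	if (sum <= dst_samples)
		return;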
Signed-off-by: Roman Smirnov Reviewed-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20240305134509.23108-1-r.smirnov@omp.ru Signed-off-by: Jens Axboe --- block/blk-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index 7ff76ae6c76a..e42c263e53fb 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat) /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - if (!src->nr_samples) + if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); -- cgit v1.2.3-58-ga151 From b9355185d2aeef11f6e365dd98def82212637936 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Tue, 5 Mar 2024 11:21:32 +0800 Subject: block: move capacity validation to blkpg_do_ioctl() Commit 6d4e80db4ebe ("block: add capacity validation in bdev_add_partition()") added a check of the partition's start and end sectors to prevent them from exceeding the size of the disk when adding partitions. However, there is still no such check when resizing partitions. Move the check to blkpg_do_ioctl() so that it also covers resizing. Signed-off-by: Li Lingfeng Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305032132.548958-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/ioctl.c | 9 ++++++++- block/partitions/core.c | 11 ----------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 438f79c564cf..de0cc0d215c6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; - sector_t start, length; + sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; + capacity = get_capacity(disk); + + if (check_add_overflow(start, length, &end)) + return -EINVAL; + + if (start >= capacity || end > capacity) + return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: diff --git a/block/partitions/core.c b/block/partitions/core.c index 5f5ed5c75f04..b11e88c82c8c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -419,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { - sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); - if (check_add_overflow(start, length, &end)) { - ret = -EINVAL; - goto out; - } - - if (start >= capacity || end > capacity) { - ret = -EINVAL; - goto out; - } - if (!disk_live(disk)) { ret = -ENXIO; goto out; -- cgit v1.2.3-58-ga151 From dd27a84b06aa9ea6a94b0f3e59dc768f981962e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2024 07:01:50 -0700 Subject: block: remove disk_stack_limits disk_stack_limits is unused now, remove it.
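For stacking drivers that still need the old behaviour, the equivalent under the queue_limits API looks roughly like this (a sketch, not code from this series; note that disk_stack_limits() took a byte offset while queue_limits_stack_bdev() takes sectors):

	struct queue_limits lim;
	int error;

	/* old: disk_stack_limits(disk, bdev, offset); */
	lim = queue_limits_start_update(disk->queue);
	queue_limits_stack_bdev(&lim, bdev, offset >> SECTOR_SHIFT,
				disk->disk_name);
	error = queue_limits_commit_update(disk->queue, &lim);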
Signed-off-by: Christoph Hellwig Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240303140150.5435-12-hch@lst.de --- block/blk-settings.c | 24 ------------------------ include/linux/blkdev.h | 2 -- 2 files changed, 26 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 13865a9f8972..3c7d8d638ab5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -916,30 +916,6 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); -/** - * disk_stack_limits - adjust queue limits for stacked drivers - * @disk: MD/DM gendisk (top) - * @bdev: the underlying block device (bottom) - * @offset: offset to beginning of data within component device - * - * Description: - * Merges the limits for a top level gendisk and a bottom level - * block_device. - */ -void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset) -{ - struct request_queue *t = disk->queue; - - if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) - pr_notice("%s: Warning: Device %pg is misaligned\n", - disk->disk_name, bdev); - - disk_update_readahead(disk); -} -EXPORT_SYMBOL(disk_stack_limits); - /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..75c909865a8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -926,8 +926,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -- cgit v1.2.3-58-ga151 From 5205a4aa8fc9454853b705b69611c80e9c644283 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Mar 2024 13:39:21 +0000 Subject: block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC The helper function mac_fix_string is only required with CONFIG_PPC_PMAC; add #ifdef CONFIG_PPC_PMAC and #endif around the function. Cleans up a clang scan-build warning: block/partitions/mac.c:23:20: warning: unused function 'mac_fix_string' [-Wunused-function] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240308133921.2058227-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- block/partitions/mac.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/partitions/mac.c b/block/partitions/mac.c index 7b521df00a39..c80183156d68 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness); * Code to understand MacOS partition tables.
*/ +#ifdef CONFIG_PPC_PMAC static inline void mac_fix_string(char *stg, int len) { int i; @@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len) for (i = len - 1; i >= 0 && stg[i] == ' '; i--) stg[i] = 0; } +#endif int mac_partition(struct parsed_partitions *state) { -- cgit v1.2.3-58-ga151 From bff4b74625fea851f9dd61e747a162d2f6b3317e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 11 Mar 2024 17:11:28 -0700 Subject: Revert "dm: use queue_limits_set" This reverts commit 8e0ef412869430d114158fc3b9b1fb111e247bd3. It's broken, and causes the boot to fail on encrypted volumes. Reported-and-bisected-by: Johannes Weiner Link: https://lore.kernel.org/all/20240311235023.GA1205@cmpxchg.org/ Acked-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blk-settings.c | 2 +- drivers/md/dm-table.c | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 3c7d8d638ab5..e160d56e8eda 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -267,7 +267,7 @@ int queue_limits_commit_update(struct request_queue *q, EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** - * queue_limits_set - apply queue limits to queue + * queue_limits_commit_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply * diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 88114719fe18..41f1d731ae5a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1963,27 +1963,26 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, bool wc = false, fua = false; int r; + /* + * Copy table's limits to the DM device's request_queue + */ + q->limits = *limits; + if (dm_table_supports_nowait(t)) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); else blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); if (!dm_table_supports_discards(t)) { - limits->max_hw_discard_sectors = 0; - limits->discard_granularity = 0; - limits->discard_alignment = 0; - limits->discard_misaligned = 0; + q->limits.max_discard_sectors = 0; + q->limits.max_hw_discard_sectors = 0; + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.discard_misaligned = 0; } - if (!dm_table_supports_write_zeroes(t)) - limits->max_write_zeroes_sectors = 0; - if (!dm_table_supports_secure_erase(t)) - limits->max_secure_erase_sectors = 0; - - r = queue_limits_set(q, limits); - if (r) - return r; + q->limits.max_secure_erase_sectors = 0; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; @@ -2008,6 +2007,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + if (!dm_table_supports_write_zeroes(t)) + q->limits.max_write_zeroes_sectors = 0; + dm_table_verify_integrity(t); /* @@ -2045,6 +2047,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_crypto_profile(q, t); + disk_update_readahead(t->md->disk); /* * Check for request-based device is left to -- cgit v1.2.3-58-ga151 From b874d4aae58b92144ec2c8fa5dc0a27c98388fcc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 15:58:41 -0600 Subject: block: limit block time caching to in_task() context We should not have any callers of this from non-task context, but Jakub ran [1] into one from blk-iocost. Rather than risk running into others, or future ones, just limit blk_time_get_ns() to when it is called from a task. Any other usage is invalid. 
[1] https://lore.kernel.org/lkml/CAHk-=wiOaBLqarS2uFhM1YdwOvCX4CZaWkeyNDY1zONpbYw2ig@mail.gmail.com/ Fixes: da4c8c3d0975 ("block: cache current nsec time in struct blk_plug") Reported-by: Jakub Kicinski Signed-off-by: Jens Axboe --- block/blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index a19b7b42e650..5cac4e29ae17 100644 --- a/block/blk.h +++ b/block/blk.h @@ -534,7 +534,7 @@ static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug; - if (!plug) + if (!plug || !in_task()) return ktime_get_ns(); /* -- cgit v1.2.3-58-ga151 From 256aab46e31683d76d45ccbedc287b4d3f3e322b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 13 Mar 2024 14:42:18 -0700 Subject: Revert "block/mq-deadline: use correct way to throttling write requests" The code "max(1U, 3 * (1U << shift) / 4)" comes from the Kyber I/O scheduler. The Kyber I/O scheduler maintains one internal queue per hwq and hence derives its async_depth from the number of hwq tags. Using this approach for the mq-deadline scheduler is wrong since the mq-deadline scheduler maintains one internal queue for all hwqs combined. Hence this revert. Cc: stable@vger.kernel.org Cc: Damien Le Moal Cc: Harshit Mogalapalli Cc: Zhiguo Niu Fixes: d47f9717e5cf ("block/mq-deadline: use correct way to throttling write requests") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240313214218.1736147-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f958e79277b8..02a916ba62ee 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,9 +646,8 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1U, 3 * (1U << shift) / 4); + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } -- cgit v1.2.3-58-ga151 From bf5e3a30f777b6e763f6a46c10e1250c20237e97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2024 19:16:23 -0700 Subject: Revert "blk-lib: check for kill signal" This reverts commit 8a08c5fd89b447a7de7eb293a7a274c46b932ba2. It turns out while this is a perfectly valid and long overdue thing to do for user initiated discards / zeroing from the ioctl handler, it actually breaks file system use of the discard helper by interrupting in places the file system doesn't expect, and by leaving the bio chain in a state that the file system callers of (at least) __blkdev_issue_discard do not expect. Revert the change for now, we'll redo it for the next merge window after refactoring the code to better split the file system vs ioctl callers and cleaning up a few other loose ends. 
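To illustrate the breakage, the usual file system calling convention looks roughly like this (a sketch, not code from this patch):

	struct bio *bio = NULL;
	int error;

	/* may be called repeatedly, growing the chain hanging off 'bio' */
	error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, &bio);
	if (bio) {
		/* with the reverted change, the chain could already have
		 * been ended via await_bio_chain() when error == -EINTR,
		 * so submitting it here was invalid
		 */
		error = submit_bio_wait(bio);
		bio_put(bio);
	}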
Reported-by: Chandan Babu R Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Link: https://lore.kernel.org/r/20240314021623.1908895-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 40 +--------------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index dc8e35d0a51d..a6954eafb8c8 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,26 +35,6 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } -static void await_bio_endio(struct bio *bio) -{ - complete(bio->bi_private); - bio_put(bio); -} - -/* - * await_bio_chain - ends @bio and waits for every chained bio to complete - */ -static void await_bio_chain(struct bio *bio) -{ - DECLARE_COMPLETION_ONSTACK_MAP(done, - bio->bi_bdev->bd_disk->lockdep_map); - - bio->bi_private = &done; - bio->bi_end_io = await_bio_endio; - bio_endio(bio); - blk_wait_io(&done); -} - int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { @@ -97,10 +77,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, * is disabled. */ cond_resched(); - if (fatal_signal_pending(current)) { - await_bio_chain(bio); - return -EINTR; - } } *biop = bio; @@ -167,10 +143,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, nr_sects -= len; sector += len; cond_resched(); - if (fatal_signal_pending(current)) { - await_bio_chain(bio); - return -EINTR; - } } *biop = bio; @@ -215,10 +187,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, break; } cond_resched(); - if (fatal_signal_pending(current)) { - await_bio_chain(bio); - return -EINTR; - } } *biop = bio; @@ -309,7 +277,7 @@ retry: bio_put(bio); } blk_finish_plug(&plug); - if (ret && ret != -EINTR && try_write_zeroes) { + if (ret && try_write_zeroes) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { try_write_zeroes = false; goto retry; @@ -361,12 +329,6 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector += len; nr_sects -= len; cond_resched(); - if (fatal_signal_pending(current)) { - await_bio_chain(bio); - ret = -EINTR; - bio = NULL; - break; - } } if (bio) { ret = submit_bio_wait(bio); -- cgit v1.2.3-58-ga151 From 4c4ab8ae416350ce817339f239bdaaf351212f15 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 14 Mar 2024 10:56:15 +0800 Subject: block: fix mismatched kerneldoc function name No functional modification involved. block/blk-settings.c:281: warning: expecting prototype for queue_limits_commit_set(). Prototype was for queue_limits_set() instead. 
Reported-by: Abaci Robot
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8539
Signed-off-by: Jiapeng Chong
Reviewed-by: Randy Dunlap
Link: https://lore.kernel.org/r/20240314025615.71269-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 block/blk-settings.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index e160d56e8eda..3c7d8d638ab5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -267,7 +267,7 @@ int queue_limits_commit_update(struct request_queue *q,
 EXPORT_SYMBOL_GPL(queue_limits_commit_update);

 /**
- * queue_limits_commit_set - apply queue limits to queue
+ * queue_limits_set - apply queue limits to queue
  * @q: queue to update
  * @lim: limits to apply
  *
-- cgit v1.2.3-58-ga151

From 59a55a63c24624c7ad268f12c8f82d142ef6a6d4 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Thu, 14 Mar 2024 15:24:13 +0100
Subject: fs,block: get holder during claim

Now that we open block devices as files we need to deal with the
reality that closing is a deferred operation. An operation on the block
device, such as freeze, thaw, or removal, that runs concurrently with
umount tries to acquire a stable reference on the holder. The holder
might already be gone though.

Make that reliable by grabbing a passive reference to the holder during
bdev_open() and releasing it during bdev_release().

Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only
Reported-by: Christoph Hellwig
Link: https://lore.kernel.org/r/ZfEQQ9jZZVes0WCZ@infradead.org
Reviewed-by: Jan Kara
Reviewed-by: Christoph Hellwig
Tested-by: Yi Zhang
Reported-by: https://lore.kernel.org/r/CAHj4cs8tbDwKRwfS1=DmooP73ysM__xAb2PQc6XsAmWR+VuYmg@mail.gmail.com
Link: https://lore.kernel.org/r/20240315-freibad-annehmbar-ca68c375af91@brauner
Signed-off-by: Christian Brauner
---
 block/bdev.c           |  7 +++++++
 fs/super.c             | 18 ++++++++++++++++++
 include/linux/blkdev.h | 10 ++++++++++
 3 files changed, 35 insertions(+)

(limited to 'block')

diff --git a/block/bdev.c b/block/bdev.c
index e7adaaf1c219..7a5f611c3d2e 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -583,6 +583,9 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder,
 	mutex_unlock(&bdev->bd_holder_lock);
 	bd_clear_claiming(whole, holder);
 	mutex_unlock(&bdev_lock);
+
+	if (hops && hops->get_holder)
+		hops->get_holder(holder);
 }

 /**
@@ -605,6 +608,7 @@ EXPORT_SYMBOL(bd_abort_claiming);
 static void bd_end_claim(struct block_device *bdev, void *holder)
 {
 	struct block_device *whole = bdev_whole(bdev);
+	const struct blk_holder_ops *hops = bdev->bd_holder_ops;
 	bool unblock = false;

 	/*
@@ -627,6 +631,9 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
 	whole->bd_holder = NULL;
 	mutex_unlock(&bdev_lock);

+	if (hops && hops->put_holder)
+		hops->put_holder(holder);
+
 	/*
 	 * If this was the last claim, remove holder link and unblock evpoll if
 	 * it was a write holder.
diff --git a/fs/super.c b/fs/super.c
index ee05ab6b37e7..71d9779c42b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1515,11 +1515,29 @@ static int fs_bdev_thaw(struct block_device *bdev)
 	return error;
 }

+static void fs_bdev_super_get(void *data)
+{
+	struct super_block *sb = data;
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+}
+
+static void fs_bdev_super_put(void *data)
+{
+	struct super_block *sb = data;
+
+	put_super(sb);
+}
+
 const struct blk_holder_ops fs_holder_ops = {
 	.mark_dead	= fs_bdev_mark_dead,
 	.sync		= fs_bdev_sync,
 	.freeze		= fs_bdev_freeze,
 	.thaw		= fs_bdev_thaw,
+	.get_holder	= fs_bdev_super_get,
+	.put_holder	= fs_bdev_super_put,
 };
 EXPORT_SYMBOL_GPL(fs_holder_ops);

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f9b87c39cab0..c3e8f7cf96be 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1505,6 +1505,16 @@ struct blk_holder_ops {
 	 * Thaw the file system mounted on the block device.
 	 */
 	int (*thaw)(struct block_device *bdev);
+
+	/*
+	 * If needed, get a reference to the holder.
+	 */
+	void (*get_holder)(void *holder);
+
+	/*
+	 * Release the holder.
+	 */
+	void (*put_holder)(void *holder);
 };

 /*
-- cgit v1.2.3-58-ga151
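As a usage illustration of the new hooks, a hedged sketch of a holder
that pins its own context across a deferred close. Everything named
my_* below is hypothetical; only struct blk_holder_ops and its
get_holder/put_holder members come from the patch above:

#include <linux/blkdev.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_holder_ctx {
	struct kref ref;
	/* ... state that concurrent ->freeze/->thaw callbacks touch ... */
};

/* Called from bd_finish_claiming() once the claim succeeds. */
static void my_get_holder(void *holder)
{
	struct my_holder_ctx *ctx = holder;

	kref_get(&ctx->ref);
}

static void my_holder_free(struct kref *ref)
{
	kfree(container_of(ref, struct my_holder_ctx, ref));
}

/* Called from bd_end_claim() when the last claim is dropped. */
static void my_put_holder(void *holder)
{
	struct my_holder_ctx *ctx = holder;

	kref_put(&ctx->ref, my_holder_free);
}

static const struct blk_holder_ops my_holder_ops = {
	.get_holder	= my_get_holder,
	.put_holder	= my_put_holder,
};

An open via bdev_file_open_by_dev(dev, mode, ctx, &my_holder_ops) would
then keep ctx alive for any operation racing with the holder's release,
mirroring what fs_holder_ops does with sb->s_count in the patch above.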