From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index b38e962fa83e..8338304ea256 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2032,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio -- cgit v1.2.3-58-ga151 From 547248736ae54b73e713ee259a81ab3a637cb2f7 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 15:01:48 +0800 Subject: blk-mq: remove unused function hctx_allow_merges since 9bddeb2a5b981 "blk-mq: make per-sw-queue bio merge as default .bio_merge" there is no caller for this function. Reviewed-by: Ming Lei Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a18609755e..59687ed6561b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1504,12 +1504,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, true); } -static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -{ - return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !blk_queue_nomerges(hctx->queue); -} - static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) -- cgit v1.2.3-58-ga151 From 5385fa47d883dc7d51de622835427bc2558ed43c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 Oct 2017 01:26:21 -0600 Subject: blk-mq-tag: kill unused tag enums We don't have any notion of a tagging cache anymore, and haven't for a long time. Kill off the unused enums. 
Signed-off-by: Jens Axboe --- block/blk-mq-tag.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5cb51e53cc03..5932a7ac7fc4 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -43,14 +43,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, return sbq_wait_ptr(bt, &hctx->wait_index); } -enum { - BLK_MQ_TAG_CACHE_MIN = 1, - BLK_MQ_TAG_CACHE_MAX = 64, -}; - enum { BLK_MQ_TAG_FAIL = -1U, - BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; -- cgit v1.2.3-58-ga151 From 7beb2f845b715cb98584cf630e9a9d5b05501166 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:08:24 -0600 Subject: blk-mq: wire up completion notifier for laptop mode For some reason, the laptop mode IO completion notified was never wired up for blk-mq. Ensure that we trigger the callback appropriately, to arm the laptop mode flush timer. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 59687ed6561b..7f01d69879d6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -476,6 +476,9 @@ void blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + wbt_done(q->rq_wb, &rq->issue_stat); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -- cgit v1.2.3-58-ga151 From 4baa8bb13f41307f3eb62fe91f93a1a798ebef53 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:00 +0200 Subject: block, bfq: fix wrong init of saved start time for weight raising This commit fixes a bug that causes bfq to fail to guarantee a high responsiveness on some drives, if there is heavy random read+write I/O in the background. More precisely, such a failure allowed this bug to be found [1], but the bug may well cause other yet unreported anomalies. BFQ raises the weight of the bfq_queues associated with soft real-time applications, to privilege the I/O, and thus reduce latency, for these applications. This mechanism is named soft-real-time weight raising in BFQ. A soft real-time period may happen to be nested into an interactive weight raising period, i.e., it may happen that, when a bfq_queue switches to a soft real-time weight-raised state, the bfq_queue is already being weight-raised because deemed interactive too. In this case, BFQ saves in a special variable wr_start_at_switch_to_srt, the time instant when the interactive weight-raising period started for the bfq_queue, i.e., the time instant when BFQ started to deem the bfq_queue interactive. This value is then used to check whether the interactive weight-raising period would still be in progress when the soft real-time weight-raising period ends. If so, interactive weight raising is restored for the bfq_queue. This restore is useful, in particular, because it prevents bfq_queues from losing their interactive weight raising prematurely, as a consequence of spurious, short-lived soft real-time weight-raising periods caused by wrong detections as soft real-time. 
If, instead, a bfq_queue switches to soft-real-time weight raising while it *is not* already in an interactive weight-raising period, then the variable wr_start_at_switch_to_srt has no meaning during the following soft real-time weight-raising period. Unfortunately the handling of this case is wrong in BFQ: not only the variable is not flagged somehow as meaningless, but it is also set to the time when the switch to soft real-time weight-raising occurs. This may cause an interactive weight-raising period to be considered mistakenly as still in progress, and thus a spurious interactive weight-raising period to start for the bfq_queue, at the end of the soft-real-time weight-raising period. In particular the spurious interactive weight-raising period will be considered as still in progress, if the soft-real-time weight-raising period does not last very long. The bfq_queue will then be wrongly privileged and, if I/O bound, will unjustly steal bandwidth to truly interactive or soft real-time bfq_queues, harming responsiveness and low latency. This commit fixes this issue by just setting wr_start_at_switch_to_srt to minus infinity (farthest past time instant according to jiffies macros): when the soft-real-time weight-raising period ends, certainly no interactive weight-raising period will be considered as still in progress. [1] Background I/O Type: Random - Background I/O mix: Reads and writes - Application to start: LibreOffice Writer in http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Tested-by: Mirko Montanari Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a4783da90ba8..c25955c25e03 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1202,6 +1202,24 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1216,7 +1234,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -2897,24 +2927,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. 
- */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. -- cgit v1.2.3-58-ga151 From 3e2bdd6dff239afd8386e8758eee69ad61e5a3d6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:01 +0200 Subject: block, bfq: check and switch back to interactive wr also on queue split As already explained in the message of commit "block, bfq: fix wrong init of saved start time for weight raising", if a soft real-time weight-raising period happens to be nested in a larger interactive weight-raising period, then BFQ restores the interactive weight raising at the end of the soft real-time weight raising. In particular, BFQ checks whether the latter has ended only on request dispatches. Unfortunately, the above scheme fails to restore interactive weight raising in the following corner case: if a bfq_queue, say Q, 1) Is merged with another bfq_queue while it is in a nested soft real-time weight-raising period. The weight-raising state of Q is then saved, and not considered any longer until a split occurs. 2) Is split from the other bfq_queue(s) at a time instant when its soft real-time weight raising is already finished. On the split, while resuming the previous, soft real-time weight-raised state of the bfq_queue Q, BFQ checks whether the current soft real-time weight-raising period is actually over. If so, BFQ switches weight raising off for Q, *without* checking whether the soft real-time period was actually nested in a non-yet-finished interactive weight-raising period. This commit addresses this issue by adding the above missing check in bfq_queue splits, and restoring interactive weight raising if needed. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 87 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c25955c25e03..33b63bc4a64b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -724,6 +724,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. 
+ */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -750,10 +788,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr"); - - bfqq->wr_coeff = 1; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } } /* make sure weight will be updated, however we got here */ @@ -1173,35 +1217,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; -} - /* * Return the farthest future time instant according to jiffies * macros. @@ -3501,11 +3516,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } } -- cgit v1.2.3-58-ga151 From 894df937e06a56ed6f054a75a416aff84147c5a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:02 +0200 Subject: block, bfq: let early-merged queues be weight-raised on split too A just-created bfq_queue, say Q, may happen to be merged with another bfq_queue on the very first invocation of the function __bfq_insert_request. In such a case, even if Q would clearly deserve interactive weight raising (as it has just been created), the function bfq_add_request does not make it to be invoked for Q, and thus to activate weight raising for Q. 
As a consequence, when the state of Q is saved for a possible future restore, after a split of Q from the other bfq_queue(s), such a state happens to be (unjustly) non-weight-raised. Then the bfq_queue will not enjoy any weight raising on the split, even if should still be in an interactive weight-raising period when the split occurs. This commit solves this problem as follows, for a just-created bfq_queue that is being early-merged: it stores directly, in the saved state of the bfq_queue, the weight-raising state that would have been assigned to the bfq_queue if not early-merged. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 33b63bc4a64b..115747fe43c8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2061,10 +2061,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } } static void @@ -4150,7 +4167,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->allocated++; bfqq->allocated--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4162,6 +4178,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq -- cgit v1.2.3-58-ga151 From 7cb04004fa371a626c1a5ebe6d977f70285759ed Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:03 +0200 Subject: block, bfq: decrease burst size when queues in burst exit If many queues belonging to the same group happen to be created shortly after each other, then the concurrent processes associated with these queues have typically a common goal, and they get it done as soon as possible if not hampered by device idling. Examples are processes spawned by git grep, or by systemd during boot. 
As for device idling, this mechanism is currently necessary for weight raising to succeed in its goal: privileging I/O. In view of these facts, BFQ does not provide the above queues with either weight raising or device idling. On the other hand, a burst of queue creations may be caused also by the start-up of a complex application. In this case, these queues usually need to be served one after the other, and as quickly as possible, to maximise responsiveness. Therefore, in this case the best strategy is to weight-raise all the queues created during the burst, i.e., the exact opposite of the strategy for the above case. To distinguish between the two cases, BFQ uses an empirical burst-size threshold, found through extensive tests and monitoring of daily usage. Only large bursts, i.e., bursts with a size above this threshold, are considered as generated by a high number of parallel processes. In this respect, upstart-based boot proved to be rather hard to detect as generating a large burst of queue creations, because with upstart most of the queues created in a burst exit *before* the next queues in the same burst are created. To address this issue, I changed the burst-detection mechanism so as to not decrease the size of the current burst even if one of the queues in the burst is eliminated. Unfortunately, this missing decrease causes false positives on very fast systems: on the start-up of a complex application, such as libreoffice writer, so many queues are created, served and exited shortly after each other, that a large burst of queue creations is wrongly detected as occurring. These false positives just disappear if the size of a burst is decreased when one of the queues in the burst exits. This commit restores the missing burst-size decrease, relying on the fact that upstart is apparently unlikely to be used on systems running this and future versions of the kernel. Signed-off-by: Paolo Valente Signed-off-by: Mauro Andreolini Signed-off-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 115747fe43c8..70f9177c4f5b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3725,16 +3725,10 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED -- cgit v1.2.3-58-ga151 From 9c9883744dda1cc38339a448dd8435140537027e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2017 10:47:00 +0200 Subject: block: move __elv_next_request to blk-core.c No need to have this helper inline in a header. Also drop the __ prefix. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++++++++-- block/blk.h | 39 --------------------------------------- 2 files changed, 40 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 048be4aa6024..14f7674fa0b1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2517,6 +2517,45 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +static struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + + WARN_ON_ONCE(q->mq_ops); + + while (1) { + if (!list_empty(&q->queue_head)) { + rq = list_entry_rq(q->queue_head.next); + return rq; + } + + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (fq->flush_pending_idx != fq->flush_running_idx && + !queue_flush_queueable(q)) { + fq->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) + return NULL; + } +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2538,8 +2577,7 @@ struct request *blk_peek_request(struct request_queue *q) lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); - while ((rq = __elv_next_request(q)) != NULL) { - + while ((rq = elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); if (!rq) break; diff --git a/block/blk.h b/block/blk.h index fcb9775b997d..fda5a4632aba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -148,45 +148,6 @@ static inline void blk_clear_rq_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. 
- */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; -- cgit v1.2.3-58-ga151 From a7af0af32171c17d881e3e58b0925c4a44fb5a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Sep 2017 10:00:22 +0200 Subject: blk-mq: attempt to fix atomic flag memory ordering Attempt to untangle the ordering in blk-mq. The patch introducing the single smp_mb__before_atomic() is obviously broken in that it doesn't clearly specify a pairing barrier and an obtained guarantee. The comment is further misleading in that it hints that the deadline store and the COMPLETE store also need to be ordered, but AFAICT there is no such dependency. However what does appear to be important is the clear happening _after_ the store, and that worked by pure accident. This clarifies blk_mq_start_request() -- we should not get there with STARTING set -- this simplifies the code and makes the barrier usage sane (the old code could be read to allow not having _any_ atomic after the barrier, in which case the barrier hasn't got anything to order). We then also introduce the missing pairing barrier for it. Also down-grade the barrier to smp_wmb(), this is cheaper for PowerPC/ARM and doesn't cost anything extra on x86. And it documents the STARTING vs COMPLETE ordering. Although I've not been entirely successful in reverse engineering the blk-mq state machine so there might still be more funnies around timeout vs requeue. If I got anything wrong, feel free to educate me by adding comments to clarify things ;-) Cc: Alan Stern Cc: Will Deacon Cc: Ming Lei Cc: Christoph Hellwig Cc: Andrea Parri Cc: Boqun Feng Cc: Bart Van Assche Cc: "Paul E. McKenney" Fixes: 538b75341835 ("blk-mq: request deadline must be visible before marking rq as started") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ block/blk-timeout.c | 2 +- 2 files changed, 41 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7f01d69879d6..a7b46b754a7c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -596,22 +596,32 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); - /* - * Ensure that ->deadline is visible before set the started - * flag and clear the completed flag. - */ - smp_mb__before_atomic(); + WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as * complete. So be sure to clear complete again when we start * the request, otherwise we'll ignore the completion event. + * + * Ensure that ->deadline is visible before we set STARTED, such that + * blk_mq_check_expired() is guaranteed to observe our ->deadline when + * it observes STARTED. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + smp_wmb(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Coherence order guarantees these consecutive stores to a + * single variable propagate in the specified order. 
Thus the + * clear_bit() is ordered _after_ the set bit. See + * blk_mq_check_expired(). + * + * (the bits must be part of the same byte for this to be + * true). + */ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -781,10 +791,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; + unsigned long deadline; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * Ensures that if we see STARTED we must also see our + * up-to-date deadline, see blk_mq_start_request(). + */ + smp_rmb(); + + deadline = READ_ONCE(rq->deadline); + /* * The rq being checked may have been freed and reallocated * out already here, we avoid this race by checking rq->deadline @@ -798,11 +817,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * and clearing the flag in blk_mq_start_request(), so * this rq won't be timed out too. */ - if (time_after_eq(jiffies, rq->deadline)) { - if (!blk_mark_rq_complete(rq)) + if (time_after_eq(jiffies, deadline)) { + if (!blk_mark_rq_complete(rq)) { + /* + * Again coherence order ensures that consecutive reads + * from the same variable must be in that order. This + * ensures that if we see COMPLETE clear, we must then + * see STARTED set and we'll ignore this timeout. + * + * (There's also the MB implied by the test_and_clear()) + */ blk_mq_rq_timed_out(rq, reserved); - } else if (!data->next_set || time_after(data->next, rq->deadline)) { - data->next = rq->deadline; + } + } else if (!data->next_set || time_after(data->next, deadline)) { + data->next = deadline; data->next_set = 1; } } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83bb0900..e3e9c9771d36 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -211,7 +211,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + WRITE_ONCE(req->deadline, jiffies + req->timeout); /* * Only the non-mq case needs to add the request to a protected list. -- cgit v1.2.3-58-ga151 From fc13457f74dcf054b0d17efb7b94b46fdf17f412 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Oct 2017 11:22:24 -0600 Subject: blk-mq: document the need to have STARTED and COMPLETED share a byte For memory ordering guarantees on stores, we need to ensure that these two bits share the same byte of storage in the unsigned long. Add a comment as to why, and a BUILD_BUG_ON() to ensure that we don't violate this requirement. 
Suggested-by: Boqun Feng Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ block/blk.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a7b46b754a7c..076cbab9c3e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2923,6 +2923,12 @@ EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { + /* + * See comment in block/blk.h rq_atomic_flags enum + */ + BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != + (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk.h b/block/blk.h index fda5a4632aba..6ac43dfd68a7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,8 +122,15 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { + /* + * Keep these two bits first - not because we depend on the + * value of them, but we do depend on them being in the same + * byte of storage to ensure ordering on writes. Keeping them + * first will achieve that nicely. + */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; -- cgit v1.2.3-58-ga151 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- include/linux/blkdev.h | 5 ----- 6 files changed, 3 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..7f4a1ba532af 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), diff --git a/block/elevator.c b/block/elevator.c index 153926a90901..7ae50eb2732b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1118,7 +1118,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..75281828f2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct 
dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..8d07ad61221c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52..9fb71fc7d0e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3-58-ga151 From 4b14a5c5d57f4fd6929db3427ba4d7c3775b4680 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 14:09:20 -0400 Subject: block: remove unnecessary NULL checks in bioset_integrity_free() mempool_destroy() already checks for a NULL value being passed in, this eliminates duplicate checks. 
This was caught by running make coccicheck M=block/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current head as of this patch). Reviewed-by: Kyle Fortin Acked-by: Martin K. Petersen Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5df32907ff3b..23b42e8aa03e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); + mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); -- cgit v1.2.3-58-ga151 From 4078def82f352cf5007691635da290a109511bc5 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Fri, 6 Oct 2017 14:45:13 -0400 Subject: block/bio: Remove null checks before mempool_destroy in bioset_free This patch removes redundant checks for null values on bio_pool and bvec_pool. Found using make coccicheck M=block/ on linux-net tree on the next-20170929 tag. Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8338304ea256..bf0dbe8f78f8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1928,11 +1928,8 @@ void bioset_free(struct bio_set *bs) if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); + mempool_destroy(bs->bio_pool); + mempool_destroy(bs->bvec_pool); bioset_integrity_free(bs); bio_put_slab(bs); -- cgit v1.2.3-58-ga151 From b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 9 Oct 2017 16:27:21 +0200 Subject: block,bfq: Disable writeback throttling Similarly to CFQ, BFQ has its write-throttling heuristics, and it is better not to combine them with further write-throttling heuristics of a different nature. So this commit disables write-back throttling for a device if BFQ is used as I/O scheduler for that device. Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- block/blk-wbt.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 70f9177c4f5b..261f98695910 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -108,6 +108,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -4810,7 +4811,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - + wbt_disable_default(q); return 0; out_free: diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67b..e59d59c11ebb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) } /* - * Disable wbt, if enabled by default. Only called from CFQ. + * Disable wbt, if enabled by default. 
*/ void wbt_disable_default(struct request_queue *q) { -- cgit v1.2.3-58-ga151 From 99fead8d38e5302b1be9403d4de815ce9174a3df Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 9 Oct 2017 13:11:23 +0200 Subject: block, bfq: fix unbalanced decrements of burst size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit "block, bfq: decrease burst size when queues in burst exit" introduced the decrement of burst_size on the removal of a bfq_queue from the burst list. Unfortunately, this decrement can happen to be performed even when burst size is already equal to 0, because of unbalanced decrements. A description follows of the cause of these unbalanced decrements, namely a wrong assumption, and of the way this wrong assumption leads to unbalanced decrements. The wrong assumption is that a bfq_queue can exit only if the process associated with the bfq_queue has exited. This is false, because a bfq_queue, say Q, may exit also as a consequence of a merge with another bfq_queue. In this case, Q exits because the I/O of its associated process has been redirected to another bfq_queue. The decrement unbalance occurs because Q may then be re-created after a split, and added back to the current burst list, *without* incrementing burst_size. burst_size is not incremented because Q is not a new bfq_queue added to the burst list, but a bfq_queue only temporarily removed from the list, and, before the commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit", burst_size was not decremented when Q was removed. This commit addresses this issue by just checking whether the exiting bfq_queue is a merged bfq_queue, and, in that case, not decrementing burst_size. Unfortunately, this still leaves room for unbalanced decrements, in the following rarer case: on a split, the bfq_queue happens to be inserted into a different burst list than the one it was removed from when merged. If this happens, the number of elements in the new burst list becomes higher than burst_size (by one). When the bfq_queue then exits, it is of course not in a merged state any longer, thus burst_size is decremented, which results in an unbalanced decrement. To handle this sporadic, unlucky case in a simple way, this commit also checks that burst_size is larger than 0 before decrementing it. Finally, this commit removes a useless, extra check: the check that the bfq_queue is sync, performed before checking whether the bfq_queue is in the burst list. This extra check is redundant, because only sync bfq_queues can be inserted into the burst list. 
Fixes: 7cb04004fa37 ("block, bfq: decrease burst size when queues in burst exit") Reported-by: Philip Müller Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Philip Müller Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 261f98695910..889a8549d97f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3726,9 +3726,36 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } kmem_cache_free(bfq_pool, bfqq); @@ -4460,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } -- cgit v1.2.3-58-ga151 From 58a9edce0aa912640abe47d3fc039e6230ef848b Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 10 Oct 2017 22:53:46 +0800 Subject: blkcg: check pol->cpd_free_fn before free cpd check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd. 
Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3f56baee936..e7ec676043b1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,7 +1452,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); @@ -1492,7 +1492,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); -- cgit v1.2.3-58-ga151 From 53cfdc10a95d03fbc82970d682a32696d19ef886 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 10 Oct 2017 11:13:32 +0800 Subject: blk-throttle: fix null pointer dereference while throttling writeback IOs A null pointer dereference can occur when blkcg is removed manually with writeback IOs inflight. This is caused by the following case: The writeback kworker submits the bio and sets bio->bi_cg_private to tg in blk_throtl_assoc_bio. Then we remove the block cgroup manually; the blkg and tg would be freed if there is no request inflight. When the submitted bio comes back, blk_throtl_bio_endio() fetches the tg, which was already freed. Fix this by increasing the refcount of blkg in function blk_throtl_assoc_bio() so that the blkg will not be freed until bio_endio is called. Reviewed-by: Shaohua Li Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/blk-throttle.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3f..fe49c465ec86 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2112,8 +2112,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) + if (bio->bi_css) { + if (bio->bi_cg_private) + blkg_put(tg_to_blkg(bio->bi_cg_private)); bio->bi_cg_private = tg; + blkg_get(tg_to_blkg(tg)); + } blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif } @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio) start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; finish_time = __blk_stat_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) + if (!start_time || finish_time <= start_time) { + blkg_put(tg_to_blkg(tg)); return; + } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } + + blkg_put(tg_to_blkg(tg)); } #endif -- cgit v1.2.3-58-ga151 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses the per-cpu stat except to sum it into the global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. 
Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 45 +++++++-------------------------------------- include/linux/blk_types.h | 5 ++--- 2 files changed, 9 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe3..3a2f3c96f367 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef..3385c89f402e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3-58-ga151 From 85acb3ba2f925a0ec6928c1967c3adefa00682f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:56:00 -0700 Subject: block: set request_list for request Legacy queue sets request's request_list, mq doesn't. This makes mq does the same thing, so we can find cgroup of a request. Note, we really only use blkg field of request_list, it's pointless to allocate mempool for request_list in mq case. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 14f7674fa0b1..e8e149ccc86b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data) int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { - if (unlikely(rl->rq_pool)) + if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 076cbab9c3e0..40cba1b1978f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq) wbt_done(q->rq_wb, &rq->issue_stat); + if (blk_rq_rl(rq)) + blk_put_rl(blk_rq_rl(rq)); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -1532,6 +1535,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); + blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); + blk_account_io_start(rq, true); } -- cgit v1.2.3-58-ga151 From 519c8e9ffd86143fedd84cf833a09f36b47d0f5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 11:01:00 -0700 Subject: block: fix Sphinx kernel-doc warning Sphinx treats symbols that end with '_' as a kind of special documentation indicator, so fix that by adding an ending '*' to it. ../block/bio.c:404: ERROR: Unknown target name: "gfp". Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index bf0dbe8f78f8..ae9ad34e6a71 100644 --- a/block/bio.c +++ b/block/bio.c @@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator + * @gfp_mask: the GFP_* mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * -- cgit v1.2.3-58-ga151 From 8cf466602028196b939255f1eb4e9817efd1db6d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 11 Oct 2017 10:39:15 -0700 Subject: kyber: fix hang on domain token wait queue When we're getting a domain token, if we fail to get a token on our first attempt, we put the current hardware queue on a wait queue and then try again just in case a token was freed after our initial attempt but before we got on the wait queue. If this second attempt succeeds, we currently leave the hardware queue on the wait queue. Usually this is okay; we'll just run the hardware queue one extra time when another token is freed. However, if the hardware queue doesn't have any other requests waiting, then when it gets the extra wakeup, it won't have anything to free and therefore won't wake up any other hardware queues. If tokens are limited, then we won't make forward progress and the device will hang. 
Reported-by: Bin Zha Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab82105b..db5bfc6342d3 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, /* * Try again in case a token was freed before we got on the wait - * queue. + * queue. The waker may have already removed the entry from the + * wait queue, but list_del_init() is okay with that. */ nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) { + unsigned long flags; + + spin_lock_irqsave(&ws->wait.lock, flags); + list_del_init(&wait->entry); + spin_unlock_irqrestore(&ws->wait.lock, flags); + } } return nr; } -- cgit v1.2.3-58-ga151 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 16 +++++++++++----- include/linux/blk-mq.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c7..bce1c76fc768 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,6 +325,13 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); + +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) +{ + return blk_mq_tagset_iter(set, set->driver_data, reinit_request); +} EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..6bc29f73a9aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3-58-ga151 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove 
blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 7 ------- include/linux/blk-mq.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bce1c76fc768..c81b40ecd3f1 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -327,13 +327,6 @@ out: } EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) -{ - return blk_mq_tagset_iter(set, set->driver_data, reinit_request); -} -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa..cfd64e500d82 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3-58-ga151 From 351499a172c0c5fc52d65ee2c62b344f369ea02a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 24 Oct 2017 18:44:57 -0600 Subject: block: Invalidate cache on discard v2 It is reasonable drop page cache on discard, otherwise that pages may be written by writeback second later, so thin provision devices will not be happy. This seems to be a security leak in case of secure discard case. Also add check for queue_discard flag on early stage. Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee67eed..c0fc32bd8ed1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping = bdev->bd_inode->i_mapping; + if (!(mode & FMODE_WRITE)) return -EBADF; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); + truncate_inode_pages_range(mapping, start, start + len); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); } static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3-58-ga151 From bb749b31c25e9b11f8f974baac8d507298ffbb70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 18 Oct 2017 14:38:38 +0200 Subject: block: move CAP_SYS_ADMIN check in blkdev_roset() Check for CAP_SYS_ADMIN before calling into the driver, similar to blkdev_flushbuf(). 
This is safer and can spare a check in the driver. (Currently BLKROSET is overridden by md and rbd, rbd is missing the check. md has the check, but it covers a lot more than BLKROSET.) Acked-by: Al Viro Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index c0fc32bd8ed1..1668506d8ed8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -443,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, { int ret, n; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); if (!is_unrecognized_ioctl(ret)) return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; set_device_ro(bdev, n); -- cgit v1.2.3-58-ga151 From 425a4dba7953e35ffd096771973add6d2f40d2ed Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:09 +0200 Subject: block: factor out __blkdev_issue_zero_pages() blkdev_issue_zeroout() will use this in !BLKDEV_ZERO_NOFALLBACK case. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 63 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 62240f8832ca..9d2ab8bba52a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -274,6 +274,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_PAGES); } +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + while (nr_sects != 0) { + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -304,9 +338,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - int bi_size = 0; - struct bio *bio = *biop; - unsigned int sz; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -316,30 +347,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - goto out; - - ret = 0; - while (nr_sects != 0) { - bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) - break; - } - cond_resched(); - } + return ret; - *biop = bio; -out: - return ret; + return 
__blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); -- cgit v1.2.3-58-ga151 From d5ce4c31d6df518dd8f63bbae20d7423c5018a6c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:10 +0200 Subject: block: cope with WRITE ZEROES failing in blkdev_issue_zeroout() sd_config_write_same() ignores ->max_ws_blocks == 0 and resets it to permit trying WRITE SAME on older SCSI devices, unless ->no_write_same is set. Because REQ_OP_WRITE_ZEROES is implemented in terms of WRITE SAME, blkdev_issue_zeroout() may fail with -EREMOTEIO: $ fallocate -zn -l 1k /dev/sdg fallocate: fallocate failed: Remote I/O error $ fallocate -zn -l 1k /dev/sdg # OK $ fallocate -zn -l 1k /dev/sdg # OK The following calls succeed because sd_done() sets ->no_write_same in response to a sense that would become BLK_STS_TARGET/-EREMOTEIO, causing __blkdev_issue_zeroout() to fall back to generating ZERO_PAGE bios. This means blkdev_issue_zeroout() must cope with WRITE ZEROES failing and fall back to manually zeroing, unless BLKDEV_ZERO_NOFALLBACK is specified. For BLKDEV_ZERO_NOFALLBACK case, return -EOPNOTSUPP if sd_done() has just set ->no_write_same thus indicating lack of offload support. Fixes: c20cfc27a473 ("block: stop using blkdev_issue_write_same for zeroing") Cc: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 9d2ab8bba52a..f625fda5f095 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -321,12 +321,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * - * Note that this function may fail with -EOPNOTSUPP if the driver signals - * zeroing offload support, but the device fails to process the command (for - * some devices there is no non-destructive way to verify whether this - * operation is actually supported). In this case the caller should call - * retry the call to blkdev_issue_zeroout() and the fallback path will be used. - * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. 
* @@ -370,18 +364,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret; - struct bio *bio = NULL; + int ret = 0; + sector_t bs_mask; + struct bio *bio; struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; blk_start_plug(&plug); - ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, flags); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). + */ + ret = -EOPNOTSUPP; + } + } return ret; } -- cgit v1.2.3-58-ga151 From 2527d99789e248576ac8081530cd4fd88730f8c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:33:42 -0600 Subject: elevator: lookup mq vs non-mq elevators If an IO scheduler is selected via elevator= and it doesn't match the driver in question wrt blk-mq support, then we fail to boot. The elevator= parameter is deprecated and only supported for non-mq devices. Augment the elevator lookup API so that we pass in if we're looking for an mq capable scheduler or not, so that we only ever return a valid type for the queue in question. 
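Stated as a predicate, purely as an illustrative restatement and not a helper this patch adds, the rule is:

/* A scheduler may only be attached to a queue whose dispatch model
 * matches its own: blk-mq schedulers to blk-mq queues, legacy
 * schedulers to legacy queues. */
static bool elv_ok_for_queue(const struct elevator_type *e,
			     const struct request_queue *q)
{
	return e->uses_mq == (q->mq_ops != NULL);
}

Together with the alias support in the following patches, a write of "deadline" to /sys/block/<dev>/queue/scheduler can then resolve to whichever variant matches the queue.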
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196695 Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 7ae50eb2732b..77856bf29568 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,12 +83,15 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static struct elevator_type *elevator_find(const char *name) +/* + * Return scheduler with name 'name' and with matching 'mq capability + */ +static struct elevator_type *elevator_find(const char *name, bool mq) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name)) + if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) return e; } @@ -100,25 +103,25 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name, bool try_loading) +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); } if (e && !try_module_get(e->elevator_owner)) e = NULL; spin_unlock(&elv_list_lock); - return e; } @@ -144,8 +147,12 @@ void __init load_default_elevator_module(void) if (!chosen_elevator[0]) return; + /* + * Boot parameter is deprecated, we haven't supported that for MQ. + * Only look for non-mq schedulers from here. + */ spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator); + e = elevator_find(chosen_elevator, false); spin_unlock(&elv_list_lock); if (!e) @@ -202,7 +209,7 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name, true); + e = elevator_get(q, name, true); if (!e) return -EINVAL; } @@ -214,7 +221,7 @@ int elevator_init(struct request_queue *q, char *name) * allowed from async. */ if (!e && !q->mq_ops && *chosen_elevator) { - e = elevator_get(chosen_elevator, false); + e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); @@ -229,17 +236,17 @@ int elevator_init(struct request_queue *q, char *name) */ if (q->mq_ops) { if (q->nr_hw_queues == 1) - e = elevator_get("mq-deadline", false); + e = elevator_get(q, "mq-deadline", false); if (!e) return 0; } else - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. 
" \ "Using noop.\n"); - e = elevator_get("noop", false); + e = elevator_get(q, "noop", false); } } @@ -905,7 +912,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, e->uses_mq)) { spin_unlock(&elv_list_lock); if (e->icq_cache) kmem_cache_destroy(e->icq_cache); @@ -1066,7 +1073,7 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name), true); + e = elevator_get(q, strstrip(elevator_name), true); if (!e) return -EINVAL; @@ -1076,15 +1083,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return 0; } - if (!e->uses_mq && q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - if (e->uses_mq && !q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - return elevator_switch(q, e); } -- cgit v1.2.3-58-ga151 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 23 +++++++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 77856bf29568..7bda083d5968 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,6 +83,16 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + /* * Return scheduler with name 'name' and with matching 'mq capability */ @@ -91,7 +101,7 @@ static struct elevator_type *elevator_find(const char *name, bool mq) struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -922,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1077,8 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } @@ -1114,6 +1123,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; if (!queue_is_rq_based(q)) @@ -1126,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) 
spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e..6df8b14f1f6a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3-58-ga151 From 4d740bc9f0319229410d11e445017f47e425dbe0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 09:47:20 -0600 Subject: mq-deadline: add 'deadline' as a name alias The scheduler framework now supports looking up the appropriate scheduler with the {name,mq} tupple. We can register mq-deadline with the alias of 'deadline', so that switching to 'deadline' will do the right thing based on the type of driver attached to it. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/mq-deadline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad4331edd..0179e484ec98 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", + .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); -- cgit v1.2.3-58-ga151 From 4e9b6f20828ac880dbc1fa2fdbafae779473d1af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Oct 2017 10:00:48 -0700 Subject: block: Fix a race between blk_cleanup_queue() and timeout handling Make sure that if the timeout timer fires after a queue has been marked "dying" that the affected requests are finished. 
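The ordering the fix depends on, assuming the two hunks below are applied (the function name here is illustrative; blk_sync_queue() is the real entry point):

static void sketch_sync_timeouts(struct request_queue *q)
{
	del_timer_sync(&q->timeout);		/* stop the timer, so no new timeout work is queued */
	cancel_work_sync(&q->timeout_work);	/* and wait out any work it already queued */
}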
Reported-by: chenxiang (M) Fixes: commit 287922eb0b18 ("block: defer timeouts to a workqueue") Signed-off-by: Bart Van Assche Tested-by: chenxiang (M) Cc: Christoph Hellwig Cc: Keith Busch Cc: Hannes Reinecke Cc: Ming Lei Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-timeout.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e8e149ccc86b..bb4fce694a60 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,6 +333,7 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; @@ -844,6 +845,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index e3e9c9771d36..764ecf9aeb30 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) struct request *rq, *tmp; int next_set = 0; - if (blk_queue_enter(q, true)) - return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); - blk_queue_exit(q); } /** -- cgit v1.2.3-58-ga151 From 5e3d02bbafad38975099b5848f5ebadedcf7bb7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:25 +0800 Subject: blk-mq-sched: dispatch from scheduler IFF progress is made in ->dispatch When the hw queue is busy, we shouldn't take requests from the scheduler queue any more, otherwise it is difficult to do IO merge. This patch fixes the awful IO performance on some SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber is used by not taking requests if hw queue is busy. 
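Reduced to its core, with dispatch_from_elevator() as a made-up stand-in for the scheduler drain loop, the new policy reads roughly as:

static void sketch_dispatch_policy(struct blk_mq_hw_ctx *hctx,
				   struct request_queue *q,
				   struct list_head *rq_list,
				   bool has_sched_dispatch)
{
	/* Flush what is already on ->dispatch first; only if the driver
	 * accepted all of it do we pull more out of the elevator, so
	 * requests stay in the scheduler where they can still be merged. */
	bool do_sched_dispatch = blk_mq_dispatch_rq_list(q, rq_list);

	if (do_sched_dispatch && has_sched_dispatch)
		dispatch_from_elevator(hctx);
}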
Reviewed-by: Omar Sandoval Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c..eca011fdfa0e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; + bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, &rq_list); + do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* - * We want to dispatch from the scheduler if we had no work left - * on the dispatch list, OR if we did have work but weren't able - * to make progress. + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ - if (!did_work && has_sched_dispatch) { + if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; -- cgit v1.2.3-58-ga151 From caf8eb0d604a0eaeb8111eb4d36853a6d08eebe7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:26 +0800 Subject: blk-mq-sched: move actual dispatching into one helper So that it becomes easy to support to dispatch from sw queue in the following patch. No functional change. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Suggested-by: Christoph Hellwig # for simplifying dispatch logic Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eca011fdfa0e..be29ba849408 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + LIST_HEAD(rq_list); + + do { + struct request *rq = e->type->ops.mq.dispatch_request(hctx); + + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(q, &rq_list)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. 
+ * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); - } else if (!has_sched_dispatch) { + if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) + blk_mq_do_dispatch_sched(hctx); + } else if (has_sched_dispatch) { + blk_mq_do_dispatch_sched(hctx); + } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } - - /* - * We want to dispatch from the scheduler if there was nothing - * on the dispatch list or we were able to dispatch from the - * dispatch list. - */ - if (do_sched_dispatch && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); - } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.3-58-ga151 From 63ba8e31c3ac6393b07c6e18538814a730478766 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:28 +0800 Subject: block: kyber: check if there are requests in ctx in kyber_has_work() There may be request in sw queue, and not fetched to domain queue yet, so check it in kyber_has_work(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index db5bfc6342d3..b4df317c2916 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -649,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) if (!list_empty_careful(&khd->rqs[i])) return true; } - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } #define KYBER_LAT_SHOW_STORE(op) \ -- cgit v1.2.3-58-ga151 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. 
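A driver-side illustration of the new hooks; this patch only adds the infrastructure, and struct mydev, its fields and mydev_queue_rq() are made-up names. A per-queue depth limit could be expressed roughly as:

struct mydev {
	atomic_t	inflight;
	int		queue_depth;
	/* ... */
};

static blk_status_t mydev_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct mydev *dev = hctx->queue->queuedata;

	/* Refuse the budget when the per-device depth is exhausted; the
	 * request then stays in the I/O scheduler and can still merge. */
	if (atomic_inc_return(&dev->inflight) > dev->queue_depth) {
		atomic_dec(&dev->inflight);
		return BLK_STS_RESOURCE;
	}
	return BLK_STS_OK;
}

static void mydev_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct mydev *dev = hctx->queue->queuedata;

	atomic_dec(&dev->inflight);
}

static const struct blk_mq_ops mydev_mq_ops = {
	.queue_rq	= mydev_queue_rq,
	.get_budget	= mydev_get_budget,
	.put_budget	= mydev_put_budget,
};

Once ->queue_rq() has run, releasing the budget on completion is the driver's responsibility, matching the comment added to blk_mq_ops below.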
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++----- block/blk-mq.h | 20 +++++++++++++++++- include/linux/blk-mq.h | 11 ++++++++++ 5 files changed, 114 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index be29ba849408..8e525e66a0d9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); do { - struct request *rq = e->type->ops.mq.dispatch_request(hctx); + struct request *rq; + blk_status_t ret; - if (!rq) + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return false; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +/* return true if hw queue need to be run again */ +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); + bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; + return false; hctx->run++; @@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false) && + has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_queue = blk_mq_do_dispatch_sched(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); + blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_queue && !blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { + blk_mq_sched_mark_restart_hctx(hctx); + return true; + } + + return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c197..2434061cc5b7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +bool blk_mq_sched_dispatch_requests(struct 
blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40cba1b1978f..dcb467369999 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) return true; } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq; @@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_dispatch_wait_add(hctx)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); break; + } /* * It's possible that a tag was freed in the window * between the allocation failure and adding the * hardware queue to the wait queue. */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + break; + } + } + + if (!got_budget) { + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) break; + if (ret != BLK_STS_OK) + goto fail_rq; } list_del_init(&rq->queuelist); @@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) break; } + fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } + + if (run_queue) + blk_mq_run_hw_queue(hctx, true); } /* @@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) { + blk_mq_put_driver_tag(rq); + goto insert; + } else if (ret != BLK_STS_OK) + goto fail_rq; + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: + fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; @@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth 
to %u\n", BLK_MQ_MAX_DEPTH); diff --git a/block/blk-mq.h b/block/blk-mq.h index ef15b3414da5..e413b732374e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, @@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline blk_status_t blk_mq_get_dispatch_budget( + struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return BLK_STS_OK; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..901457df3d64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3-58-ga151 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. 
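To make the effect concrete: with a .cmd_per_lun of 3 and, say, a few dozen requests sitting in the sw queues, the old path flushed them all into hctx->dispatch, where everything the LUN cannot yet take is no longer mergeable; with this patch at most a budget's worth is dequeued per pass and the rest stay put. The per-request loop, shown in full in the hunk below and condensed here for illustration (error handling omitted), has this shape:

	do {
		if (blk_mq_get_dispatch_budget(hctx) == BLK_STS_RESOURCE)
			break;				/* device is full: stop dequeueing */
		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);	/* nothing to send after all */
			break;
		}
		list_add(&rq->queuelist, &rq_list);
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);	/* round-robin across sw queues */
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));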
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-mq.c | 39 ++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 ++ 4 files changed, 114 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 8e525e66a0d9..df8581bb0a37 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) return false; } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + blk_status_t ret; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + + return false; +} + /* return true if hw queue need to be run again */ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { @@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false) && - has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); + else + run_queue = blk_mq_do_dispatch_ctx(hctx); + } } else if (has_sched_dispatch) { run_queue = blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. 
+ */ + run_queue = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb467369999..097ca3ece716 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index e413b732374e..522b420dedc0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64..e5e6becd57d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3-58-ga151 From 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:29 +0800 Subject: blk-mq: don't handle TAG_SHARED in restart Now restart is used in the following cases, and TAG_SHARED is for SCSI only. 1) .get_budget() returns BLK_STS_RESOURCE - if resource in target/host level isn't satisfied, this SCSI device will be added in shost->starved_list, and the whole queue will be rerun (via SCSI's built-in RESTART) in scsi_end_request() after any request initiated from this host/targe is completed. Forget to mention, host level resource can't be an issue for blk-mq at all. - the same is true if resource in the queue level isn't satisfied. - if there isn't outstanding request on this queue, then SCSI's RESTART can't work(blk-mq's can't work too), and the queue will be run after SCSI_QUEUE_DELAY, and finally all starved sdevs will be handled by SCSI's RESTART when this request is finished 2) scsi_dispatch_cmd() returns BLK_STS_RESOURCE - if there isn't onprogressing request on this queue, the queue will be run after SCSI_QUEUE_DELAY - otherwise, SCSI's RESTART covers the rerun. 
3) blk_mq_get_driver_tag() failed - BLK_MQ_S_TAG_WAITING covers the cross-queue RESTART for driver allocation. In one word, SCSI's built-in RESTART is enough to cover the queue rerun, and we don't need to pay special attention to TAG_SHARED wrt. restart. In my test on scsi_debug(8 luns), this patch improves IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Aslo Roman Pen reported the current RESTART is very expensive especialy when there are lots of LUNs attached in one host, such as in his test, RESTART causes half of IOPS be cut. Fixes: https://marc.info/?l=linux-kernel&m=150832216727524&w=2 Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++------------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index df8581bb0a37..daab27feb653 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,25 +68,17 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; + return; - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return true; + return; } - - return false; } /* return true if hctx need to run again */ @@ -385,68 +377,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. 
- */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. Requeue will re-invoke us from a context -- cgit v1.2.3-58-ga151 From 1f460b63d4b37f504d8d0affc2cd492eb005ea97 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:30 +0800 Subject: blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE SCSI restarts its queue in scsi_end_request() automatically, so we don't need to handle this case in blk-mq. Especailly any request won't be dequeued in this case, we needn't to worry about IO hang caused by restart vs. dispatch. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 45 ++++++++++++++++++++------------------------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 8 ++------ 3 files changed, 23 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index daab27feb653..7775f6b12fa9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,8 +81,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) } } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; @@ -98,7 +102,7 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { @@ -116,8 +120,6 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); - - return false; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -131,8 +133,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, return hctx->ctxs[idx]; } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. 
+ */ +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); @@ -147,7 +153,7 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { @@ -171,22 +177,19 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); - - return false; } /* return true if hw queue need to be run again */ -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); - bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return false; + return; hctx->run++; @@ -218,12 +221,12 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); else - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); } else if (q->mq_ops->get_budget) { /* * If we need to get budget before queuing request, we @@ -233,19 +236,11 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * TODO: get more budgets, and dequeue more requests in * one time. 
*/ - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } - - if (run_queue && !blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { - blk_mq_sched_mark_restart_hctx(hctx); - return true; - } - - return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 2434061cc5b7..9267d0b7c197 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 097ca3ece716..e4d2490f4e7e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1226,7 +1226,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1243,18 +1242,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } - - if (run_queue) - blk_mq_run_hw_queue(hctx, true); } /* -- cgit v1.2.3-58-ga151 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 35 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index bb4fce694a60..96aaa3a419a3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2243,6 +2243,40 @@ out: } EXPORT_SYMBOL(generic_make_request); +/** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This function behaves like generic_make_request(), but does not protect + * against recursion. Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). 
+ */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9..eda3d25c0f68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3-58-ga151 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helpers allows to bounce steal the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 96aaa3a419a3..68cfe6780a9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2769,6 +2769,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f68..fddda6a1f9b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions. 
* -- cgit v1.2.3-58-ga151 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 4 ---- include/linux/genhd.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb0..1174d24e405e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -649,10 +649,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) return; } disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..5c0ed5db33c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3-58-ga151 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 68 +++++++++++++++++++++++++++++++++++++-------------- include/linux/genhd.h | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 1174d24e405e..835e907e6e01 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense. 
*/ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,18 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -669,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -701,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -712,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + blk_unregister_region(disk_devt(disk), disk->minors); + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + } part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -781,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1024,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 
1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1061,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1085,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2..93aae3476f58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3-58-ga151 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++++++ block/blk-mq.c | 14 +++++--------- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 +++- mm/page_io.c | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 68cfe6780a9b..395bfb10d658 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index e4d2490f4e7e..95ea5889b825 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -2499,6 +2500,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... 
@@ -2961,20 +2964,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2992,7 +2989,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc4532..db632818777d 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/block_dev.c b/fs/block_dev.c index 07ddccd17801..4afa4d5ff969 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -402,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..d2bc339cb1e9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -486,7 +486,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index 8194d30bdca0..4241bac905b1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1049,7 +1049,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b5..225617dd0a3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; make_request_fn 
*make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index 21502d341a67..ff04de630c46 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -407,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3-58-ga151 From c2e82a234873d905260a3bd0aca9577e85619bda Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 22 Sep 2017 23:36:28 +0800 Subject: blk-mq: fix nr_requests wrong value when modify it from sysfs if blk-mq use "none" io scheduler, nr_request get a wrong value when input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get the smaller one min(nr, set->queue_depth), and then q->nr_request get a wrong value. Reproduce: echo none > /sys/block/nvme0n1/queue/scheduler echo 1000000 > /sys/block/nvme0n1/queue/nr_requests cat /sys/block/nvme0n1/queue/nr_requests 1000000 Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ea5889b825..9eea67ce82d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2743,8 +2743,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * queue depth. This is similar to what the old code would do. */ if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, - min(nr, set->queue_depth), + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, -- cgit v1.2.3-58-ga151 From 21e768b442bb587123ea924620e74d6e5655d717 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Oct 2017 16:32:26 -0700 Subject: blk-mq: Make blk_mq_get_request() error path less confusing blk_mq_get_tag() can modify data->ctx. This means that in the error path of blk_mq_get_request() data->ctx should be passed to blk_mq_put_ctx() instead of local_ctx. Note: since blk_mq_put_ctx() ignores its argument, this patch does not change any functionality. 
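A point worth spelling out: blk_mq_get_tag() can retarget data->ctx (for instance after sleeping for a tag and waking up on another CPU), so any reference the caller took has to be dropped through data->ctx rather than through a stale local copy. A minimal sketch of the idiom, with hypothetical ctx_get()/ctx_put()/try_get_tag() helpers standing in for the real blk-mq internals:

        bool put_ctx_on_error = false;

        if (!data->ctx) {
                data->ctx = ctx_get();          /* reference taken here */
                put_ctx_on_error = true;
        }

        if (try_get_tag(data) < 0) {            /* may have replaced data->ctx */
                if (put_ctx_on_error) {
                        ctx_put(data->ctx);     /* release what is held now */
                        data->ctx = NULL;
                }
                return NULL;
        }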
References: commit 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eea67ce82d9..13cdccef543c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -337,12 +337,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - struct blk_mq_ctx *local_ctx = NULL; + bool put_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; - if (likely(!data->ctx)) - data->ctx = local_ctx = blk_mq_get_ctx(q); + if (likely(!data->ctx)) { + data->ctx = blk_mq_get_ctx(q); + put_ctx_on_error = true; + } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -361,8 +363,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (local_ctx) { - blk_mq_put_ctx(local_ctx); + if (put_ctx_on_error) { + blk_mq_put_ctx(data->ctx); data->ctx = NULL; } blk_queue_exit(q); -- cgit v1.2.3-58-ga151 From e4f36b249b4d4e7599f1cf0c8fb50f196e52677e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Oct 2017 16:45:23 +0200 Subject: block: fix peeking requests during PM We need to look for an active PM request until the next softbarrier instead of looking for the first non-PM request. Otherwise any cause of request reordering might starve the PM request(s). Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 395bfb10d658..223f32826e62 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2511,20 +2511,22 @@ void blk_account_io_done(struct request *req) * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ -static struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) - return NULL; - else - return rq; + switch (rq->q->rpm_status) { + case RPM_RESUMING: + case RPM_SUSPENDING: + return rq->rq_flags & RQF_PM; + case RPM_SUSPENDED: + return false; + } + + return true; } #else -static inline struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - return rq; + return true; } #endif @@ -2572,9 +2574,12 @@ static struct request *elv_next_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; + list_for_each_entry(rq, &q->queue_head, queuelist) { + if (blk_pm_allow_request(rq)) + return rq; + + if (rq->rq_flags & RQF_SOFTBARRIER) + break; } /* @@ -2625,10 +2630,6 @@ struct request *blk_peek_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while ((rq = elv_next_request(q)) != NULL) { - rq = blk_pm_peek_request(q, rq); - if (!rq) - break; - if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the 
device driver -- cgit v1.2.3-58-ga151 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++------------ block/blk-mq.c | 17 ++++------------- block/blk-mq.h | 5 ++--- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-mq.h | 2 +- 5 files changed, 12 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7775f6b12fa9..13a27d4d1671 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,23 +94,18 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (e->type->ops.mq.has_work && !e->type->ops.mq.has_work(hctx)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* @@ -146,22 +141,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 13cdccef543c..c9fa4b294664 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1137,13 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget) { - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) - break; - if (ret != BLK_STS_OK) - goto fail_rq; - } + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; list_del_init(&rq->queuelist); @@ -1170,7 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, break; } - fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1642,12 +1636,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) { + if (!blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); goto insert; - } else if (ret != BLK_STS_OK) - goto fail_rq; + } new_cookie = request_to_qc_t(hctx, rq); @@ -1665,7 +1657,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: - fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; diff --git a/block/blk-mq.h b/block/blk-mq.h index 522b420dedc0..f97aceff76e9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -147,14 
+147,13 @@ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) q->mq_ops->put_budget(hctx); } -static inline blk_status_t blk_mq_get_dispatch_budget( - struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return BLK_STS_OK; + return true; } #endif diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207..286ea983c9e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 +1955,22 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd..674641527da7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3-58-ga151 From e84010732225c4c7c3464ee1169d395751c3adfa Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 17 Oct 2017 23:56:21 +0800 Subject: blkcg: add sanity check for blkcg policy operations blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs, otherwise policy would register fail. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7ec676043b1..4117524ca45b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; -- cgit v1.2.3-58-ga151 From 6d6f167ce74158903e7fc20dfbecf89c71aa1c00 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 2 Nov 2017 23:24:32 +0800 Subject: blk-mq: put the driver tag of nxt rq before first one is requeued When freeing the driver tag of the next rq with an I/O scheduler configured, we get the first entry of the list. However, this can race with requeue of a request, and we end up getting the wrong request from the head of the list. Free the driver tag of next rq before the failed one is requeued in the failure branch of queue_rq callback. 
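Condensed from the hunks below, the fix is purely a matter of ordering inside the BLK_STS_RESOURCE branch: the driver tag borrowed for the lookahead request must be released while that request is still unambiguously the first entry of the list, before the failed request is put back at its head:

        ret = q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_STS_RESOURCE) {
                /* nxt is still the first entry at this point ... */
                if (!list_empty(list)) {
                        nxt = list_first_entry(list, struct request, queuelist);
                        blk_mq_put_driver_tag(nxt);
                }
                /* ... only now is the failed request re-added at the head */
                list_add(&rq->queuelist, list);
                __blk_mq_requeue_request(rq);
                break;
        }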
Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9fa4b294664..6eacc1dea8b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1094,7 +1094,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq, *nxt; int errors, queued; if (list_empty(list)) @@ -1151,14 +1151,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (list_empty(list)) bd.last = true; else { - struct request *nxt; - nxt = list_first_entry(list, struct request, queuelist); bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); } ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE) { + /* + * If an I/O scheduler has been configured and we got a + * driver tag for the next request already, free it again. + */ + if (!list_empty(list)) { + nxt = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(nxt); + } blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); @@ -1181,13 +1187,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { - /* - * If an I/O scheduler has been configured and we got a driver - * tag for the next request already, free it again. - */ - rq = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(rq); - spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.2.3-58-ga151 From 9c71c83c857e7a84a5be5a56ea88da7098f51db8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:33 +0800 Subject: blk-flush: don't run queue for requests bypassing flush blk_insert_flush() should only insert request since run queue always follows it. In case of bypassing flush, we don't need to run queue because every blk_insert_flush() follows one run queue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec8cfef..81bd1a843043 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false, false); + blk_mq_sched_insert_request(rq, false, false, false, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3-58-ga151 From b0850297c749ea79a5717d597931366b3d7f4b09 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:34 +0800 Subject: block: pass 'run_queue' to blk_mq_request_bypass_insert Block flush need this function without running the queue, so add a parameter controlling whether we run it or not. 
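A condensed view of how the new flag ends up being used in this series (the full hunks are below and in the following patch):

        /* request stacking (blk_insert_cloned_request): bypass the
         * scheduler and kick the hardware queue right away */
        blk_mq_request_bypass_insert(rq, true);

        /* flush machinery (blk_insert_flush path): the caller already
         * runs the queue, so do not run it again here */
        blk_mq_request_bypass_insert(rq, false);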
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++-- block/blk-mq.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 223f32826e62..b8d1aa2d1008 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2397,7 +2397,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq); + blk_mq_request_bypass_insert(rq, true); return BLK_STS_OK; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6eacc1dea8b7..021562bd5d2c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1492,7 +1492,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq) +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); @@ -1501,7 +1501,8 @@ void blk_mq_request_bypass_insert(struct request *rq) list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - blk_mq_run_hw_queue(hctx, false); + if (run_queue) + blk_mq_run_hw_queue(hctx, false); } void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, diff --git a/block/blk-mq.h b/block/blk-mq.h index f97aceff76e9..9fffec0ad8e9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -56,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq); +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -- cgit v1.2.3-58-ga151 From 598906f814280762157629ba8833bf5cb11def74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:35 +0800 Subject: blk-flush: use blk_mq_request_bypass_insert() In the following patch, we will use RQF_FLUSH_SEQ to decide: 1) if the flag isn't set, the flush rq need to be inserted via blk_insert_flush() 2) otherwise, the flush rq need to be dispatched directly since it is in flush machinery now. So we use blk_mq_request_bypass_insert() for requests of bypassing flush machinery, just like the legacy path did. 
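The resulting insert-time decision can be sketched as follows; the real check lands in blk_mq_sched_insert_request() in the a6a252e6 patch further down:

        if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
                /* not yet in the flush machinery: go through blk_insert_flush() */
                blk_insert_flush(rq);
        } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
                /* already sequenced by the flush machinery: add it directly
                 * to the hctx ->dispatch list */
        }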
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 81bd1a843043..a9773d2075ac 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, false, false, false); + blk_mq_request_bypass_insert(rq, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3-58-ga151 From a6a252e6491443c1c18eab7e254daee63d4a7a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:36 +0800 Subject: blk-mq-sched: decide how to handle flush rq via RQF_FLUSH_SEQ In case of IO scheduler we always pre-allocate one driver tag before calling blk_insert_flush(), and flush request will be marked as RQF_FLUSH_SEQ once it is in flush machinery. So if RQF_FLUSH_SEQ isn't set, we call blk_insert_flush() to handle the request, otherwise the flush request is dispatched to ->dispatch list directly. This is a preparation patch for not preallocating a driver tag for flush requests, and for not treating flush requests as a special case. This is similar to what the legacy path does. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 13a27d4d1671..e7094f44afaf 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -345,21 +345,23 @@ void blk_mq_sched_request_inserted(struct request *rq) EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, struct request *rq) { - if (rq->tag == -1) { + /* dispatch flush rq directly */ + if (rq->rq_flags & RQF_FLUSH_SEQ) { + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; + } + + if (has_sched) { rq->rq_flags |= RQF_SORTED; - return false; + WARN_ON(rq->tag != -1); } - /* - * If we already have a real request tag, send directly to - * the dispatch list. 
- */ - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); - return true; + return false; } /* @@ -385,12 +387,13 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + /* flush rq in flush machinery need to be dispatched directly */ + if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { blk_mq_sched_insert_flush(hctx, rq, can_block); return; } - if (e && blk_mq_sched_bypass_insert(hctx, rq)) + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; if (e && e->type->ops.mq.insert_requests) { @@ -428,7 +431,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, list_for_each_entry_safe(rq, next, list, queuelist) { if (WARN_ON_ONCE(rq->tag != -1)) { list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, rq); + blk_mq_sched_bypass_insert(hctx, true, rq); } } } -- cgit v1.2.3-58-ga151 From 244c65a3ccaa06fd15cc940315606674d3108b2f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 12:39:57 -0600 Subject: blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h We need this helper to put the driver tag for flush rq, since we will not share tag in the flush request sequence in the following patch in case that I/O scheduler is applied. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 -------------------------------- block/blk-mq.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 021562bd5d2c..14f6886fbec8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -996,38 +996,6 @@ done: return rq->tag != -1; } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = -1; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - -static void blk_mq_put_driver_tag(struct request *rq) -{ - struct blk_mq_hw_ctx *hctx; - - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); -} - /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a diff --git a/block/blk-mq.h b/block/blk-mq.h index 9fffec0ad8e9..2502f40ccdc0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,6 +2,7 @@ #define INT_BLK_MQ_H #include "blk-stat.h" +#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -156,4 +157,36 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) return true; } +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static inline void 
blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + #endif -- cgit v1.2.3-58-ga151 From 923218f6166a84688973acdc39094f3bee1e9ad4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:38 +0800 Subject: blk-mq: don't allocate driver tag upfront for flush rq The idea behind it is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and that causes an IO hang. And get/put driver tag is actually noop for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for flush rq. It works just fine to follow the same approach as normal requests: allocate driver tag for each rq just before calling ->queue_rq(). One driver visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in legacy path. Then flush rq need not be treated specially wrt. get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 35 ++++++++++++++++++++++++++--------- block/blk-mq-sched.c | 42 +++++------------------------------------- block/blk-mq.c | 41 ++++++----------------------------------- 3 files changed, 37 insertions(+), 81 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index a9773d2075ac..f17170675917 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. 
*/ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e7094f44afaf..01a43fed6b8c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } - if (has_sched) { + if (has_sched) rq->rq_flags |= RQF_SORTED; - WARN_ON(rq->tag != -1); - } return false; } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. - */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + blk_insert_flush(rq); + goto run; } + WARN_ON(e && (rq->tag != -1)); + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; @@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). 
- */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, true, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 14f6886fbec8..c501cbd0de93 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -996,30 +998,6 @@ done: return rq->tag != -1; } -/* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. - */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. @@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, nxt = list_first_entry(list, struct request, queuelist); blk_mq_put_driver_tag(nxt); } - blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; -- cgit v1.2.3-58-ga151 From 05b79413946d8b2b58999ea1ae844b6fc3c54f61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 10:38:29 -0700 Subject: Revert "blk-mq: don't handle TAG_SHARED in restart" This reverts commit 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b. We have cases that aren't covered 100% in the drivers, so for now we have to retain the shared tag restart loops. 
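The loop being restored matters when several request queues share one tag set (BLK_MQ_F_TAG_SHARED): a tag freed on one queue may be exactly what a starved hardware queue of a sibling queue is waiting for. The walk visits the other queues in the tag list first and then the local queue's hardware queues starting at hctx->queue_num + 1, so restarts are spread round-robin instead of always favouring the same hctx. In outline (full hunk below):

        if (set->flags & BLK_MQ_F_TAG_SHARED) {
                /* walk every queue sharing this tag set, round-robin,
                 * and restart the first hctx found with
                 * BLK_MQ_S_SCHED_RESTART set */
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }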
Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 01a43fed6b8c..6f4bdb8209f7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,17 +68,25 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; + return false; - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return; + return true; } + + return false; } /* @@ -362,6 +370,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. + * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) + +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. + */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; + } +done: + rcu_read_unlock(); + } else { + blk_mq_sched_restart_hctx(hctx); + } +} + void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { -- cgit v1.2.3-58-ga151 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. 
In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 11 +++++------ arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/blk-settings.c | 2 +- include/linux/bio.h | 12 ------------ 4 files changed, 8 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01c0a03407cc..86927029a52d 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the bio_kmap_irq and friends -routines as appropriate. A driver could also use the blk_queue_bounce() -routine on its own to bounce highmem i/o to low memory for specific requests -if so desired. +kmaps as needed on such occasions as appropriate. A driver could also use +the blk_queue_bounce() routine on its own to bounce highmem i/o to low +memory for specific requests if so desired. iii. The i/o scheduler algorithm itself can be replaced/set as appropriate @@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to -temporarily map a bio into the virtual address space. +(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map +a bio into the virtual address space. 8. Prior/Related/Impacted patches diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index c45b90bb9339..eacf1e433518 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = __bio_kmap_atomic(bio, iter); + char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - __bio_kunmap_atomic(buffer); + kunmap_atomic(buffer) } bio_endio(bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c52..48ebe6be07b7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. 
**/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f..d4eec19a6d3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.3-58-ga151 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 18 ++++++++---------- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7..452f94f1c5d4 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. 
*/ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd85..edcfff974527 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eab..92fd870e1315 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int diff --git a/include/linux/blkdev.h 
b/include/linux/blkdev.h index 225617dd0a3f..1437ef4d8037 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.3-58-ga151 From 0c6af1ccd5fd9ac640aef01c8de0043837451a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 8 Nov 2017 09:11:22 +0800 Subject: blk-mq: put driver tag if dispatch budget can't be got We have to put the driver tag if dispatch budget can't be got, otherwise it might cause IO deadlock, especially in case that size of tags is very small. Fixes: de1482974080(blk-mq: introduce .get_budget and .put_budget in blk_mq_ops) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c501cbd0de93..3d759bb8a5bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1080,8 +1080,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); break; + } list_del_init(&rq->queuelist); -- cgit v1.2.3-58-ga151 From aba7afc5671c23beade64d10caf86e24a9105dab Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Nov 2017 10:23:45 -0800 Subject: blk-mq: Avoid that request queue removal can trigger list corruption Avoid that removal of a request queue sporadically triggers the following warning: list_del corruption. next->prev should be ffff8807d649b970, but was 6b6b6b6b6b6b6b6b WARNING: CPU: 3 PID: 342 at lib/list_debug.c:56 __list_del_entry_valid+0x92/0xa0 Call Trace: process_one_work+0x11b/0x660 worker_thread+0x3d/0x3b0 kthread+0x129/0x140 ret_from_fork+0x27/0x40 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b8d1aa2d1008..5e81dcf4690a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,6 +339,7 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { -- cgit v1.2.3-58-ga151 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. 
That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 85 ++++++++++++++++++++++++++++---------------------- include/linux/blk-mq.h | 5 ++- 3 files changed, 50 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..fed8165973a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ done: return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. 
*/ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. 
*/ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3-58-ga151 From 055f6e18e08f5b7fd98171fce857a0bad87a919d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 10:49:53 -0800 Subject: block: Make q_usage_counter also track legacy requests This patch makes it possible to pause request allocation for the legacy block layer by calling blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). Signed-off-by: Ming Lei [ bvanassche: Combined two patches into one, edited a comment and made sure REQ_NOWAIT is handled properly in blk_old_get_request() ] Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++++++++ block/blk-mq.c | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5e81dcf4690a..a4362849059a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -612,6 +612,9 @@ void blk_set_queue_dying(struct request_queue *q) } spin_unlock_irq(q->queue_lock); } + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -1398,16 +1401,22 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, gfp_t gfp_mask) { struct request *rq; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || + (op & REQ_NOWAIT)); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1579,6 +1588,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1860,8 +1870,10 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. 
*/ + blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; diff --git a/block/blk-mq.c b/block/blk-mq.c index fed8165973a3..7173d4bd64af 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -126,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -256,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); - - /* - * If we are called because the queue has now been marked as - * dying, we need to ensure that processes currently waiting on - * the queue are notified as well. - */ - wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3-58-ga151 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL into __GFP_DIRECT_RECLAIM. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 50 +++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 +++ 2 files changed, 38 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a4362849059a..0f7093dfc010 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1160,7 +1160,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLQ_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1170,7 +1170,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1179,6 +1179,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1339,7 +1341,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. 
@@ -1349,7 +1351,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1361,7 +1363,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1370,7 +1372,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1397,10 +1399,13 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, unsigned int flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1413,7 +1418,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1427,25 +1432,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + unsigned int flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1871,7 +1891,7 @@ get_rq: * Returns with the queue unlocked. 
*/ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d8037..1af5ddd0f631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3-58-ga151 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 0f7093dfc010..17eed16a6e04 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1263,6 +1263,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1444,7 +1446,7 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7173d4bd64af..e21876778cec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352c..82b56609736a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.3-58-ga151 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the 
next patch to let the block layer core know whether or not a SCSI request queue has been quiesced. A quiesced SCSI queue namely only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 6 ++++++ 3 files changed, 37 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17eed16a6e04..edc276899116 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,6 +348,36 @@ void blk_sync_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_sync_queue); +/** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. + */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bb7f08415203..e4f2bb936e66 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -74,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f631..2147e2381a22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3-58-ga151 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. 
* The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why SCSI devices are quiesced while the hibernation image is being written or restored. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests from being submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++-------- block/blk-mq.c | 4 ++-- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++------------ fs/block_dev.c | 4 ++-- include/linux/blkdev.h | 2 +- include/scsi/scsi_device.h | 1 + 6 files changed, 70 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index edc276899116..29b08428ae45 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q) spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, unsigned int flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen.
+ */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || - (op & REQ_NOWAIT)); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio) if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - if (unlikely(blk_queue_enter(q, nowait))) { + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else diff --git a/block/blk-mq.c b/block/blk-mq.c index e21876778cec..211bc8a3e2cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebe..f907e2f8c1dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+ */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4afa4d5ff969..04973f484422 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22..402c9d536ae1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708..6f0f1e242e23 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -219,6 +219,7 @@ struct scsi_device { unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; + struct task_struct *quiesced_by; unsigned long sdev_data[0]; } __attribute__((aligned(sizeof(unsigned long)))); -- cgit v1.2.3-58-ga151 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. 
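(As an illustration of the kind of misuse sparse can now catch, a sketch only, not taken from the patch; sketch_alloc() is a hypothetical caller.)

#include <linux/blk-mq.h>

static struct request *sketch_alloc(struct request_queue *q)
{
	/* fine: the flag constants carry the __bitwise restricted type */
	struct request *rq = blk_mq_alloc_request(q, REQ_OP_READ,
						  BLK_MQ_REQ_NOWAIT);

	/*
	 * sparse would warn on the call below: a bare integer is not a
	 * blk_mq_req_flags_t, so either the named constant above or an
	 * explicit (__force blk_mq_req_flags_t) cast is required.
	 *
	 * rq = blk_mq_alloc_request(q, REQ_OP_READ, 1);
	 */
	return rq;
}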
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 8 files changed, 30 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 29b08428ae45..7c54c195e79e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -801,7 +801,7 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ -int blk_queue_enter(struct request_queue *q, unsigned int flags) +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool preempt = flags & BLK_MQ_REQ_PREEMPT; @@ -1225,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1408,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1458,7 +1458,7 @@ retry: /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, unsigned int flags) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : @@ -1495,7 +1495,7 @@ static struct request *blk_old_get_request(struct request_queue *q, * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct request *req; @@ -2291,7 +2291,7 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 211bc8a3e2cc..bfe24a5b62a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -383,7 +383,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -409,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 2502f40ccdc0..99a19c5523e2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -111,7 +111,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa..878d5c09d15b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b7..23b8504ace7f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736a..b326208277ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 
<< 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb..13ccfc9b210a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae1..e80ea1d31343 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.3-58-ga151 From 17eac0996341cfd28b4093554e662fd82aa1c237 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: block: create 'slaves' and 'holders' entries for hidden gendisks When creating nvme multipath devices we should populate the 'slaves' and 'holders' directorys properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 835e907e6e01..3de1671631bf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,14 +585,14 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (disk->flags & GENHD_FL_HIDDEN) { dev_set_uevent_suppress(ddev, 0); return; } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - /* No minors to use for partitions */ if (!disk_part_scan_enabled(disk)) goto exit; @@ -728,11 +728,11 @@ void del_gendisk(struct gendisk *disk) WARN_ON(1); } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - } + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; -- cgit v1.2.3-58-ga151 From f0fba398fec65ff5a2e1bf8ae62718ec0450abaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Nov 2017 17:53:53 +0000 Subject: block: avoid null pointer dereference on null disk It is possible that the pointer disk can be null and hence we can get a null pointer deference when accessing disk->flags. Add a null pointer check to avoid the dereference. Detected by CoverityScan, CID#1461133 ("Explicit null dereferenced") Fixes: 8ddcd653257c ("block: introduce GENHD_FL_HIDDEN") Reviewed-by: Christoph Hellwig Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3de1671631bf..997e598f3b86 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -798,7 +798,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { put_disk(disk); disk = NULL; } -- cgit v1.2.3-58-ga151 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This results in a lot of extra and pointless queue runs, potentially, on flush requests and (much worse) on tag starvation situations. This is observable just looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it. 
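(Illustrative sketch only, not from the patch: with the check folded in, a caller no longer open-codes the pending test and can use the new return value to learn whether the queue was actually run; sketch_kick_hctx() is hypothetical.)

#include <linux/blk-mq.h>

static bool sketch_kick_hctx(struct blk_mq_hw_ctx *hctx)
{
	/*
	 * Before: if (blk_mq_hctx_has_pending(hctx))
	 *                 blk_mq_run_hw_queue(hctx, true);
	 *
	 * After: the run helper performs the pending check itself and
	 * returns true only if the hardware queue had work and was run.
	 */
	return blk_mq_run_hw_queue(hctx, true);
}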
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 +------ block/blk-mq.c | 18 +++++++++++------- block/blk-mq.h | 2 -- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 6f4bdb8209f7..c117bd8fd1f6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,12 +81,7 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - - return false; + return blk_mq_run_hw_queue(hctx, true); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index bfe24a5b62a3..a2a4271f5ab8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -61,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -1253,9 +1253,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1265,8 +1270,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); diff --git a/block/blk-mq.h b/block/blk-mq.h index 99a19c5523e2..dcf379a892dd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -26,14 +26,12 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee..eb1e2cdffb31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void 
blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3-58-ga151 From f906a6a0f42684715b552ccff9282b22cfe2b5a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 16:10:13 -0700 Subject: blk-mq: improve tag waiting setup for non-shared tags If we run out of driver tags, we currently treat shared and non-shared tags the same - both cases hook into the tag waitqueue. This is a bit more costly than it needs to be on unshared tags, since we have to both grab the hctx lock, and the waitqueue lock (and disable interrupts). For the non-shared case, we can simply mark the queue as needing a restart. Split blk_mq_dispatch_wait_add() to account for both cases, and rename it to blk_mq_mark_tag_wait() to better reflect what it does now. Without this patch, shared and non-shared performance is about the same with 4 fio thread hammering on a single null_blk device (~410K, at 75% sys). With the patch, the shared case is the same, but the non-shared tags case runs at 431K at 71% sys. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 81 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a2a4271f5ab8..3295859f419d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1006,44 +1006,68 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, - struct request *rq) +/* + * Mark us waiting for a tag. For shared tags, this involves hooking us into + * the tag wakeups. For non-shared tags, we can simply mark us nedeing a + * restart. For both caes, take care to check the condition again after + * marking us as waiting. + */ +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, + struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - wait_queue_entry_t *wait = &this_hctx->dispatch_wait; + bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; + wait_queue_entry_t *wait; + bool ret; - if (!list_empty_careful(&wait->entry)) - return false; + if (!shared_tags) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + } else { + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + } /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ - if (!blk_mq_get_driver_tag(rq, hctx, false)) { + ret = blk_mq_get_driver_tag(rq, hctx, false); + + if (!shared_tags) { + /* + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. 
+ */ + return ret; + } else { + if (!ret) { + spin_unlock(&this_hctx->lock); + return false; + } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); spin_unlock(&this_hctx->lock); - return false; + return true; } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); - return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, @@ -1076,10 +1100,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); - no_tag = true; + /* + * For non-shared tags, the RESTART check + * will suffice. + */ + if (hctx->flags & BLK_MQ_F_TAG_SHARED) + no_tag = true; break; } } -- cgit v1.2.3-58-ga151 From ff821d271415f17f1a704600420e92ebb9bfb32c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 22:05:12 -0700 Subject: blk-mq: fixup some comment typos and lengths Various typos and/or spelling errors in comments. Fixes a few > 80 char lines as well. Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3295859f419d..b600463791ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -707,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, /* * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertation from the workqueue. + * request head insertion from the workqueue. */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); @@ -1137,7 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE) { /* * If an I/O scheduler has been configured and we got a - * driver tag for the next request already, free it again. + * driver tag for the next request already, free it + * again. */ if (!list_empty(list)) { nxt = list_first_entry(list, struct request, queuelist); @@ -2299,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_lock(&set->tag_list_lock); - /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ - if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + /* + * Check to see if we're transitioning to shared (from 1 to 2 queues). + */ + if (!list_empty(&set->tag_list) && + !(set->flags & BLK_MQ_F_TAG_SHARED)) { set->flags |= BLK_MQ_F_TAG_SHARED; /* update existing queue */ blk_mq_update_tag_set_depth(set, true); @@ -2532,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q) /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe - * we should change hctx numa_node according to new topology (this - * involves free and re-allocate memory, worthy doing?) + * we should change hctx numa_node according to the new topology (this + * involves freeing and re-allocating memory, worth doing?) */ - blk_mq_map_swqueue(q); blk_mq_sysfs_register(q); -- cgit v1.2.3-58-ga151
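To make the shared/non-shared split in blk_mq_mark_tag_wait() above easier to follow, here is a standalone sketch of the same decision logic with stand-in names (hctx, mark_tag_wait, get_driver_tag, restart_flag, on_waitqueue). It is a simplified model, not the kernel implementation: locking, the sbitmap waitqueue and the hctx repointing are all elided. What it keeps is the shape of the patch: non-shared tags only set a restart flag, shared tags hook into a waitqueue, and both paths retry the tag allocation once after marking themselves.

/*
 * Standalone sketch with stand-in types, not the kernel code: models the
 * shared vs. non-shared tag handling added by blk_mq_mark_tag_wait().
 */
#include <stdbool.h>
#include <stdio.h>

struct hctx {
	bool shared_tags;	/* stand-in for BLK_MQ_F_TAG_SHARED */
	bool restart_flag;	/* stand-in for BLK_MQ_S_SCHED_RESTART */
	bool on_waitqueue;	/* stand-in for "dispatch_wait is queued" */
	int free_tags;		/* crude model of the driver tag space */
};

/* Stand-in for blk_mq_get_driver_tag(): succeeds while tags remain. */
static bool get_driver_tag(struct hctx *h)
{
	if (h->free_tags > 0) {
		h->free_tags--;
		return true;
	}
	return false;
}

/* Returns true if a tag was obtained after all, false if dispatch must stop. */
static bool mark_tag_wait(struct hctx *h)
{
	bool got;

	if (!h->shared_tags) {
		/* Cheap path: just note that a queue re-run is needed. */
		h->restart_flag = true;
	} else {
		/* Costly path: hook into the tag waitqueue (locking elided). */
		if (h->on_waitqueue)
			return false;
		h->on_waitqueue = true;
	}

	/*
	 * A tag may have been freed between the allocation failure and the
	 * marking above, so check once more in both cases.
	 */
	got = get_driver_tag(h);

	if (!h->shared_tags)
		return got;		/* leave the restart flag set */

	if (got)
		h->on_waitqueue = false; /* hand the wakeup to someone else */
	return got;
}

int main(void)
{
	struct hctx unshared = { .shared_tags = false, .free_tags = 0 };
	struct hctx shared = { .shared_tags = true, .free_tags = 0 };
	bool got_unshared, got_shared;

	got_unshared = mark_tag_wait(&unshared);
	printf("unshared: got=%d restart=%d\n", got_unshared, unshared.restart_flag);

	got_shared = mark_tag_wait(&shared);
	printf("shared:   got=%d queued=%d\n", got_shared, shared.on_waitqueue);
	return 0;
}

Run with no free tags, the non-shared context comes back with got=0 and restart=1 (a later queue run will suffice), while the shared context comes back with got=0 and stays queued on the waitqueue; this mirrors why the dispatch-loop hunk above only sets no_tag when BLK_MQ_F_TAG_SHARED is set.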