From 9cf1adc6d34f8bb12333afe189a2999131877ea3 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sat, 20 Mar 2021 04:22:22 +0530 Subject: blk-mq: Sentence reconstruct for better readability Sentence reconstruction for better readability. Signed-off-by: Bhaskar Chowdhury Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9c92053e704d..e5bfecf2940d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -373,8 +373,8 @@ static bool blk_mq_tagset_count_completed_rqs(struct request *rq, } /** - * blk_mq_tagset_wait_completed_request - wait until all completed req's - * complete funtion is run + * blk_mq_tagset_wait_completed_request - Wait until all scheduled request + * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown -- cgit v1.2.3-58-ga151 From 2ec5a5c48373d4bc2f0699f86507a65bf0b9df35 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:22 +0100 Subject: block, bfq: always inject I/O of queues blocked by wakers Suppose that I/O dispatch is plugged, to wait for new I/O for the in-service bfq-queue, say bfqq. Suppose then that there is a further bfq_queue woken by bfqq, and that this woken queue has pending I/O. A woken queue does not steal bandwidth from bfqq, because it remains soon without I/O if bfqq is not served. So there is virtually no risk of loss of bandwidth for bfqq if this woken queue has I/O dispatched while bfqq is waiting for new I/O. In contrast, this extra I/O injection boosts throughput. This commit performs this extra injection. Tested-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 32 +++++++++++++++++++++++++++----- block/bfq-wf2q.c | 8 ++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95586137194e..eb249775029e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4491,9 +4491,15 @@ check_queue: bfq_bfqq_busy(bfqq->bic->bfqq[0]) && bfqq->bic->bfqq[0]->next_rq ? bfqq->bic->bfqq[0] : NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, + struct bfq_queue, + woken_list_node) + : NULL; /* - * The next three mutually-exclusive ifs decide + * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to * pick an I/O request from. * @@ -4526,7 +4532,15 @@ check_queue: * next bfqq's I/O is brought forward dramatically, * for it is not blocked for milliseconds. * - * The third if checks whether bfqq is a queue for + * The third if checks whether there is a queue woken + * by bfqq, and currently with pending I/O. Such a + * woken queue does not steal bandwidth from bfqq, + * because it remains soon without I/O if bfqq is not + * served. So there is virtually no risk of loss of + * bandwidth for bfqq if this woken queue has I/O + * dispatched while bfqq is waiting for new I/O. + * + * The fourth if checks whether bfqq is a queue for * which it is better to avoid injection. 
It is so if * bfqq delivers more throughput when served without * any further I/O from other queues in the middle, or @@ -4546,11 +4560,11 @@ check_queue: * bfq_update_has_short_ttime(), it is rather likely * that, if I/O is being plugged for bfqq and the * waker queue has pending I/O requests that are - * blocking bfqq's I/O, then the third alternative + * blocking bfqq's I/O, then the fourth alternative * above lets the waker queue get served before the * I/O-plugging timeout fires. So one may deem the * second alternative superfluous. It is not, because - * the third alternative may be way less effective in + * the fourth alternative may be way less effective in * case of a synchronization. For two main * reasons. First, throughput may be low because the * inject limit may be too low to guarantee the same @@ -4559,7 +4573,7 @@ check_queue: * guarantees (the second alternative unconditionally * injects a pending I/O request of the waker queue * for each bfq_dispatch_request()). Second, with the - * third alternative, the duration of the plugging, + * fourth alternative, the duration of the plugging, * i.e., the time before bfqq finally receives new I/O, * may not be minimized, because the waker queue may * happen to be served only after other queues. @@ -4577,6 +4591,14 @@ check_queue: bfq_bfqq_budget_left(bfqq->waker_bfqq) ) bfqq = bfqq->waker_bfqq; + else if (blocked_bfqq && + bfq_bfqq_busy(blocked_bfqq) && + blocked_bfqq->next_rq && + bfq_serv_to_charge(blocked_bfqq->next_rq, + blocked_bfqq) <= + bfq_bfqq_budget_left(blocked_bfqq) + ) + bfqq = blocked_bfqq; else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || !bfq_bfqq_has_short_ttime(bfqq))) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 070e34a7feb1..7a462df71f68 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1706,4 +1706,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; + + /* Move bfqq to the head of the woken list of its waker */ + if (!hlist_unhashed(&bfqq->woken_list_node) && + &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) { + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } } -- cgit v1.2.3-58-ga151 From 7cc4ffc55564df4349050bcbf46fbdf3f35aef52 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:23 +0100 Subject: block, bfq: put reqs of waker and woken in dispatch list Consider a new I/O request that arrives for a bfq_queue bfqq. If, when this happens, the only active bfq_queues are bfqq and either its waker bfq_queue or one of its woken bfq_queues, then there is no point in queueing this new I/O request in bfqq for service. In fact, the in-service queue and bfqq agree on serving this new I/O request as soon as possible. So this commit puts this new I/O request directly into the dispatch list. 
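For readability, the new bypass condition can be sketched as a standalone predicate (a simplified paraphrase of the condition added in bfq_insert_request() below; the helper name is hypothetical and not part of the patch):

static bool bfq_bypass_to_dispatch(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	struct bfq_queue *in_serv = bfqd->in_service_queue;

	/* bfqq and the in-service queue must be the only busy queues */
	if (!in_serv || bfqq == in_serv ||
	    bfq_tot_busy_queues(bfqd) != 1 + bfq_bfqq_busy(bfqq))
		return false;

	/* and one of the two must be the waker of the other */
	return bfqq->waker_bfqq == in_serv || in_serv->waker_bfqq == bfqq;
}
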
Tested-by: Jan Kara Acked-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-3-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index eb249775029e..df840f37889d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5649,7 +5649,49 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { + + /* + * Reqs with at_head or passthrough flags set are to be put + * directly into dispatch list. Additional case for putting rq + * directly into the dispatch queue: the only active + * bfq_queues are bfqq and either its waker bfq_queue or one + * of its woken bfq_queues. The rationale behind this + * additional condition is as follows: + * - consider a bfq_queue, say Q1, detected as a waker of + * another bfq_queue, say Q2 + * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., + * some I/O of Q1 needs to be completed for new I/O of Q2 + * to arrive. A notable example of waker is journald + * - so, Q1 and Q2 are in any respect the queues of two + * cooperating processes (or of two cooperating sets of + * processes): the goal of Q1's I/O is doing what needs to + * be done so that new Q2's I/O can finally be + * issued. Therefore, if the service of Q1's I/O is delayed, + * then Q2's I/O is delayed too. Conversely, if Q2's I/O is + * delayed, the goal of Q1's I/O is hindered. + * - as a consequence, if some I/O of Q1/Q2 arrives while + * Q2/Q1 is the only queue in service, there is absolutely + * no point in delaying the service of such an I/O. The + * only possible result is a throughput loss + * - so, when the above condition holds, the best option is to + * have the new I/O dispatched as soon as possible + * - the most effective and efficient way to attain the above + * goal is to put the new I/O directly in the dispatch + * list + * - as an additional restriction, Q1 and Q2 must be the only + * busy queues for this commit to put the I/O of Q2/Q1 in + * the dispatch list. This is necessary, because, if also + * other queues are waiting for service, then putting new + * I/O directly in the dispatch list may evidently cause a + * violation of service guarantees for the other queues + */ + if (!bfqq || + (bfqq != bfqd->in_service_queue && + bfqd->in_service_queue != NULL && + bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && + (bfqq->waker_bfqq == bfqd->in_service_queue || + bfqd->in_service_queue->waker_bfqq == bfqq)) || + at_head || blk_rq_is_passthrough(rq)) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else -- cgit v1.2.3-58-ga151 From 8ef3fc3a043cd4b3dfdb260f02be5f65cc31445d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:24 +0100 Subject: block, bfq: make shared queues inherit wakers Consider a bfq_queue bfqq that is about to be merged with another bfq_queue new_bfqq. The processes associated with bfqq are cooperators of the processes associated with new_bfqq. So, if bfqq has a waker, then it is reasonable (and beneficial for throughput) to assume that all these processes will be happy to let bfqq's waker freely inject I/O when they have no I/O. So this commit makes new_bfqq inherit bfqq's waker. 
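In outline, the inheritance step applied in bfq_merge_bfqqs() below reads (a condensed view of the hunk, not additional code):

	if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
	    bfqq->waker_bfqq != new_bfqq) {
		new_bfqq->waker_bfqq = bfqq->waker_bfqq;
		new_bfqq->tentative_waker_bfqq = NULL;
		/* hook into the waker's woken_list, so the pointer can
		 * be reset if the waker queue disappears */
		hlist_add_head(&new_bfqq->woken_list_node,
			       &new_bfqq->waker_bfqq->woken_list);
	}

The guard avoids overwriting a waker that new_bfqq already has and avoids making new_bfqq its own waker.
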
Tested-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-4-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index df840f37889d..b00dcc46bac7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2822,6 +2822,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); + /* + * The processes associated with bfqq are cooperators of the + * processes associated with new_bfqq. So, if bfqq has a + * waker, then assume that all these processes will be happy + * to let bfqq's waker freely inject I/O when they have no + * I/O. + */ + if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq && + bfqq->waker_bfqq != new_bfqq) { + new_bfqq->waker_bfqq = bfqq->waker_bfqq; + new_bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be reset. So insert + * new_bfqq into the woken_list of the waker. See + * bfq_check_waker for details. + */ + hlist_add_head(&new_bfqq->woken_list_node, + &new_bfqq->waker_bfqq->woken_list); + + } + /* * If bfqq is weight-raised, then let new_bfqq inherit * weight-raising. To reduce false positives, neglect the case @@ -6310,7 +6333,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) @@ -6319,11 +6342,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq = bfq_split_bfqq(bic, bfqq); split = true; - if (!bfqq) + if (!bfqq) { bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); - else + bfqq->waker_bfqq = old_bfqq->waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (bfqq->waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } else bfqq_already_existing = true; } } -- cgit v1.2.3-58-ga151 From 8c544770092a3d7532d01903b75721e537d87001 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:25 +0100 Subject: block, bfq: fix weight-raising resume with !low_latency When the io_latency heuristic is off, bfq_queues must not start to be weight-raised. Unfortunately, by mistake, this may happen when the state of a previously weight-raised bfq_queue is resumed after a queue split. This commit fixes this error. 
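The fix simply gates the restore of the saved weight-raising state on bfqd->low_latency (condensed from the hunk in bfq_bfqq_resume_state() below):

	if (bfqd->low_latency) {
		old_wr_coeff = bfqq->wr_coeff;
		bfqq->wr_coeff = bic->saved_wr_coeff;
	}

so the saved, possibly raised, coefficient is no longer applied while the heuristic is off.
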
Tested-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-5-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b00dcc46bac7..2a84294b0525 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1012,7 +1012,7 @@ static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { - unsigned int old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); if (bic->saved_has_short_ttime) @@ -1033,7 +1033,13 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->ttime = bic->saved_ttime; bfqq->io_start_time = bic->saved_io_start_time; bfqq->tot_idle_time = bic->saved_tot_idle_time; - bfqq->wr_coeff = bic->saved_wr_coeff; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; + bfqq->wr_coeff = bic->saved_wr_coeff; + } bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -- cgit v1.2.3-58-ga151 From 85686d0dc1946bd9903efb1c130d634f963e4843 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:26 +0100 Subject: block, bfq: keep shared queues out of the waker mechanism Shared queues are likely to receive I/O at a high rate. This may deceptively let them be considered as wakers of other queues. But a false waker will unjustly steal bandwidth to its supposedly woken queue. So considering also shared queues in the waking mechanism may cause more control troubles than throughput benefits. This commit keeps shared queues out of the waker-detection mechanism. Tested-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-6-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2a84294b0525..a3f59d3065b3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5860,7 +5860,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 1UL<<(BFQ_RATE_SHIFT - 10)) bfq_update_rate_reset(bfqd, NULL); bfqd->last_completion = now_ns; - bfqd->last_completed_rq_bfqq = bfqq; + /* + * Shared queues are likely to receive I/O at a high + * rate. This may deceptively let them be considered as wakers + * of other queues. But a false waker will unjustly steal + * bandwidth to its supposedly woken queue. So considering + * also shared queues in the waking mechanism may cause more + * control troubles than throughput benefits. Then do not set + * last_completed_rq_bfqq to bfqq if bfqq is a shared queue. 
+ */ + if (!bfq_bfqq_coop(bfqq)) + bfqd->last_completed_rq_bfqq = bfqq; /* * If we are waiting to discover whether the request pattern -- cgit v1.2.3-58-ga151 From 430a67f9d6169a7b3e328bceb2ef9542e4153c7c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 Mar 2021 18:46:27 +0100 Subject: block, bfq: merge bursts of newly-created queues Many throughput-sensitive workloads are made of several parallel I/O flows, with all flows generated by the same application, or more generically by the same task (e.g., system boot). The most counterproductive action with these workloads is plugging I/O dispatch when one of the bfq_queues associated with these flows remains temporarily empty. To avoid this plugging, BFQ has been using a burst-handling mechanism for years now. This mechanism has proven effective for throughput, and not detrimental for service guarantees. This commit pushes this mechanism a little bit further, basing on the following two facts. First, all the I/O flows of a the same application or task contribute to the execution/completion of that common application or task. So the performance figures that matter are total throughput of the flows and task-wide I/O latency. In particular, these flows do not need to be protected from each other, in terms of individual bandwidth or latency. Second, the above fact holds regardless of the number of flows. Putting these two facts together, this commits merges stably the bfq_queues associated with these I/O flows, i.e., with the processes that generate these IO/ flows, regardless of how many the involved processes are. To decide whether a set of bfq_queues is actually associated with the I/O flows of a common application or task, and to merge these queues stably, this commit operates as follows: given a bfq_queue, say Q2, currently being created, and the last bfq_queue, say Q1, created before Q2, Q2 is merged stably with Q1 if - very little time has elapsed since when Q1 was created - Q2 has the same ioprio as Q1 - Q2 belongs to the same group as Q1 Merging bfq_queues also reduces scheduling overhead. A fio test with ten random readers on /dev/nullb shows a throughput boost of 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of the total per-request processing time, the above throughput boost implies that BFQ's overhead is reduced by more than 50%. 
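The "more than 50%" figure follows from a back-of-the-envelope check (not part of the patch): a 40% throughput boost means per-request time drops to 1/1.4, about 71% of its former value; with the non-BFQ half of that time unchanged, BFQ's share falls from 0.50 to roughly 0.21, a reduction of about 57%.
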
Tested-by: Jan Kara Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 2 + block/bfq-iosched.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++-- block/bfq-iosched.h | 15 +++ 3 files changed, 266 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e49..e2f14508f2d6 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd) entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; + entity->last_bfqq_created = NULL; + bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a3f59d3065b3..9b7678ad5830 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1075,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - - (bfqq->weight_counter != NULL); + (bfqq->weight_counter != NULL) - bfqq->stable_ref; } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ @@ -2628,6 +2628,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + +static void bfq_put_stable_ref(struct bfq_queue *bfqq); + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return @@ -2650,10 +2655,49 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) + void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Check delayed stable merge for rotational or non-queueing + * devs. For this branch to be executed, bfqq must not be + * currently merged with some other queue (i.e., bfqq->bic + * must be non null). If we considered also merged queues, + * then we should also check whether bfqq has already been + * merged with bic->stable_merge_bfqq. But this would be + * costly and complicated. + */ + if (unlikely(!bfqd->nonrot_with_queueing)) { + if (bic->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_after_jiffies(bfqq->split_time + + msecs_to_jiffies(200))) { + struct bfq_queue *stable_merge_bfqq = + bic->stable_merge_bfqq; + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + + bic->stable_merge_bfqq = NULL; + + if (!idling_boosts_thr_without_issues(bfqd, bfqq) && + proc_ref > 0) { + /* next function will take at least one ref */ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, stable_merge_bfqq); + + bic->stably_merged = true; + if (new_bfqq && new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + return new_bfqq; + } else + return NULL; + } + } + /* * Do not perform queue merging if the device is non * rotational and performs internal queueing. 
In fact, such a @@ -2795,6 +2839,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } + +static void +bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +{ + if (cur_bfqq->entity.parent && + cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) + cur_bfqq->entity.parent->last_bfqq_created = new_bfqq; + else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq) + cur_bfqq->bfqd->last_bfqq_created = new_bfqq; +} + void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* @@ -2812,6 +2867,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq != bfqd->in_service_queue) bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_reassign_last_bfqq(bfqq, NULL); + bfq_put_queue(bfqq); } @@ -2908,6 +2965,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; + + bfq_reassign_last_bfqq(bfqq, new_bfqq); + bfq_release_process_ref(bfqd, bfqq); } @@ -2935,7 +2995,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic); if (new_bfqq) { /* * bic still points to bfqq, then it has not yet been @@ -5034,6 +5094,12 @@ void bfq_put_queue(struct bfq_queue *bfqq) bfqg_and_blkg_put(bfqg); } +static void bfq_put_stable_ref(struct bfq_queue *bfqq) +{ + bfqq->stable_ref--; + bfq_put_queue(bfqq); +} + static void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -5090,6 +5156,24 @@ static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + if (bic->stable_merge_bfqq) { + struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + + /* + * bfqd is NULL if scheduler already exited, and in + * that case this is the last time bfqq is accessed. 
+ */ + if (bfqd) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_put_stable_ref(bic->stable_merge_bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + bfq_put_stable_ref(bic->stable_merge_bfqq); + } + } + bfq_exit_icq_bfqq(bic, true); bfq_exit_icq_bfqq(bic, false); } @@ -5150,7 +5234,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); + struct bfq_io_cq *bic, + bool respawn); static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { @@ -5170,7 +5255,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); bic_set_bfqq(bic, bfqq, false); } @@ -5213,6 +5298,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* set end request to minus infinity from now */ bfqq->ttime.last_end_request = now_ns + 1; + bfqq->creation_time = jiffies; + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -5262,9 +5349,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } } +static struct bfq_queue * +bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) +{ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + + if (!new_bfqq) + return bfqq; + + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + bic->stably_merged = true; + + /* + * Reusing merge functions. This implies that + * bfqq->bic must be set too, for + * bfq_merge_bfqqs to correctly save bfqq's + * state before killing it. + */ + bfqq->bic = bic; + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + + return new_bfqq; +} + +/* + * Many throughput-sensitive workloads are made of several parallel + * I/O flows, with all flows generated by the same application, or + * more generically by the same task (e.g., system boot). The most + * counterproductive action with these workloads is plugging I/O + * dispatch when one of the bfq_queues associated with these flows + * remains temporarily empty. + * + * To avoid this plugging, BFQ has been using a burst-handling + * mechanism for years now. This mechanism has proven effective for + * throughput, and not detrimental for service guarantees. The + * following function pushes this mechanism a little bit further, + * basing on the following two facts. + * + * First, all the I/O flows of a the same application or task + * contribute to the execution/completion of that common application + * or task. So the performance figures that matter are total + * throughput of the flows and task-wide I/O latency. In particular, + * these flows do not need to be protected from each other, in terms + * of individual bandwidth or latency. + * + * Second, the above fact holds regardless of the number of flows. + * + * Putting these two facts together, this commits merges stably the + * bfq_queues associated with these I/O flows, i.e., with the + * processes that generate these IO/ flows, regardless of how many the + * involved processes are. 
+ * + * To decide whether a set of bfq_queues is actually associated with + * the I/O flows of a common application or task, and to merge these + * queues stably, this function operates as follows: given a bfq_queue, + * say Q2, currently being created, and the last bfq_queue, say Q1, + * created before Q2, Q2 is merged stably with Q1 if + * - very little time has elapsed since when Q1 was created + * - Q2 has the same ioprio as Q1 + * - Q2 belongs to the same group as Q1 + * + * Merging bfq_queues also reduces scheduling overhead. A fio test + * with ten random readers on /dev/nullb shows a throughput boost of + * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of + * the total per-request processing time, the above throughput boost + * implies that BFQ's overhead is reduced by more than 50%. + * + * This new mechanism most certainly obsoletes the current + * burst-handling heuristics. We keep those heuristics for the moment. + */ +static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct bfq_queue **source_bfqq = bfqq->entity.parent ? + &bfqq->entity.parent->last_bfqq_created : + &bfqd->last_bfqq_created; + + struct bfq_queue *last_bfqq_created = *source_bfqq; + + /* + * If last_bfqq_created has not been set yet, then init it. If + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a + * different ioprio or ioprio_class. If none of these + * conditions holds true, then try an early stable merge or + * schedule a delayed stable merge. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more + * throughput-beneficial if not merged. Currently this is + * possible only if bfqd is rotational with no queueing. For + * such a drive, not merging bfqq is better for throughput if + * bfqq happens to contain sequential I/O. So, we wait a + * little bit for enough I/O to flow through bfqq. After that, + * if such an I/O is sequential, then the merge is + * canceled. Otherwise the merge is finally performed. + */ + if (!last_bfqq_created || + time_before(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || + bfqq->ioprio_class != last_bfqq_created->ioprio_class) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time)) { + if (likely(bfqd->nonrot_with_queueing)) + /* + * With this type of drive, leaving + * bfqq alone may provide no + * throughput benefits compared with + * merging bfqq. So merge bfqq now. + */ + bfqq = bfq_do_early_stable_merge(bfqd, bfqq, + bic, + last_bfqq_created); + else { /* schedule tentative stable merge */ + /* + * get reference on last_bfqq_created, + * to prevent it from being freed, + * until we decide whether to merge + */ + last_bfqq_created->ref++; + /* + * need to keep track of stable refs, to + * compute process refs correctly + */ + last_bfqq_created->stable_ref++; + /* + * Record the bfqq to merge to. 
+ */ + bic->stable_merge_bfqq = last_bfqq_created; + } + } + + return bfqq; +} + + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) + struct bfq_io_cq *bic, + bool respawn) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); @@ -5322,7 +5556,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, out: bfqq->ref++; /* get a process reference to this queue */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + + if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) + bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); + rcu_read_unlock(); return bfqq; } @@ -5572,7 +5809,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), - *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true, + RQ_BIC(rq)); bool waiting, idle_timer_disabled = false; if (new_bfqq) { @@ -6227,7 +6465,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, if (bfqq) bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { @@ -6348,7 +6586,8 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && + !bic->stably_merged) { struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index b8e793c34ff1..99c2a3cb081e 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -197,6 +197,9 @@ struct bfq_entity { /* flag, set if the entity is counted in groups_with_pending_reqs */ bool in_groups_with_pending_reqs; + + /* last child queue of entity created (for non-leaf entities) */ + struct bfq_queue *last_bfqq_created; }; struct bfq_group; @@ -230,6 +233,8 @@ struct bfq_ttime { struct bfq_queue { /* reference counter */ int ref; + /* counter of references from other queues for delayed stable merge */ + int stable_ref; /* parent bfq_data */ struct bfq_data *bfqd; @@ -365,6 +370,8 @@ struct bfq_queue { unsigned long first_IO_time; /* time of first I/O for this queue */ + unsigned long creation_time; /* when this queue is created */ + /* max service rate measured so far */ u32 max_service_rate; @@ -454,6 +461,11 @@ struct bfq_io_cq { u64 saved_last_serv_time_ns; unsigned int saved_inject_limit; unsigned long saved_decrease_time_jif; + + /* candidate queue for a stable merge (due to close creation time) */ + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ }; /** @@ -578,6 +590,9 @@ struct bfq_data { /* bfqq owning the last completed rq */ struct bfq_queue *last_completed_rq_bfqq; + /* last bfqq created, among those in the root group */ + struct bfq_queue *last_bfqq_created; + /* time of last transition from empty to non-empty (ns) */ u64 last_empty_occupied_ns; -- cgit v1.2.3-58-ga151 From 28af742875d7d2d8ae008701c60cd4b238f3e2b2 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 5 Apr 2021 13:20:12 +0000 Subject: block: add sysfs entry for virt boundary mask This entry will expose the bio vector alignment 
mask for a specific block device. Signed-off-by: Max Gurtovoy Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210405132012.12504-1-mgurtovoy@nvidia.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0f4f0c8a7825..e03bedf180ab 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -60,7 +60,7 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_requests_show(struct request_queue *q, char *page) { - return queue_var_show(q->nr_requests, (page)); + return queue_var_show(q->nr_requests, page); } static ssize_t @@ -264,6 +264,11 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.virt_boundary_mask, (page)); +} + #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ queue_##name##_show(struct request_queue *q, char *page) \ @@ -610,6 +615,7 @@ QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); @@ -670,6 +676,7 @@ static struct attribute *queue_attrs[] = { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif + &queue_virt_boundary_mask_entry.attr, NULL, }; -- cgit v1.2.3-58-ga151 From 39aa56db50b9ca5cad597e561b4b160b6cbbb65b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 11 Mar 2021 10:17:13 +0200 Subject: blk-mq: Always use blk_mq_is_sbitmap_shared Signed-off-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210311081713.2763171-1-nborisov@suse.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e5bfecf2940d..2a37731e8244 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -517,7 +517,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + if (blk_mq_is_sbitmap_shared(flags)) return tags; if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { @@ -529,7 +529,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { - if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(tags->breserved_tags); } -- cgit v1.2.3-58-ga151 From ce288e0535688cc3475a3c3d4d96624514c3550c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 Mar 2021 09:29:59 +0200 Subject: block: remove BLK_BOUNCE_ISA support Remove the BLK_BOUNCE_ISA support now that all users are gone. Signed-off-by: Christoph Hellwig Acked-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210331073001.46776-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 3 +- block/blk-map.c | 4 +- block/blk-settings.c | 11 ---- block/blk.h | 5 -- block/bounce.c | 124 ++++++++++------------------------------------ block/scsi_ioctl.c | 2 +- drivers/ata/libata-scsi.c | 3 +- include/linux/blkdev.h | 7 --- mm/Kconfig | 9 ++-- 9 files changed, 35 insertions(+), 133 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index dfa652122a2d..4b4eb8964a6f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -204,7 +204,6 @@ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct request_queue *q = bio->bi_bdev->bd_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -238,7 +237,7 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = intervals * bi->tuple_size; - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); diff --git a/block/blk-map.c b/block/blk-map.c index 1ffef782fcf2..b62b52dcb61d 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -181,7 +181,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, i++; } else { - page = alloc_page(rq->q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; goto cleanup; @@ -486,7 +486,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) goto cleanup; diff --git a/block/blk-settings.c b/block/blk-settings.c index b4aa2f37fab6..f9937dd2810e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -103,28 +103,17 @@ EXPORT_SYMBOL(blk_set_stacking_limits); void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) { unsigned long b_pfn = max_addr >> PAGE_SHIFT; - int dma = 0; - q->bounce_gfp = GFP_NOIO; #if BITS_PER_LONG == 64 /* * Assume anything <= 4GB can be handled by IOMMU. Actually * some IOMMUs can handle everything, but I don't know of a * way to test this here. 
*/ - if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else - if (b_pfn < blk_max_low_pfn) - dma = 1; q->limits.bounce_pfn = b_pfn; #endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; - } } EXPORT_SYMBOL(blk_queue_bounce_limit); diff --git a/block/blk.h b/block/blk.h index 3b53e44b967e..895c9f4a5182 100644 --- a/block/blk.h +++ b/block/blk.h @@ -312,13 +312,8 @@ static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif #ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); #else -static inline int init_emergency_isa_pool(void) -{ - return 0; -} static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { } diff --git a/block/bounce.c b/block/bounce.c index 6c441f4f1cd4..debd5b0bd318 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -29,7 +29,7 @@ #define ISA_POOL_SIZE 16 static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool, isa_page_pool; +static mempool_t page_pool; static void init_bounce_bioset(void) { @@ -89,41 +89,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) #endif /* CONFIG_HIGHMEM */ -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -static DEFINE_MUTEX(isa_mutex); - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - int ret; - - mutex_lock(&isa_mutex); - - if (mempool_initialized(&isa_page_pool)) { - mutex_unlock(&isa_mutex); - return 0; - } - - ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(ret); - - pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); - init_bounce_bioset(); - mutex_unlock(&isa_mutex); - return 0; -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. 
kmap it @@ -159,7 +124,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) } } -static void bounce_end_io(struct bio *bio, mempool_t *pool) +static void bounce_end_io(struct bio *bio) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, orig_vec; @@ -173,7 +138,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + mempool_free(bvec->bv_page, &page_pool); } bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } @@ -185,33 +150,17 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, &page_pool); -} - -static void bounce_end_io_write_isa(struct bio *bio) -{ - - bounce_end_io(bio, &isa_page_pool); + bounce_end_io(bio); } -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +static void bounce_end_io_read(struct bio *bio) { struct bio *bio_orig = bio->bi_private; if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool); -} - -static void bounce_end_io_read(struct bio *bio) -{ - __bounce_end_io_read(bio, &page_pool); -} - -static void bounce_end_io_read_isa(struct bio *bio) -{ - __bounce_end_io_read(bio, &isa_page_pool); + bounce_end_io(bio); } static struct bio *bounce_clone_bio(struct bio *bio_src) @@ -287,8 +236,8 @@ err_put: return NULL; } -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -298,6 +247,20 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bool bounce = false; int sectors = 0; + /* + * Data-less bio, nothing to bounce + */ + if (!bio_has_data(*bio_orig)) + return; + + /* + * Just check if the bounce pfn is equal to or bigger than the highest + * pfn in the system -- in that case, don't waste time iterating over + * bio segments + */ + if (q->limits.bounce_pfn >= blk_max_pfn) + return; + bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; @@ -327,7 +290,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (page_to_pfn(page) <= q->limits.bounce_pfn) continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { @@ -346,46 +309,11 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio->bi_flags |= (1 << BIO_BOUNCED); - if (pool == &page_pool) { + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + else bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } bio->bi_private = *bio_orig; *bio_orig = bio; } - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->limits.bounce_pfn >= 
blk_max_pfn) - return; - pool = &page_pool; - } else { - BUG_ON(!mempool_initialized(&isa_page_pool)); - pool = &isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 6599bac0a78c..1048b0925567 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -431,7 +431,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, bytes = max(in_len, out_len); if (bytes) { - buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); + buffer = kzalloc(bytes, GFP_NOIO | GFP_USER | __GFP_NOWARN); if (!buffer) return -ENOMEM; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 48b8934970f3..fd8b6febbf70 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1043,8 +1043,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) blk_queue_max_segments(q, queue_max_segments(q) - 1); sdev->dma_drain_len = ATAPI_MAX_DRAIN; - sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, - q->bounce_gfp | GFP_KERNEL); + sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, GFP_NOIO); if (!sdev->dma_drain_buf) { ata_dev_err(dev, "drain buffer allocation failed\n"); return -ENOMEM; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc6bc8383b43..0dbb72ea3735 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -436,11 +436,6 @@ struct request_queue { */ int id; - /* - * queue needs bounce pages for pages above this limit - */ - gfp_t bounce_gfp; - spinlock_t queue_lock; /* @@ -847,7 +842,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; * * BLK_BOUNCE_HIGH : bounce all highmem pages * BLK_BOUNCE_ANY : don't bounce anything - * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary */ #if BITS_PER_LONG == 32 @@ -856,7 +850,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; #define BLK_BOUNCE_HIGH -1ULL #endif #define BLK_BOUNCE_ANY (-1ULL) -#define BLK_BOUNCE_ISA (DMA_BIT_MASK(24)) /* * default timeout for SG_IO if none specified diff --git a/mm/Kconfig b/mm/Kconfig index 24c045b24b95..d0808a23e54b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -283,12 +283,11 @@ config PHYS_ADDR_T_64BIT config BOUNCE bool "Enable bounce buffers" default y - depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + depends on BLOCK && MMU && HIGHMEM help - Enable bounce buffers for devices that cannot access - the full range of memory available to the CPU. Enabled - by default when ZONE_DMA or HIGHMEM is selected, but you - may say n to override this. + Enable bounce buffers for devices that cannot access the full range of + memory available to the CPU. Enabled by default when HIGHMEM is + selected, but you may say n to override this. config VIRT_TO_BUS bool -- cgit v1.2.3-58-ga151 From 9bb33f24abbd0fa2fadad01ec75438d7cc239189 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 Mar 2021 09:30:00 +0200 Subject: block: refactor the bounce buffering code Get rid of all the PFN arithmetics and just use an enum for the two remaining options, and use PageHighMem for the actual bounce decision. Add a fast path to entirely avoid the call for the common case of a queue not using the legacy bouncing code. Signed-off-by: Christoph Hellwig Acked-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210331073001.46776-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++---- block/blk-settings.c | 42 ++++++++---------------------------------- block/blk.h | 16 ++++++++++++---- block/bounce.c | 35 +++++------------------------------ include/linux/blkdev.h | 29 +++++++++++------------------ 5 files changed, 38 insertions(+), 90 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fc60ff208497..9bcdae93f6d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1161,10 +1161,8 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, } /* - * queue's settings related to segment counting like q->bounce_pfn - * may differ from that of other stacking queues. - * Recalculate it to check the request correctly on this queue's - * limitation. + * The queue settings related to segment counting may differ from the + * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { diff --git a/block/blk-settings.c b/block/blk-settings.c index f9937dd2810e..9c009090c4b5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,7 +7,6 @@ #include #include #include -#include /* for max_pfn/max_low_pfn */ #include #include #include @@ -17,11 +16,6 @@ #include "blk.h" #include "blk-wbt.h" -unsigned long blk_max_low_pfn; -EXPORT_SYMBOL(blk_max_low_pfn); - -unsigned long blk_max_pfn; - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; @@ -55,7 +49,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); + lim->bounce = BLK_BOUNCE_NONE; lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; @@ -92,28 +86,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @max_addr: the maximum address the device can handle + * @bounce: bounce limit to enforce * * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @max_addr. + * Force bouncing for ISA DMA ranges or highmem. + * + * DEPRECATED, don't use in new code. **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) +void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) { - unsigned long b_pfn = max_addr >> PAGE_SHIFT; - -#if BITS_PER_LONG == 64 - /* - * Assume anything <= 4GB can be handled by IOMMU. Actually - * some IOMMUs can handle everything, but I don't know of a - * way to test this here. 
- */ - q->limits.bounce_pfn = max(max_low_pfn, b_pfn); -#else - q->limits.bounce_pfn = b_pfn; -#endif + q->limits.bounce = bounce; } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -536,7 +518,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, b->max_zone_append_sectors); - t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); + t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -916,11 +898,3 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); - -static int __init blk_settings_init(void) -{ - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - return 0; -} -subsys_initcall(blk_settings_init); diff --git a/block/blk.h b/block/blk.h index 895c9f4a5182..8f4337c5a9e6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -6,6 +6,7 @@ #include #include #include +#include /* for max_pfn/max_low_pfn */ #include #include "blk-crypto-internal.h" #include "blk-mq.h" @@ -311,13 +312,20 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif -#ifdef CONFIG_BOUNCE -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else +void __blk_queue_bounce(struct request_queue *q, struct bio **bio); + +static inline bool blk_queue_may_bounce(struct request_queue *q) +{ + return IS_ENABLED(CONFIG_BOUNCE) && + q->limits.bounce == BLK_BOUNCE_HIGH && + max_low_pfn >= max_pfn; +} + static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { + if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio))) + __blk_queue_bounce(q, bio); } -#endif /* CONFIG_BOUNCE */ #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); diff --git a/block/bounce.c b/block/bounce.c index debd5b0bd318..6bafc0d1f867 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -49,11 +48,11 @@ static void init_bounce_bioset(void) bounce_bs_setup = true; } -#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { int ret; -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + +#ifndef CONFIG_MEMORY_HOTPLUG if (max_pfn <= max_low_pfn) return 0; #endif @@ -67,9 +66,7 @@ static __init int init_emergency_pool(void) } __initcall(init_emergency_pool); -#endif -#ifdef CONFIG_HIGHMEM /* * highmem version, map in to vec */ @@ -82,13 +79,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) kunmap_atomic(vto); } -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. 
kmap it @@ -236,8 +226,7 @@ err_put: return NULL; } - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -247,24 +236,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) bool bounce = false; int sectors = 0; - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - /* - * Just check if the bounce pfn is equal to or bigger than the highest - * pfn in the system -- in that case, don't waste time iterating over - * bio segments - */ - if (q->limits.bounce_pfn >= blk_max_pfn) - return; - bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; - if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) @@ -287,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= q->limits.bounce_pfn) + if (!PageHighMem(page)) continue; to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0dbb72ea3735..55cc8b96c844 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -313,8 +313,17 @@ enum blk_zoned_model { BLK_ZONED_HM, /* Host-managed zoned block device */ }; +/* + * BLK_BOUNCE_NONE: never bounce (default) + * BLK_BOUNCE_HIGH: bounce all highmem pages + */ +enum blk_bounce { + BLK_BOUNCE_NONE, + BLK_BOUNCE_HIGH, +}; + struct queue_limits { - unsigned long bounce_pfn; + enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; @@ -835,22 +844,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) return q->nr_requests; } -extern unsigned long blk_max_low_pfn, blk_max_pfn; - -/* - * standard bounce addresses: - * - * BLK_BOUNCE_HIGH : bounce all highmem pages - * BLK_BOUNCE_ANY : don't bounce anything - */ - -#if BITS_PER_LONG == 32 -#define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) -#else -#define BLK_BOUNCE_HIGH -1ULL -#endif -#define BLK_BOUNCE_ANY (-1ULL) - /* * default timeout for SG_IO if none specified */ @@ -1134,7 +1127,7 @@ extern void blk_abort_request(struct request *); * Access functions for manipulating queue properties */ extern void blk_cleanup_queue(struct request_queue *); -extern void blk_queue_bounce_limit(struct request_queue *, u64); +void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); -- cgit v1.2.3-58-ga151 From 393bb12e00580aaa23356504eed38d8f5571153a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 Mar 2021 09:30:01 +0200 Subject: block: stop calling blk_queue_bounce for passthrough requests Instead of overloading the passthrough fast path with the deprecated block layer bounce buffering let the users that combine an old undermaintained driver with a highmem system pay the price by always falling back to copies in that case. Signed-off-by: Christoph Hellwig Acked-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210331073001.46776-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 116 ++++++++++--------------------------- block/bounce.c | 11 +--- drivers/nvme/host/lightnvm.c | 2 +- drivers/target/target_core_pscsi.c | 4 +- include/linux/blkdev.h | 2 +- 5 files changed, 36 insertions(+), 99 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index b62b52dcb61d..dac78376acc8 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -123,7 +123,6 @@ static int bio_uncopy_user(struct bio *bio) bio_free_pages(bio); } kfree(bmd); - bio_put(bio); return ret; } @@ -132,7 +131,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, { struct bio_map_data *bmd; struct page *page; - struct bio *bio, *bounce_bio; + struct bio *bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; @@ -218,16 +217,9 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio->bi_private = bmd; - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) goto cleanup; - - /* - * We link the bounce buffer in and could have to traverse it later, so - * we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); return 0; cleanup: if (!map_data) @@ -242,7 +234,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); - struct bio *bio, *bounce_bio; + struct bio *bio; int ret; int j; @@ -304,49 +296,17 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - /* - * Subtle: if we end up needing to bounce a bio, it would normally - * disappear when its bi_end_io is run. However, we need the original - * bio for the unmap, so grab an extra reference to it - */ - bio_get(bio); - - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_put_orig; - - /* - * We link the bounce buffer in and could have to traverse it - * later, so we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); + goto out_unmap; return 0; - out_put_orig: - bio_put(bio); out_unmap: bio_release_pages(bio, false); bio_put(bio); return ret; } -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -static void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - static void bio_invalidate_vmalloc_pages(struct bio *bio) { #ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE @@ -519,33 +479,27 @@ cleanup: * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. 
*/ -int blk_rq_append_bio(struct request *rq, struct bio **bio) +int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bio *orig_bio = *bio; struct bvec_iter iter; struct bio_vec bv; unsigned int nr_segs = 0; - blk_queue_bounce(rq->q, bio); + if (WARN_ON_ONCE(rq->q->limits.bounce != BLK_BOUNCE_NONE)) + return -EINVAL; - bio_for_each_bvec(bv, *bio, iter) + bio_for_each_bvec(bv, bio, iter) nr_segs++; if (!rq->bio) { - blk_rq_bio_prep(rq, *bio, nr_segs); + blk_rq_bio_prep(rq, bio, nr_segs); } else { - if (!ll_back_merge_fn(rq, *bio, nr_segs)) { - if (orig_bio != *bio) { - bio_put(*bio); - *bio = orig_bio; - } + if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; - } - - rq->biotail->bi_next = *bio; - rq->biotail = *bio; - rq->__data_len += (*bio)->bi_iter.bi_size; - bio_crypt_free_ctx(*bio); + rq->biotail->bi_next = bio; + rq->biotail = bio; + rq->__data_len += (bio)->bi_iter.bi_size; + bio_crypt_free_ctx(bio); } return 0; @@ -566,12 +520,6 @@ EXPORT_SYMBOL(blk_rq_append_bio); * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, @@ -588,6 +536,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (map_data) copy = true; + else if (blk_queue_may_bounce(q)) + copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (queue_virt_boundary(q)) @@ -641,25 +591,21 @@ EXPORT_SYMBOL(blk_rq_map_user); */ int blk_rq_unmap_user(struct bio *bio) { - struct bio *mapped_bio; + struct bio *next_bio; int ret = 0, ret2; while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - if (bio->bi_private) { - ret2 = bio_uncopy_user(mapped_bio); + ret2 = bio_uncopy_user(bio); if (ret2 && !ret) ret = ret2; } else { - bio_unmap_user(mapped_bio); + bio_release_pages(bio, bio_data_dir(bio) == READ); } - mapped_bio = bio; + next_bio = bio; bio = bio->bi_next; - bio_put(mapped_bio); + bio_put(next_bio); } return ret; @@ -684,7 +630,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - struct bio *bio, *orig_bio; + struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -692,7 +638,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || + blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -703,14 +650,9 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - orig_bio = bio; - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - /* request is too big */ - bio_put(orig_bio); - return ret; - } - - return 0; + ret = blk_rq_append_bio(rq, bio); + if (unlikely(ret)) + bio_put(bio); + return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/bounce.c b/block/bounce.c 
index 6bafc0d1f867..94081e013c58 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -180,12 +180,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * asking for trouble and would force extra work on * __bio_clone_fast() anyways. */ - if (bio_is_passthrough(bio_src)) - bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, - bio_segments(bio_src)); - else - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), - &bounce_bio_set); + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), + &bounce_bio_set); bio->bi_bdev = bio_src->bi_bdev; if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); @@ -245,8 +241,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bounce) return; - if (!bio_is_passthrough(*bio_orig) && - sectors < bio_sectors(*bio_orig)) { + if (sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); submit_bio_noacct(*bio_orig); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b705988629f2..f6ca2fbb711e 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -660,7 +660,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (rqd->bio) - blk_rq_append_bio(rq, &rqd->bio); + blk_rq_append_bio(rq, rqd->bio); else rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 3cbc074992bc..7df4a9c9c7ff 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -911,7 +911,7 @@ new_bio: " %d i: %d bio: %p, allocating another" " bio\n", bio->bi_vcnt, i, bio); - rc = blk_rq_append_bio(req, &bio); + rc = blk_rq_append_bio(req, bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; @@ -930,7 +930,7 @@ new_bio: } if (bio) { - rc = blk_rq_append_bio(req, &bio); + rc = blk_rq_append_bio(req, bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 55cc8b96c844..d5d320da51f8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -909,7 +909,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern int blk_rq_append_bio(struct request *rq, struct bio **bio); +int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, -- cgit v1.2.3-58-ga151 From 580dca8143d215977811bd2ff881e1e4f6ff39f0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 6 Apr 2021 11:19:33 +0800 Subject: blk-mq: set default elevator as deadline in case of hctx shared tagset Yanhui found that write performance is degraded a lot after applying hctx shared tagset on one test machine with megaraid_sas. And turns out it is caused by none scheduler which becomes default elevator caused by hctx shared tagset patchset. Given more scsi HBAs will apply hctx shared tagset, and the similar performance exists for them too. So keep previous behavior by still using default mq-deadline for queues which apply hctx shared tagset, just like before. 
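The resulting policy is simple: a queue with a single hw queue, or one whose hw queues share a sbitmap, gets mq-deadline by default, while everything else stays on "none". A rough userspace sketch of that decision table (illustrative only; plain parameters stand in for the request_queue fields the patch actually checks):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative only: mirrors the decision elevator_get_default() makes after
 * this patch, with plain parameters instead of a struct request_queue.
 */
static const char *default_elevator(unsigned int nr_hw_queues, bool shared_sbitmap)
{
        if (nr_hw_queues != 1 && !shared_sbitmap)
                return "none";          /* keep no I/O scheduler */
        return "mq-deadline";
}

int main(void)
{
        printf("1 hw queue           -> %s\n", default_elevator(1, false));
        printf("8 hw queues          -> %s\n", default_elevator(8, false));
        printf("8 hw queues, shared  -> %s\n", default_elevator(8, true));
        return 0;
}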
Fixes: 32bc15afed04 ("blk-mq: Facilitate a shared sbitmap per tagset") Reported-by: Yanhui Ma Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Reviewed-by: John Garry Link: https://lore.kernel.org/r/20210406031933.767228-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/elevator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 293c5c81397a..440699c28119 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -621,7 +621,8 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { - if (q->nr_hw_queues != 1) + if (q->nr_hw_queues != 1 && + !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; return elevator_get(q, "mq-deadline", false); -- cgit v1.2.3-58-ga151 From 540ad3f3da2542ec99235ac55e7cba8b11ce4b7b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 6 Apr 2021 13:08:20 -0700 Subject: blk-zoned: Remove the definition of blk_zone_start() Commit e76239a3748c ("block: add a report_zones method") removed the last blk_zone_start() call. Hence also remove the definition of this function. Cc: Christoph Hellwig Cc: Damien Le Moal Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406200820.15180-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index c0276b42d9fb..250cb76ee615 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -52,14 +52,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -static inline sector_t blk_zone_start(struct request_queue *q, - sector_t sector) -{ - sector_t zone_mask = blk_queue_zone_sectors(q) - 1; - - return sector & ~zone_mask; -} - /* * Return true if a request is a write requests that needs zone write locking. */ -- cgit v1.2.3-58-ga151 From b896fa85e0ee4f09ba4be48a3f405fc82c38afb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:52 +0200 Subject: dasd: use bdev_disk_changed instead of blk_drop_partitions Use the more general interface - the behavior is the same except that now a change uevent is sent, which is the right thing to do when the device becomes unusable. 
Signed-off-by: Christoph Hellwig Acked-by: Stefan Haberland Link: https://lore.kernel.org/r/20210406062303.811835-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 4 ---- drivers/s390/block/dasd_genhd.c | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 1a7558917c47..22a0dab17ed3 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -544,10 +544,6 @@ int blk_drop_partitions(struct block_device *bdev) return 0; } -#ifdef CONFIG_S390 -/* for historic reasons in the DASD driver */ -EXPORT_SYMBOL_GPL(blk_drop_partitions); -#endif static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, struct parsed_partitions *state, int p) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index a9698fba9b76..8d6587ec73e2 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -146,12 +146,11 @@ void dasd_destroy_partitions(struct dasd_block *block) block->bdev = NULL; mutex_lock(&bdev->bd_mutex); - blk_drop_partitions(bdev); + bdev_disk_changed(bdev, true); mutex_unlock(&bdev->bd_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ blkdev_put(bdev, FMODE_READ); - set_capacity(block->gdp, 0); } int dasd_gendisk_init(void) -- cgit v1.2.3-58-ga151 From 45611837bb37e4544ca048e33d833483b06e3b03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:53 +0200 Subject: block: remove invalidate_partition invalidate_partition has two callers, one of which already performs the remove_inode_hash just after the call. Just open code the function in the two callsites. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8c8f543572e6..9b121b1f7998 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -646,18 +646,6 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); -static void invalidate_partition(struct block_device *bdev) -{ - fsync_bdev(bdev); - __invalidate_device(bdev, true); - - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(bdev->bd_inode); -} - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove @@ -699,12 +687,21 @@ void del_gendisk(struct gendisk *disk) /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part); + fsync_bdev(part); + __invalidate_device(part, true); delete_partition(part); } disk_part_iter_exit(&piter); - invalidate_partition(disk->part0); + fsync_bdev(disk->part0); + __invalidate_device(disk->part0, true); + + /* + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. 
+ */ + remove_inode_hash(disk->part0->bd_inode); + set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&bdev_lookup_sem); -- cgit v1.2.3-58-ga151 From 473338be3aaea117a7133720305f240eb7f68951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:54 +0200 Subject: block: move more syncing and invalidation to delete_partition Move the calls to fsync_bdev and __invalidate_device from del_gendisk to delete_partition. For the other two callers that check that there are no openers for the delete partitions(s) the callouts are a no-op as no file system can be mounted, but this keeps all the cleanup in one place. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 5 +---- block/partitions/core.c | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9b121b1f7998..15f99da4543f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -686,11 +686,8 @@ void del_gendisk(struct gendisk *disk) /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - fsync_bdev(part); - __invalidate_device(part, true); + while ((part = disk_part_iter_next(&piter))) delete_partition(part); - } disk_part_iter_exit(&piter); fsync_bdev(disk->part0); diff --git a/block/partitions/core.c b/block/partitions/core.c index 22a0dab17ed3..8c1735292940 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -287,6 +287,9 @@ struct device_type part_type = { */ void delete_partition(struct block_device *part) { + fsync_bdev(part); + __invalidate_device(part, true); + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -468,9 +471,6 @@ int bdev_del_partition(struct block_device *bdev, int partno) if (part->bd_openers) goto out_unlock; - sync_blockdev(part); - invalidate_bdev(part); - delete_partition(part); ret = 0; out_unlock: -- cgit v1.2.3-58-ga151 From d3c4a43d9291279c28b26757351a6ab72c110753 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:55 +0200 Subject: block: refactor blk_drop_partitions Move the busy check and disk-wide sync into the only caller, so that the remainder can be shared with del_gendisk. Also pass the gendisk instead of the bdev as that is all that is needed. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 11 +---------- block/partitions/core.c | 14 +++----------- fs/block_dev.c | 8 +++++--- include/linux/genhd.h | 2 +- 5 files changed, 10 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 8f4337c5a9e6..8b3591aee0a5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -349,7 +349,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 15f99da4543f..8303ec67fd70 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -667,9 +667,6 @@ EXPORT_SYMBOL(device_add_disk_no_queue_reg); */ void del_gendisk(struct gendisk *disk) { - struct disk_part_iter piter; - struct block_device *part; - might_sleep(); if (WARN_ON_ONCE(!disk->queue)) @@ -683,13 +680,7 @@ void del_gendisk(struct gendisk *disk) * disk is marked as dead (GENHD_FL_UP cleared). */ down_write(&bdev_lookup_sem); - - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - delete_partition(part); - disk_part_iter_exit(&piter); - + blk_drop_partitions(disk); fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); diff --git a/block/partitions/core.c b/block/partitions/core.c index 8c1735292940..536f7c5bb0b6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -285,7 +285,7 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -void delete_partition(struct block_device *part) +static void delete_partition(struct block_device *part) { fsync_bdev(part); __invalidate_device(part, true); @@ -526,23 +526,15 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct block_device *bdev) +void blk_drop_partitions(struct gendisk *disk) { struct disk_part_iter piter; struct block_device *part; - if (bdev->bd_part_count) - return -EBUSY; - - sync_blockdev(bdev); - invalidate_bdev(bdev); - - disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) delete_partition(part); disk_part_iter_exit(&piter); - - return 0; } static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, diff --git a/fs/block_dev.c b/fs/block_dev.c index 92ed7d5df677..594a1bee9dd9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1243,9 +1243,11 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); rescan: - ret = blk_drop_partitions(bdev); - if (ret) - return ret; + if (bdev->bd_part_count) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); /* * Historically we only set the capacity to zero for devices that diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f364619092cc..16178a935c40 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -273,7 +273,7 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); -int blk_drop_partitions(struct block_device *bdev); +void blk_drop_partitions(struct gendisk *disk); extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern void put_disk(struct gendisk *disk); -- cgit v1.2.3-58-ga151 From c76f48eb5c084b1e15c931ae8cc1826cd771d70d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:56 +0200 Subject: block: take bd_mutex around delete_partitions in del_gendisk There is nothing preventing an ioctl from trying do delete partition concurrenly with del_gendisk, so take open_mutex to serialize against that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++++ block/partitions/core.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8303ec67fd70..e3f3c2321773 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -680,7 +680,11 @@ void del_gendisk(struct gendisk *disk) * disk is marked as dead (GENHD_FL_UP cleared). 
*/ down_write(&bdev_lookup_sem); + + mutex_lock(&disk->part0->bd_mutex); blk_drop_partitions(disk); + mutex_unlock(&disk->part0->bd_mutex); + fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); diff --git a/block/partitions/core.c b/block/partitions/core.c index 536f7c5bb0b6..9fbaec466b51 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -531,6 +531,8 @@ void blk_drop_partitions(struct gendisk *disk) struct disk_part_iter piter; struct block_device *part; + lockdep_assert_held(&disk->part0->bd_mutex); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) delete_partition(part); -- cgit v1.2.3-58-ga151 From 6c4541a8bb94a1cccca55ee53c866eb72bf279cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:57 +0200 Subject: block: simplify partition removal Always look up the first available entry instead of the complicated stateful traversal. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-7-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 9fbaec466b51..927144d4e59d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -528,15 +528,17 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) void blk_drop_partitions(struct gendisk *disk) { - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; lockdep_assert_held(&disk->part0->bd_mutex); - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (!bdgrab(part)) + continue; delete_partition(part); - disk_part_iter_exit(&piter); + bdput(part); + } } static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, -- cgit v1.2.3-58-ga151 From e30691237bc1e055c55b0fe256ed7fc1a4ee1122 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:58 +0200 Subject: block: simplify partition_overlaps Just use xa_for_each to iterate over the partitions as there is no need to grab a reference to each partition. 
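The interval test itself is unchanged: a proposed partition [start, start + length) overlaps an existing one iff each range begins before the other ends. A small userspace sketch of the same predicate over made-up partition records rather than the xarray walked here:

#include <stdbool.h>
#include <stdio.h>

/* Made-up stand-in for the partition entries iterated with xa_for_each(). */
struct part {
        int partno;
        unsigned long long start;       /* first sector */
        unsigned long long len;         /* number of sectors */
};

/* Same test as partition_overlaps(): two half-open sector ranges intersect. */
static bool ranges_overlap(const struct part *p, unsigned long long start,
                           unsigned long long length, int skip_partno)
{
        return p->partno != skip_partno &&
               start < p->start + p->len &&
               start + length > p->start;
}

int main(void)
{
        const struct part table[] = {
                { .partno = 1, .start = 2048,   .len = 204800 },
                { .partno = 2, .start = 206848, .len = 409600 },
        };
        unsigned long long start = 100000, length = 150000;
        bool overlap = false;

        for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                if (ranges_overlap(&table[i], start, length, 0)) {
                        overlap = true;
                        break;
                }
        }
        printf("proposed partition %s\n", overlap ? "overlaps" : "fits");
        return 0;
}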
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-8-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 927144d4e59d..0f8454b93c6e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -420,21 +420,21 @@ out_put: static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { - struct disk_part_iter piter; struct block_device *part; bool overlap = false; + unsigned long idx; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - if (part->bd_partno == skip_partno || - start >= part->bd_start_sect + bdev_nr_sectors(part) || - start + length <= part->bd_start_sect) - continue; - overlap = true; - break; + rcu_read_lock(); + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (part->bd_partno != skip_partno && + start < part->bd_start_sect + bdev_nr_sectors(part) && + start + length > part->bd_start_sect) { + overlap = true; + break; + } } + rcu_read_unlock(); - disk_part_iter_exit(&piter); return overlap; } -- cgit v1.2.3-58-ga151 From e559f58d20dbcc0b541552ac6b5cbab29c964d6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:22:59 +0200 Subject: block: simplify printk_all_partitions Just use xa_for_each to iterate over the partitions as there is no need to grab a reference to each partition. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e3f3c2321773..409ff4710f92 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -806,10 +806,10 @@ void __init printk_all_partitions(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + unsigned long idx; /* * Don't show empty devices or things that have been @@ -820,30 +820,29 @@ void __init printk_all_partitions(void) continue; /* - * Note, unlike /proc/partitions, I am showing the - * numbers in hex - the same format as the root= - * option takes. + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0; - - printk("%s%s %10llu %s %s", is_part0 ? "" : " ", + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %s %s", + bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), bdev_nr_sectors(part) >> 1, disk_name(disk, part->bd_partno, name_buf), part->bd_meta_info ? 
part->bd_meta_info->uuid : ""); - if (is_part0) { - if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } else + if (bdev_is_partition(part)) printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); } class_dev_iter_exit(&iter); } -- cgit v1.2.3-58-ga151 From ecc75a98b89917f18f295e154cd0bf056481c48f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:23:00 +0200 Subject: block: simplify show_partition Just use xa_for_each to iterate over the partitions as there is no need to grab a reference to each partition. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 409ff4710f92..5726714ef82f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -907,8 +907,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -918,15 +918,16 @@ static int show_partition(struct seq_file *seqf, void *v) if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; - /* show the full disk and all non-0 size partitions of it */ - disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) + rcu_read_lock(); + xa_for_each(&sgp->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, disk_name(sgp, part->bd_partno, buf)); - disk_part_iter_exit(&piter); - + } + rcu_read_unlock(); return 0; } -- cgit v1.2.3-58-ga151 From 7fae67cc9c0e0645e8c82156ba3a8bb7bae995bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:23:01 +0200 Subject: block: simplify diskstats_show Just use xa_for_each to iterate over the partitions as there is no need to grab a reference to each partition. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-11-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5726714ef82f..cbfe1ff19360 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1236,11 +1236,11 @@ const struct device_type disk_type = { static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; - struct disk_part_iter piter; struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; + unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1250,8 +1250,10 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); - while ((hd = disk_part_iter_next(&piter))) { + rcu_read_lock(); + xa_for_each(&gp->part_tbl, idx, hd) { + if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) + continue; part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); @@ -1294,7 +1296,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC) ); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); return 0; } -- cgit v1.2.3-58-ga151 From 3212135a718b06be38811f2d9a320ae842e76409 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:23:02 +0200 Subject: block: remove disk_part_iter Just open code the xa_for_each in the remaining user. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406062303.811835-12-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 92 ++++++++------------------------------------------- include/linux/genhd.h | 19 ----------- 2 files changed, 13 insertions(+), 98 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index cbfe1ff19360..39ca97b0edc6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -161,81 +161,6 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -/** - * disk_part_iter_init - initialize partition iterator - * @piter: iterator to initialize - * @disk: disk to iterate over - * @flags: DISK_PITER_* flags - * - * Initialize @piter so that it iterates over partitions of @disk. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, - unsigned int flags) -{ - piter->disk = disk; - piter->part = NULL; - if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) - piter->idx = 0; - else - piter->idx = 1; - piter->flags = flags; -} - -/** - * disk_part_iter_next - proceed iterator to the next partition and return it - * @piter: iterator of interest - * - * Proceed @piter to the next partition and return it. - * - * CONTEXT: - * Don't care. 
- */ -struct block_device *disk_part_iter_next(struct disk_part_iter *piter) -{ - struct block_device *part; - unsigned long idx; - - /* put the last partition */ - disk_part_iter_exit(piter); - - rcu_read_lock(); - xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) { - if (!bdev_nr_sectors(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) - continue; - - piter->part = bdgrab(part); - if (!piter->part) - continue; - piter->idx = idx + 1; - break; - } - rcu_read_unlock(); - - return piter->part; -} - -/** - * disk_part_iter_exit - finish up partition iteration - * @piter: iter of interest - * - * Called when iteration is over. Cleans up @piter. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_exit(struct disk_part_iter *piter) -{ - if (piter->part) - bdput(piter->part); - piter->part = NULL; -} - /* * Can be deleted altogether. Later. * @@ -472,13 +397,22 @@ static char *bdevt_str(dev_t devt, char *buf) void disk_uevent(struct gendisk *disk, enum kobject_action action) { - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part) && !bdev_nr_sectors(part)) + continue; + if (!bdgrab(part)) + continue; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY_PART0); - while ((part = disk_part_iter_next(&piter))) + rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); - disk_part_iter_exit(&piter); + bdput(part); + rcu_read_lock(); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 16178a935c40..7e9660ea967d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -204,25 +204,6 @@ static inline dev_t disk_devt(struct gendisk *disk) void disk_uevent(struct gendisk *disk, enum kobject_action action); -/* - * Smarter partition iterator without context limits. - */ -#define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ -#define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ -#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ - -struct disk_part_iter { - struct gendisk *disk; - struct block_device *part; - unsigned long idx; - unsigned int flags; -}; - -extern void disk_part_iter_init(struct disk_part_iter *piter, - struct gendisk *disk, unsigned int flags); -struct block_device *disk_part_iter_next(struct disk_part_iter *piter); -extern void disk_part_iter_exit(struct disk_part_iter *piter); - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); -- cgit v1.2.3-58-ga151 From 40c7fd3fdfba97a18724a0b96d132335fd419e98 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Apr 2021 11:46:12 +0200 Subject: block: Fix sys_ioprio_set(.which=IOPRIO_WHO_PGRP) task iteration do_each_pid_thread() { } while_each_pid_thread() is a double loop and thus break doesn't work as expected. Also, it should be used under tasklist_lock because otherwise we can race against change_pid() for PGID/SID. 
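The pitfall is generic to macro pairs that expand to nested loops: a plain break only leaves the innermost loop, so the outer walk quietly continues. A minimal userspace sketch, with hypothetical do_each_thing()/while_each_thing() macros standing in for do_each_pid_thread()/while_each_pid_thread(), shows why the fix below switches to goto:

#include <stdio.h>

/*
 * Hypothetical stand-ins for do_each_pid_thread()/while_each_pid_thread():
 * the pair expands to TWO nested loops, so a plain break in the body only
 * exits the inner one.
 */
#define do_each_thing(g, i) \
        for ((g) = 0; (g) < 2; (g)++) { \
                for ((i) = 0; (i) < 2; (i)++)

#define while_each_thing() }

int main(void)
{
        int g, i;

        printf("with break:\n");
        do_each_thing(g, i) {
                if (g == 0 && i == 0) {
                        printf("  stop requested at %d.%d\n", g, i);
                        break;          /* only leaves the inner loop */
                }
                printf("  visited %d.%d\n", g, i);
        } while_each_thing();

        printf("with goto:\n");
        do_each_thing(g, i) {
                if (g == 0 && i == 0) {
                        printf("  stop requested at %d.%d\n", g, i);
                        goto out;       /* leaves both loops, as the fix does */
                }
                printf("  visited %d.%d\n", g, i);
        } while_each_thing();
out:
        return 0;
}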
Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/YG7Q5C4Rb5dx5GFx@hirez.programming.kicks-ass.net Signed-off-by: Jens Axboe --- block/ioprio.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioprio.c b/block/ioprio.c index 364d2294ba90..bee628f9f1b2 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -119,11 +119,17 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); - if (ret) - break; + if (ret) { + read_unlock(&tasklist_lock); + goto out; + } } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); @@ -153,6 +159,7 @@ free_uid: ret = -EINVAL; } +out: rcu_read_unlock(); return ret; } -- cgit v1.2.3-58-ga151 From cbb749cf377aa8aa32a036ebe9dd9f2d89037bf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Apr 2021 17:04:46 +0200 Subject: block: remove an incorrect check from blk_rq_append_bio blk_rq_append_bio is also used for the copy case, not just the map case, so this debug check is not correct. Fixes: 393bb12e0058 ("block: stop calling blk_queue_bounce for passthrough requests") Reported-by: Guenter Roeck Signed-off-by: Christoph Hellwig Tested-by: Guenter Roeck Reviewed-by: Himanshu Madhani Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210409150447.1977410-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index dac78376acc8..3743158ddaeb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -485,9 +485,6 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) struct bio_vec bv; unsigned int nr_segs = 0; - if (WARN_ON_ONCE(rq->q->limits.bounce != BLK_BOUNCE_NONE)) - return -EINVAL; - bio_for_each_bvec(bv, bio, iter) nr_segs++; -- cgit v1.2.3-58-ga151 From 6f822e1b5d9dda3d20e87365de138046e3baa03a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 15:46:57 +0200 Subject: block: remove zero_fill_bio_iter zero_fill_bio_iter is only used to implement zero_fill_bio, so remove the indirection. 
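Conceptually zero_fill_bio() just walks the bio's segments and clears each one. A userspace analogue over a made-up segment vector (illustrative only; it skips the kmap and cache-flushing details the kernel version needs):

#include <stdio.h>
#include <string.h>

/* Made-up stand-in for a bio's segment vector. */
struct seg {
        unsigned char *buf;
        size_t len;
};

/* What zero_fill_bio() does conceptually: clear every segment. */
static void zero_fill_segs(struct seg *segs, size_t nr)
{
        for (size_t i = 0; i < nr; i++)
                memset(segs[i].buf, 0, segs[i].len);
}

int main(void)
{
        unsigned char a[4] = { 1, 2, 3, 4 };
        unsigned char b[2] = { 5, 6 };
        struct seg segs[] = { { a, sizeof(a) }, { b, sizeof(b) } };

        zero_fill_segs(segs, 2);
        printf("%d %d %d %d | %d %d\n", a[0], a[1], a[2], a[3], b[0], b[1]);
        return 0;
}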
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210412134658.2623190-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- include/linux/bio.h | 7 +------ 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 26b7f721cda8..0fecb80872c2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -493,20 +493,20 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +void zero_fill_bio(struct bio *bio) { unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - __bio_for_each_segment(bv, bio, iter, start) { + bio_for_each_segment(bv, bio, iter) { char *data = bvec_kmap_irq(&bv, &flags); memset(data, 0, bv.bv_len); flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } -EXPORT_SYMBOL(zero_fill_bio_iter); +EXPORT_SYMBOL(zero_fill_bio); /** * bio_truncate - truncate the bio to small size of @new_size diff --git a/include/linux/bio.h b/include/linux/bio.h index d0246c92a6e8..a8021d79d45d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -485,14 +485,9 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); -void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); - -static inline void zero_fill_bio(struct bio *bio) -{ - zero_fill_bio_iter(bio, bio->bi_iter); -} +void zero_fill_bio(struct bio *bio); extern const char *bio_devname(struct bio *bio, char *buffer); -- cgit v1.2.3-58-ga151 From 5f03414d4098b5718f1a5e99b43b9d9cb6f3612a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 15:46:58 +0200 Subject: block: move bio_list_copy_data to pktcdvd bio_list_copy_data is only used by pktcdvd, so move it there. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210412134658.2623190-2-hch@lst.de Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bio.c | 37 ------------------------------------- drivers/block/pktcdvd.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 1 - 3 files changed, 36 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 0fecb80872c2..303298996afe 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1221,43 +1221,6 @@ void bio_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_copy_data); -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). 
- */ -void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} -EXPORT_SYMBOL(bio_list_copy_data); - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index fc4b0f1aa86d..bd3556585122 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1199,6 +1199,42 @@ try_next_bio: return 1; } +/** + * bio_list_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * Stops when it reaches the end of either the @src list or @dst list - that is, + * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of + * bios). + */ +static void bio_list_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; + + while (1) { + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; + + src_iter = src->bi_iter; + } + + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; + + dst_iter = dst->bi_iter; + } + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } +} + /* * Assemble a bio to write one packet and queue the bio for processing * by the underlying block device. diff --git a/include/linux/bio.h b/include/linux/bio.h index a8021d79d45d..a0b4cfdf62a4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,7 +483,6 @@ extern void bio_check_pages_dirty(struct bio *bio); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); -extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); -- cgit v1.2.3-58-ga151 From 347b546d5a9bd5871279a29be3dce8b5aad47ef6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 12 Apr 2021 20:41:42 -0700 Subject: block: Remove an obsolete comment from sg_io() Commit b7819b925918 ("block: remove the blk_execute_rq return value") changed the return type of blk_execute_rq() from int into void. That change made a comment in sg_io() obsolete. Hence remove that comment. Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210413034142.23460-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 1048b0925567..1b3fe99b83a6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -353,10 +353,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, start_time = jiffies; - /* ignore return value. All information is passed back to caller - * (if he doesn't check that is his problem). - * N.B. a non-zero SCSI status is _not_ necessarily an error. 
- */ blk_execute_rq(bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); -- cgit v1.2.3-58-ga151 From 8d663f34f8afcf5fc6a84c3cc4fa28cc84d58e39 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 15 Apr 2021 11:39:20 +0800 Subject: blk-mq: bypass IO scheduler's limit_depth for passthrough request Commit 01e99aeca39796003 ("blk-mq: insert passthrough request into hctx->dispatch directly") gives high priority to passthrough requests and bypass underlying IO scheduler. But as we allocate tag for such request it still runs io-scheduler's callback limit_depth, while we really want is to give full sbitmap-depth capabity to such request for acquiring available tag. blktrace shows PC requests(dmraid -s -c -i) hit bfq's limit_depth: 8,0 2 0 0.000000000 39952 1,0 m N bfq [bfq_limit_depth] wr_busy 0 sync 0 depth 8 8,0 2 1 0.000008134 39952 D R 4 [dmraid] 8,0 2 2 0.000021538 24 C R [0] 8,0 2 0 0.000035442 39952 1,0 m N bfq [bfq_limit_depth] wr_busy 0 sync 0 depth 8 8,0 2 3 0.000038813 39952 D R 24 [dmraid] 8,0 2 4 0.000044356 24 C R [0] This patch introduce a new wrapper to make code not that ugly. Signed-off-by: Lin Feng Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210415033920.213963-1-linf@wangsu.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- include/linux/blkdev.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d4d7c1caa439..927189a55575 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -361,11 +361,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (e) { /* - * Flush requests are special and go directly to the + * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ if (!op_is_flush(data->cmd_flags) && + !blk_op_is_passthrough(data->cmd_flags) && e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.limit_depth(data->cmd_flags, data); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 30d2090583ad..f2e77ba97550 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -274,6 +274,12 @@ static inline bool bio_is_passthrough(struct bio *bio) return blk_op_is_scsi(op) || blk_op_is_private(op); } +static inline bool blk_op_is_passthrough(unsigned int op) +{ + return (blk_op_is_scsi(op & REQ_OP_MASK) || + blk_op_is_private(op & REQ_OP_MASK)); +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; -- cgit v1.2.3-58-ga151 From 7687b38ae470f01749e420079c36cccb24b8619a Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Thu, 15 Apr 2021 11:43:26 +0800 Subject: bfq/mq-deadline: remove redundant check for passthrough request Since commit 01e99aeca39796003 'blk-mq: insert passthrough request into hctx->dispatch directly', passthrough request should not appear in IO-scheduler any more, so blk_rq_is_passthrough checking in addon IO schedulers is redundant. (Notes: this patch passes generic IO load test with hdds under SAS controller and hdds under AHCI controller but obviously not covers all. Not sure if passthrough request can still escape into IO scheduler from blk_mq_sched_insert_requests, which is used by blk_mq_flush_plug_list and has lots of indirect callers.) 
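"Passthrough" here means the opcode stored in the low bits of cmd_flags is one of the SCSI/driver-private opcodes rather than a normal read or write, which is why the helper above masks the flag bits off before comparing. A userspace sketch with made-up constants (the real opcode numbers and flag layout live in include/linux/blk_types.h):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative values only.  The point is that the low bits hold the opcode
 * and the high bits hold request flags, so the flags must be masked off
 * before classifying the operation.
 */
#define OP_MASK         0xff            /* low byte: opcode */
#define OP_READ         0
#define OP_WRITE        1
#define OP_DRV_IN       34              /* driver-private passthrough */
#define OP_DRV_OUT      35
#define FLAG_SYNC       (1u << 11)      /* example flag bit */

static bool op_is_passthrough(unsigned int cmd_flags)
{
        unsigned int op = cmd_flags & OP_MASK;  /* drop the flag bits */

        return op == OP_DRV_IN || op == OP_DRV_OUT;
}

int main(void)
{
        printf("DRV_OUT | SYNC passthrough? %s\n",
               op_is_passthrough(OP_DRV_OUT | FLAG_SYNC) ? "yes" : "no");
        printf("WRITE   | SYNC passthrough? %s\n",
               op_is_passthrough(OP_WRITE | FLAG_SYNC) ? "yes" : "no");
        return 0;
}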
Signed-off-by: Lin Feng Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +-- block/mq-deadline.c | 7 ++----- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9b7678ad5830..0270cd7ca165 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5957,8 +5957,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bfqd->in_service_queue != NULL && bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && (bfqq->waker_bfqq == bfqd->in_service_queue || - bfqd->in_service_queue->waker_bfqq == bfqq)) || - at_head || blk_rq_is_passthrough(rq)) { + bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f3631a287466..04aded71ead2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -500,11 +500,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - if (at_head || blk_rq_is_passthrough(rq)) { - if (at_head) - list_add(&rq->queuelist, &dd->dispatch); - else - list_add_tail(&rq->queuelist, &dd->dispatch); + if (at_head) { + list_add(&rq->queuelist, &dd->dispatch); } else { deadline_add_rq_rb(dd, rq); -- cgit v1.2.3-58-ga151 From 1e91e28e374d0b0b912154c192716374609360d9 Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Wed, 7 Apr 2021 10:59:58 -0700 Subject: blk-mq: Fix spurious debugfs directory creation during initialization blk_mq_debugfs_register_sched_hctx() called from device_add_disk()->elevator_init_mq()->blk_mq_init_sched() initialization sequence does not have relevant parent directory setup and thus spuriously attempts "sched" directory creation from root mount of debugfs for every hw queue detected on the block device dmesg ... debugfs: Directory 'sched' with parent '/' already present! debugfs: Directory 'sched' with parent '/' already present! . . debugfs: Directory 'sched' with parent '/' already present! ... The parent debugfs directory for hw queues get properly setup device_add_disk()->blk_register_queue()->blk_mq_debugfs_register() ->blk_mq_debugfs_register_hctx() later in the block device initialization sequence. A simple check for debugfs_dir has been added to thwart premature debugfs directory/file creation attempts. Signed-off-by: Saravanan D Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebb344e2585..8ea96d83d599 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -973,6 +973,14 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + /* + * If the parent debugfs directory has not been created yet, return; + * We will be called again later on with appropriate parent debugfs + * directory from blk_register_queue() + */ + if (!hctx->debugfs_dir) + return; + if (!e->hctx_debugfs_attrs) return; -- cgit v1.2.3-58-ga151 From f46ec84b5acbf8d7067d71a6bbdde213d4b86036 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Apr 2021 21:54:28 -0400 Subject: blk-iocost: don't ignore vrate_min on QD contention ioc_adjust_base_vrate() ignored vrate_min when rq_wait_pct indicates that there is QD contention. The reasoning was that QD depletion always reliably indicates device saturation and thus it's safe to override user specified vrate_min. 
However, this sometimes leads to unnecessary throttling, especially on really fast devices, because vrate adjustments have delays and inertia. It also confuses users because the behavior violates the explicitly specified configuration. This patch drops the special case handling so that vrate_min is always applied. Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/YIIo1HuyNmhDeiNx@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 98d656bdb42b..e0c4baa01857 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -987,10 +987,6 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, return; } - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level -- cgit v1.2.3-58-ga151
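In short, the base vrate adjustment now always clamps into the user-configured [vrate_min, vrate_max] window instead of silently lowering the floor under queue-depth contention. A rough userspace sketch of the before/after clamping behaviour (made-up numbers, not the actual iocost math):

#include <stdio.h>

#define HARD_VRATE_MIN  10      /* made-up absolute floor */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

/* Old behaviour: queue-depth contention replaced the user floor with the hard one. */
static unsigned long apply_old(unsigned long v, unsigned long user_min,
                               unsigned long user_max, int qd_contended)
{
        unsigned long lo = qd_contended ? HARD_VRATE_MIN : user_min;

        return clamp_ul(v, lo, user_max);
}

/* New behaviour: the user-configured floor is always honoured. */
static unsigned long apply_new(unsigned long v, unsigned long user_min,
                               unsigned long user_max)
{
        return clamp_ul(v, user_min, user_max);
}

int main(void)
{
        unsigned long user_min = 60, user_max = 400, proposed = 25;

        printf("old, contended: vrate %lu\n", apply_old(proposed, user_min, user_max, 1));
        printf("new           : vrate %lu\n", apply_new(proposed, user_min, user_max));
        return 0;
}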