From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index b38e962fa83e..8338304ea256 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2032,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio -- cgit v1.2.3-58-ga151 From 547248736ae54b73e713ee259a81ab3a637cb2f7 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 15:01:48 +0800 Subject: blk-mq: remove unused function hctx_allow_merges since 9bddeb2a5b981 "blk-mq: make per-sw-queue bio merge as default .bio_merge" there is no caller for this function. Reviewed-by: Ming Lei Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a18609755e..59687ed6561b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1504,12 +1504,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, true); } -static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -{ - return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !blk_queue_nomerges(hctx->queue); -} - static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) -- cgit v1.2.3-58-ga151 From 5385fa47d883dc7d51de622835427bc2558ed43c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 Oct 2017 01:26:21 -0600 Subject: blk-mq-tag: kill unused tag enums We don't have any notion of a tagging cache anymore, and haven't for a long time. Kill off the unused enums. 
Signed-off-by: Jens Axboe --- block/blk-mq-tag.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5cb51e53cc03..5932a7ac7fc4 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -43,14 +43,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, return sbq_wait_ptr(bt, &hctx->wait_index); } -enum { - BLK_MQ_TAG_CACHE_MIN = 1, - BLK_MQ_TAG_CACHE_MAX = 64, -}; - enum { BLK_MQ_TAG_FAIL = -1U, - BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; -- cgit v1.2.3-58-ga151 From 7beb2f845b715cb98584cf630e9a9d5b05501166 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:08:24 -0600 Subject: blk-mq: wire up completion notifier for laptop mode For some reason, the laptop mode IO completion notified was never wired up for blk-mq. Ensure that we trigger the callback appropriately, to arm the laptop mode flush timer. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 59687ed6561b..7f01d69879d6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -476,6 +476,9 @@ void blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + wbt_done(q->rq_wb, &rq->issue_stat); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -- cgit v1.2.3-58-ga151 From 4baa8bb13f41307f3eb62fe91f93a1a798ebef53 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:00 +0200 Subject: block, bfq: fix wrong init of saved start time for weight raising This commit fixes a bug that causes bfq to fail to guarantee a high responsiveness on some drives, if there is heavy random read+write I/O in the background. More precisely, such a failure allowed this bug to be found [1], but the bug may well cause other yet unreported anomalies. BFQ raises the weight of the bfq_queues associated with soft real-time applications, to privilege the I/O, and thus reduce latency, for these applications. This mechanism is named soft-real-time weight raising in BFQ. A soft real-time period may happen to be nested into an interactive weight raising period, i.e., it may happen that, when a bfq_queue switches to a soft real-time weight-raised state, the bfq_queue is already being weight-raised because deemed interactive too. In this case, BFQ saves in a special variable wr_start_at_switch_to_srt, the time instant when the interactive weight-raising period started for the bfq_queue, i.e., the time instant when BFQ started to deem the bfq_queue interactive. This value is then used to check whether the interactive weight-raising period would still be in progress when the soft real-time weight-raising period ends. If so, interactive weight raising is restored for the bfq_queue. This restore is useful, in particular, because it prevents bfq_queues from losing their interactive weight raising prematurely, as a consequence of spurious, short-lived soft real-time weight-raising periods caused by wrong detections as soft real-time. 
If, instead, a bfq_queue switches to soft-real-time weight raising while it *is not* already in an interactive weight-raising period, then the variable wr_start_at_switch_to_srt has no meaning during the following soft real-time weight-raising period. Unfortunately the handling of this case is wrong in BFQ: not only the variable is not flagged somehow as meaningless, but it is also set to the time when the switch to soft real-time weight-raising occurs. This may cause an interactive weight-raising period to be considered mistakenly as still in progress, and thus a spurious interactive weight-raising period to start for the bfq_queue, at the end of the soft-real-time weight-raising period. In particular the spurious interactive weight-raising period will be considered as still in progress, if the soft-real-time weight-raising period does not last very long. The bfq_queue will then be wrongly privileged and, if I/O bound, will unjustly steal bandwidth to truly interactive or soft real-time bfq_queues, harming responsiveness and low latency. This commit fixes this issue by just setting wr_start_at_switch_to_srt to minus infinity (farthest past time instant according to jiffies macros): when the soft-real-time weight-raising period ends, certainly no interactive weight-raising period will be considered as still in progress. [1] Background I/O Type: Random - Background I/O mix: Reads and writes - Application to start: LibreOffice Writer in http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Tested-by: Mirko Montanari Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a4783da90ba8..c25955c25e03 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1202,6 +1202,24 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1216,7 +1234,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -2897,24 +2927,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. 
- */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. -- cgit v1.2.3-58-ga151 From 3e2bdd6dff239afd8386e8758eee69ad61e5a3d6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:01 +0200 Subject: block, bfq: check and switch back to interactive wr also on queue split As already explained in the message of commit "block, bfq: fix wrong init of saved start time for weight raising", if a soft real-time weight-raising period happens to be nested in a larger interactive weight-raising period, then BFQ restores the interactive weight raising at the end of the soft real-time weight raising. In particular, BFQ checks whether the latter has ended only on request dispatches. Unfortunately, the above scheme fails to restore interactive weight raising in the following corner case: if a bfq_queue, say Q, 1) Is merged with another bfq_queue while it is in a nested soft real-time weight-raising period. The weight-raising state of Q is then saved, and not considered any longer until a split occurs. 2) Is split from the other bfq_queue(s) at a time instant when its soft real-time weight raising is already finished. On the split, while resuming the previous, soft real-time weight-raised state of the bfq_queue Q, BFQ checks whether the current soft real-time weight-raising period is actually over. If so, BFQ switches weight raising off for Q, *without* checking whether the soft real-time period was actually nested in a non-yet-finished interactive weight-raising period. This commit addresses this issue by adding the above missing check in bfq_queue splits, and restoring interactive weight raising if needed. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 87 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c25955c25e03..33b63bc4a64b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -724,6 +724,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. 
+ */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -750,10 +788,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr"); - - bfqq->wr_coeff = 1; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } } /* make sure weight will be updated, however we got here */ @@ -1173,35 +1217,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; -} - /* * Return the farthest future time instant according to jiffies * macros. @@ -3501,11 +3516,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } } -- cgit v1.2.3-58-ga151 From 894df937e06a56ed6f054a75a416aff84147c5a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:02 +0200 Subject: block, bfq: let early-merged queues be weight-raised on split too A just-created bfq_queue, say Q, may happen to be merged with another bfq_queue on the very first invocation of the function __bfq_insert_request. In such a case, even if Q would clearly deserve interactive weight raising (as it has just been created), the function bfq_add_request does not make it to be invoked for Q, and thus to activate weight raising for Q. 
As a consequence, when the state of Q is saved for a possible future restore, after a split of Q from the other bfq_queue(s), such a state happens to be (unjustly) non-weight-raised. Then the bfq_queue will not enjoy any weight raising on the split, even if should still be in an interactive weight-raising period when the split occurs. This commit solves this problem as follows, for a just-created bfq_queue that is being early-merged: it stores directly, in the saved state of the bfq_queue, the weight-raising state that would have been assigned to the bfq_queue if not early-merged. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 33b63bc4a64b..115747fe43c8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2061,10 +2061,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } } static void @@ -4150,7 +4167,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->allocated++; bfqq->allocated--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4162,6 +4178,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq -- cgit v1.2.3-58-ga151 From 7cb04004fa371a626c1a5ebe6d977f70285759ed Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:03 +0200 Subject: block, bfq: decrease burst size when queues in burst exit If many queues belonging to the same group happen to be created shortly after each other, then the concurrent processes associated with these queues have typically a common goal, and they get it done as soon as possible if not hampered by device idling. Examples are processes spawned by git grep, or by systemd during boot. 
As for device idling, this mechanism is currently necessary for weight raising to succeed in its goal: privileging I/O. In view of these facts, BFQ does not provide the above queues with either weight raising or device idling. On the other hand, a burst of queue creations may be caused also by the start-up of a complex application. In this case, these queues usually need to be served one after the other, and as quickly as possible, to maximise responsiveness. Therefore, in this case the best strategy is to weight-raise all the queues created during the burst, i.e., the exact opposite of the strategy for the above case. To distinguish between the two cases, BFQ uses an empirical burst-size threshold, found through extensive tests and monitoring of daily usage. Only large bursts, i.e., bursts with a size above this threshold, are considered as generated by a high number of parallel processes. In this respect, upstart-based boot proved to be rather hard to detect as generating a large burst of queue creations, because with upstart most of the queues created in a burst exit *before* the next queues in the same burst are created. To address this issue, I changed the burst-detection mechanism so as to not decrease the size of the current burst even if one of the queues in the burst is eliminated. Unfortunately, this missing decrease causes false positives on very fast systems: on the start-up of a complex application, such as libreoffice writer, so many queues are created, served and exited shortly after each other, that a large burst of queue creations is wrongly detected as occurring. These false positives just disappear if the size of a burst is decreased when one of the queues in the burst exits. This commit restores the missing burst-size decrease, relying on the fact that upstart is apparently unlikely to be used on systems running this and future versions of the kernel. Signed-off-by: Paolo Valente Signed-off-by: Mauro Andreolini Signed-off-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 115747fe43c8..70f9177c4f5b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3725,16 +3725,10 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED -- cgit v1.2.3-58-ga151 From 9c9883744dda1cc38339a448dd8435140537027e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2017 10:47:00 +0200 Subject: block: move __elv_next_request to blk-core.c No need to have this helper inline in a header. Also drop the __ prefix. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++++++++-- block/blk.h | 39 --------------------------------------- 2 files changed, 40 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 048be4aa6024..14f7674fa0b1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2517,6 +2517,45 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +static struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + + WARN_ON_ONCE(q->mq_ops); + + while (1) { + if (!list_empty(&q->queue_head)) { + rq = list_entry_rq(q->queue_head.next); + return rq; + } + + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (fq->flush_pending_idx != fq->flush_running_idx && + !queue_flush_queueable(q)) { + fq->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) + return NULL; + } +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2538,8 +2577,7 @@ struct request *blk_peek_request(struct request_queue *q) lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); - while ((rq = __elv_next_request(q)) != NULL) { - + while ((rq = elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); if (!rq) break; diff --git a/block/blk.h b/block/blk.h index fcb9775b997d..fda5a4632aba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -148,45 +148,6 @@ static inline void blk_clear_rq_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. 
- */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; -- cgit v1.2.3-58-ga151 From a7af0af32171c17d881e3e58b0925c4a44fb5a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Sep 2017 10:00:22 +0200 Subject: blk-mq: attempt to fix atomic flag memory ordering Attempt to untangle the ordering in blk-mq. The patch introducing the single smp_mb__before_atomic() is obviously broken in that it doesn't clearly specify a pairing barrier and an obtained guarantee. The comment is further misleading in that it hints that the deadline store and the COMPLETE store also need to be ordered, but AFAICT there is no such dependency. However what does appear to be important is the clear happening _after_ the store, and that worked by pure accident. This clarifies blk_mq_start_request() -- we should not get there with STARTING set -- this simplifies the code and makes the barrier usage sane (the old code could be read to allow not having _any_ atomic after the barrier, in which case the barrier hasn't got anything to order). We then also introduce the missing pairing barrier for it. Also down-grade the barrier to smp_wmb(), this is cheaper for PowerPC/ARM and doesn't cost anything extra on x86. And it documents the STARTING vs COMPLETE ordering. Although I've not been entirely successful in reverse engineering the blk-mq state machine so there might still be more funnies around timeout vs requeue. If I got anything wrong, feel free to educate me by adding comments to clarify things ;-) Cc: Alan Stern Cc: Will Deacon Cc: Ming Lei Cc: Christoph Hellwig Cc: Andrea Parri Cc: Boqun Feng Cc: Bart Van Assche Cc: "Paul E. McKenney" Fixes: 538b75341835 ("blk-mq: request deadline must be visible before marking rq as started") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ block/blk-timeout.c | 2 +- 2 files changed, 41 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7f01d69879d6..a7b46b754a7c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -596,22 +596,32 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); - /* - * Ensure that ->deadline is visible before set the started - * flag and clear the completed flag. - */ - smp_mb__before_atomic(); + WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as * complete. So be sure to clear complete again when we start * the request, otherwise we'll ignore the completion event. + * + * Ensure that ->deadline is visible before we set STARTED, such that + * blk_mq_check_expired() is guaranteed to observe our ->deadline when + * it observes STARTED. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + smp_wmb(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Coherence order guarantees these consecutive stores to a + * single variable propagate in the specified order. 
Thus the + * clear_bit() is ordered _after_ the set bit. See + * blk_mq_check_expired(). + * + * (the bits must be part of the same byte for this to be + * true). + */ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -781,10 +791,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; + unsigned long deadline; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * Ensures that if we see STARTED we must also see our + * up-to-date deadline, see blk_mq_start_request(). + */ + smp_rmb(); + + deadline = READ_ONCE(rq->deadline); + /* * The rq being checked may have been freed and reallocated * out already here, we avoid this race by checking rq->deadline @@ -798,11 +817,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * and clearing the flag in blk_mq_start_request(), so * this rq won't be timed out too. */ - if (time_after_eq(jiffies, rq->deadline)) { - if (!blk_mark_rq_complete(rq)) + if (time_after_eq(jiffies, deadline)) { + if (!blk_mark_rq_complete(rq)) { + /* + * Again coherence order ensures that consecutive reads + * from the same variable must be in that order. This + * ensures that if we see COMPLETE clear, we must then + * see STARTED set and we'll ignore this timeout. + * + * (There's also the MB implied by the test_and_clear()) + */ blk_mq_rq_timed_out(rq, reserved); - } else if (!data->next_set || time_after(data->next, rq->deadline)) { - data->next = rq->deadline; + } + } else if (!data->next_set || time_after(data->next, deadline)) { + data->next = deadline; data->next_set = 1; } } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83bb0900..e3e9c9771d36 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -211,7 +211,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + WRITE_ONCE(req->deadline, jiffies + req->timeout); /* * Only the non-mq case needs to add the request to a protected list. -- cgit v1.2.3-58-ga151 From fc13457f74dcf054b0d17efb7b94b46fdf17f412 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Oct 2017 11:22:24 -0600 Subject: blk-mq: document the need to have STARTED and COMPLETED share a byte For memory ordering guarantees on stores, we need to ensure that these two bits share the same byte of storage in the unsigned long. Add a comment as to why, and a BUILD_BUG_ON() to ensure that we don't violate this requirement. 
Suggested-by: Boqun Feng Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ block/blk.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a7b46b754a7c..076cbab9c3e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2923,6 +2923,12 @@ EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { + /* + * See comment in block/blk.h rq_atomic_flags enum + */ + BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != + (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk.h b/block/blk.h index fda5a4632aba..6ac43dfd68a7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,8 +122,15 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { + /* + * Keep these two bits first - not because we depend on the + * value of them, but we do depend on them being in the same + * byte of storage to ensure ordering on writes. Keeping them + * first will achieve that nicely. + */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; -- cgit v1.2.3-58-ga151 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- include/linux/blkdev.h | 5 ----- 6 files changed, 3 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..7f4a1ba532af 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), diff --git a/block/elevator.c b/block/elevator.c index 153926a90901..7ae50eb2732b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1118,7 +1118,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..75281828f2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct 
dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..8d07ad61221c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52..9fb71fc7d0e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3-58-ga151 From 4b14a5c5d57f4fd6929db3427ba4d7c3775b4680 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 14:09:20 -0400 Subject: block: remove unnecessary NULL checks in bioset_integrity_free() mempool_destroy() already checks for a NULL value being passed in, this eliminates duplicate checks. 
This was caught by running make coccicheck M=block/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current head as of this patch). Reviewed-by: Kyle Fortin Acked-by: Martin K. Petersen Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5df32907ff3b..23b42e8aa03e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); + mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); -- cgit v1.2.3-58-ga151 From 4078def82f352cf5007691635da290a109511bc5 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Fri, 6 Oct 2017 14:45:13 -0400 Subject: block/bio: Remove null checks before mempool_destroy in bioset_free This patch removes redundant checks for null values on bio_pool and bvec_pool. Found using make coccicheck M=block/ on linux-net tree on the next-20170929 tag. Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8338304ea256..bf0dbe8f78f8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1928,11 +1928,8 @@ void bioset_free(struct bio_set *bs) if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); + mempool_destroy(bs->bio_pool); + mempool_destroy(bs->bvec_pool); bioset_integrity_free(bs); bio_put_slab(bs); -- cgit v1.2.3-58-ga151 From b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 9 Oct 2017 16:27:21 +0200 Subject: block,bfq: Disable writeback throttling Similarly to CFQ, BFQ has its write-throttling heuristics, and it is better not to combine them with further write-throttling heuristics of a different nature. So this commit disables write-back throttling for a device if BFQ is used as I/O scheduler for that device. Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- block/blk-wbt.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 70f9177c4f5b..261f98695910 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -108,6 +108,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -4810,7 +4811,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - + wbt_disable_default(q); return 0; out_free: diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67b..e59d59c11ebb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) } /* - * Disable wbt, if enabled by default. Only called from CFQ. + * Disable wbt, if enabled by default. 
*/ void wbt_disable_default(struct request_queue *q) { -- cgit v1.2.3-58-ga151 From 99fead8d38e5302b1be9403d4de815ce9174a3df Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 9 Oct 2017 13:11:23 +0200 Subject: block, bfq: fix unbalanced decrements of burst size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit "block, bfq: decrease burst size when queues in burst exit" introduced the decrement of burst_size on the removal of a bfq_queue from the burst list. Unfortunately, this decrement can happen to be performed even when burst size is already equal to 0, because of unbalanced decrements. A description follows of the cause of these unbalanced decrements, namely a wrong assumption, and of the way this wrong assumption leads to unbalanced decrements. The wrong assumption is that a bfq_queue can exit only if the process associated with the bfq_queue has exited. This is false, because a bfq_queue, say Q, may exit also as a consequence of a merge with another bfq_queue. In this case, Q exits because the I/O of its associated process has been redirected to another bfq_queue. The decrement unbalance occurs because Q may then be re-created after a split, and added back to the current burst list, *without* incrementing burst_size. burst_size is not incremented because Q is not a new bfq_queue added to the burst list, but a bfq_queue only temporarily removed from the list, and, before the commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit", burst_size was not decremented when Q was removed. This commit addresses this issue by just checking whether the exiting bfq_queue is a merged bfq_queue, and, in that case, not decrementing burst_size. Unfortunately, this still leaves room for unbalanced decrements, in the following rarer case: on a split, the bfq_queue happens to be inserted into a different burst list than the one it was removed from when merged. If this happens, the number of elements in the new burst list becomes higher than burst_size (by one). When the bfq_queue then exits, it is of course not in a merged state any longer, thus burst_size is decremented, which results in an unbalanced decrement. To handle this sporadic, unlucky case in a simple way, this commit also checks that burst_size is larger than 0 before decrementing it. Finally, this commit removes a useless, extra check: the check that the bfq_queue is sync, performed before checking whether the bfq_queue is in the burst list. This extra check is redundant, because only sync bfq_queues can be inserted into the burst list. 
Fixes: 7cb04004fa37 ("block, bfq: decrease burst size when queues in burst exit") Reported-by: Philip Müller Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Philip Müller Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 261f98695910..889a8549d97f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3726,9 +3726,36 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } kmem_cache_free(bfq_pool, bfqq); @@ -4460,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } -- cgit v1.2.3-58-ga151 From 58a9edce0aa912640abe47d3fc039e6230ef848b Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 10 Oct 2017 22:53:46 +0800 Subject: blkcg: check pol->cpd_free_fn before free cpd check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd. 
Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3f56baee936..e7ec676043b1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,7 +1452,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); @@ -1492,7 +1492,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); -- cgit v1.2.3-58-ga151 From 53cfdc10a95d03fbc82970d682a32696d19ef886 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 10 Oct 2017 11:13:32 +0800 Subject: blk-throttle: fix null pointer dereference while throttling writeback IOs A null pointer dereference can occur when blkcg is removed manually with writeback IOs inflight. This is caused by the following case: The writeback kworker submits the bio and sets bio->bi_cg_private to tg in blk_throtl_assoc_bio. Then we remove the block cgroup manually; the blkg and tg would be freed if there is no request inflight. When the submitted bio comes back, blk_throtl_bio_endio() fetches the tg, which was already freed. Fix this by increasing the refcount of blkg in function blk_throtl_assoc_bio() so that the blkg will not be freed until bio_endio is called. Reviewed-by: Shaohua Li Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/blk-throttle.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3f..fe49c465ec86 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2112,8 +2112,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) + if (bio->bi_css) { + if (bio->bi_cg_private) + blkg_put(tg_to_blkg(bio->bi_cg_private)); bio->bi_cg_private = tg; + blkg_get(tg_to_blkg(tg)); + } blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif } @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio) start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; finish_time = __blk_stat_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) + if (!start_time || finish_time <= start_time) { + blkg_put(tg_to_blkg(tg)); return; + } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } + + blkg_put(tg_to_blkg(tg)); } #endif -- cgit v1.2.3-58-ga151 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses the per-cpu stat except to sum it into the global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. 
Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 45 +++++++-------------------------------------- include/linux/blk_types.h | 5 ++--- 2 files changed, 9 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe3..3a2f3c96f367 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef..3385c89f402e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3-58-ga151 From 85acb3ba2f925a0ec6928c1967c3adefa00682f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:56:00 -0700 Subject: block: set request_list for request Legacy queue sets request's request_list, mq doesn't. This makes mq does the same thing, so we can find cgroup of a request. Note, we really only use blkg field of request_list, it's pointless to allocate mempool for request_list in mq case. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 14f7674fa0b1..e8e149ccc86b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data) int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { - if (unlikely(rl->rq_pool)) + if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 076cbab9c3e0..40cba1b1978f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq) wbt_done(q->rq_wb, &rq->issue_stat); + if (blk_rq_rl(rq)) + blk_put_rl(blk_rq_rl(rq)); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -1532,6 +1535,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); + blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); + blk_account_io_start(rq, true); } -- cgit v1.2.3-58-ga151 From 519c8e9ffd86143fedd84cf833a09f36b47d0f5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 11:01:00 -0700 Subject: block: fix Sphinx kernel-doc warning Sphinx treats symbols that end with '_' as a kind of special documentation indicator, so fix that by adding an ending '*' to it. ../block/bio.c:404: ERROR: Unknown target name: "gfp". Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index bf0dbe8f78f8..ae9ad34e6a71 100644 --- a/block/bio.c +++ b/block/bio.c @@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator + * @gfp_mask: the GFP_* mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * -- cgit v1.2.3-58-ga151 From 8cf466602028196b939255f1eb4e9817efd1db6d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 11 Oct 2017 10:39:15 -0700 Subject: kyber: fix hang on domain token wait queue When we're getting a domain token, if we fail to get a token on our first attempt, we put the current hardware queue on a wait queue and then try again just in case a token was freed after our initial attempt but before we got on the wait queue. If this second attempt succeeds, we currently leave the hardware queue on the wait queue. Usually this is okay; we'll just run the hardware queue one extra time when another token is freed. However, if the hardware queue doesn't have any other requests waiting, then when it gets the extra wakeup, it won't have anything to free and therefore won't wake up any other hardware queues. If tokens are limited, then we won't make forward progress and the device will hang. 
Reported-by: Bin Zha Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab82105b..db5bfc6342d3 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, /* * Try again in case a token was freed before we got on the wait - * queue. + * queue. The waker may have already removed the entry from the + * wait queue, but list_del_init() is okay with that. */ nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) { + unsigned long flags; + + spin_lock_irqsave(&ws->wait.lock, flags); + list_del_init(&wait->entry); + spin_unlock_irqrestore(&ws->wait.lock, flags); + } } return nr; } -- cgit v1.2.3-58-ga151 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 16 +++++++++++----- include/linux/blk-mq.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c7..bce1c76fc768 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,6 +325,13 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); + +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) +{ + return blk_mq_tagset_iter(set, set->driver_data, reinit_request); +} EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..6bc29f73a9aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3-58-ga151 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove 
blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 7 ------- include/linux/blk-mq.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bce1c76fc768..c81b40ecd3f1 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -327,13 +327,6 @@ out: } EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) -{ - return blk_mq_tagset_iter(set, set->driver_data, reinit_request); -} -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa..cfd64e500d82 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3-58-ga151 From 351499a172c0c5fc52d65ee2c62b344f369ea02a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 24 Oct 2017 18:44:57 -0600 Subject: block: Invalidate cache on discard v2 It is reasonable drop page cache on discard, otherwise that pages may be written by writeback second later, so thin provision devices will not be happy. This seems to be a security leak in case of secure discard case. Also add check for queue_discard flag on early stage. Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee67eed..c0fc32bd8ed1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping = bdev->bd_inode->i_mapping; + if (!(mode & FMODE_WRITE)) return -EBADF; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); + truncate_inode_pages_range(mapping, start, start + len); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); } static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3-58-ga151 From bb749b31c25e9b11f8f974baac8d507298ffbb70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 18 Oct 2017 14:38:38 +0200 Subject: block: move CAP_SYS_ADMIN check in blkdev_roset() Check for CAP_SYS_ADMIN before calling into the driver, similar to blkdev_flushbuf(). 
This is safer and can spare a check in the driver. (Currently BLKROSET is overridden by md and rbd, rbd is missing the check. md has the check, but it covers a lot more than BLKROSET.) Acked-by: Al Viro Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index c0fc32bd8ed1..1668506d8ed8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -443,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, { int ret, n; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); if (!is_unrecognized_ioctl(ret)) return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; set_device_ro(bdev, n); -- cgit v1.2.3-58-ga151 From 425a4dba7953e35ffd096771973add6d2f40d2ed Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:09 +0200 Subject: block: factor out __blkdev_issue_zero_pages() blkdev_issue_zeroout() will use this in !BLKDEV_ZERO_NOFALLBACK case. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 63 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 62240f8832ca..9d2ab8bba52a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -274,6 +274,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_PAGES); } +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + while (nr_sects != 0) { + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -304,9 +338,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - int bi_size = 0; - struct bio *bio = *biop; - unsigned int sz; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -316,30 +347,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - goto out; - - ret = 0; - while (nr_sects != 0) { - bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) - break; - } - cond_resched(); - } + return ret; - *biop = bio; -out: - return ret; + return 
__blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); -- cgit v1.2.3-58-ga151 From d5ce4c31d6df518dd8f63bbae20d7423c5018a6c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:10 +0200 Subject: block: cope with WRITE ZEROES failing in blkdev_issue_zeroout() sd_config_write_same() ignores ->max_ws_blocks == 0 and resets it to permit trying WRITE SAME on older SCSI devices, unless ->no_write_same is set. Because REQ_OP_WRITE_ZEROES is implemented in terms of WRITE SAME, blkdev_issue_zeroout() may fail with -EREMOTEIO: $ fallocate -zn -l 1k /dev/sdg fallocate: fallocate failed: Remote I/O error $ fallocate -zn -l 1k /dev/sdg # OK $ fallocate -zn -l 1k /dev/sdg # OK The following calls succeed because sd_done() sets ->no_write_same in response to a sense that would become BLK_STS_TARGET/-EREMOTEIO, causing __blkdev_issue_zeroout() to fall back to generating ZERO_PAGE bios. This means blkdev_issue_zeroout() must cope with WRITE ZEROES failing and fall back to manually zeroing, unless BLKDEV_ZERO_NOFALLBACK is specified. For BLKDEV_ZERO_NOFALLBACK case, return -EOPNOTSUPP if sd_done() has just set ->no_write_same thus indicating lack of offload support. Fixes: c20cfc27a473 ("block: stop using blkdev_issue_write_same for zeroing") Cc: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 9d2ab8bba52a..f625fda5f095 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -321,12 +321,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * - * Note that this function may fail with -EOPNOTSUPP if the driver signals - * zeroing offload support, but the device fails to process the command (for - * some devices there is no non-destructive way to verify whether this - * operation is actually supported). In this case the caller should call - * retry the call to blkdev_issue_zeroout() and the fallback path will be used. - * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. 
* @@ -370,18 +364,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret; - struct bio *bio = NULL; + int ret = 0; + sector_t bs_mask; + struct bio *bio; struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; blk_start_plug(&plug); - ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, flags); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). + */ + ret = -EOPNOTSUPP; + } + } return ret; } -- cgit v1.2.3-58-ga151 From 2527d99789e248576ac8081530cd4fd88730f8c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:33:42 -0600 Subject: elevator: lookup mq vs non-mq elevators If an IO scheduler is selected via elevator= and it doesn't match the driver in question wrt blk-mq support, then we fail to boot. The elevator= parameter is deprecated and only supported for non-mq devices. Augment the elevator lookup API so that we pass in if we're looking for an mq capable scheduler or not, so that we only ever return a valid type for the queue in question. 
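Stated as a predicate, purely as an illustrative restatement and not a helper this patch adds, the rule is:

/* A scheduler may only be attached to a queue whose dispatch model
 * matches its own: blk-mq schedulers to blk-mq queues, legacy
 * schedulers to legacy queues. */
static bool elv_ok_for_queue(const struct elevator_type *e,
			     const struct request_queue *q)
{
	return e->uses_mq == (q->mq_ops != NULL);
}

Together with the alias support in the following patches, a write of "deadline" to /sys/block/<dev>/queue/scheduler can then resolve to whichever variant matches the queue.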
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196695 Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 7ae50eb2732b..77856bf29568 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,12 +83,15 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static struct elevator_type *elevator_find(const char *name) +/* + * Return scheduler with name 'name' and with matching 'mq capability + */ +static struct elevator_type *elevator_find(const char *name, bool mq) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name)) + if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) return e; } @@ -100,25 +103,25 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name, bool try_loading) +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); } if (e && !try_module_get(e->elevator_owner)) e = NULL; spin_unlock(&elv_list_lock); - return e; } @@ -144,8 +147,12 @@ void __init load_default_elevator_module(void) if (!chosen_elevator[0]) return; + /* + * Boot parameter is deprecated, we haven't supported that for MQ. + * Only look for non-mq schedulers from here. + */ spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator); + e = elevator_find(chosen_elevator, false); spin_unlock(&elv_list_lock); if (!e) @@ -202,7 +209,7 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name, true); + e = elevator_get(q, name, true); if (!e) return -EINVAL; } @@ -214,7 +221,7 @@ int elevator_init(struct request_queue *q, char *name) * allowed from async. */ if (!e && !q->mq_ops && *chosen_elevator) { - e = elevator_get(chosen_elevator, false); + e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); @@ -229,17 +236,17 @@ int elevator_init(struct request_queue *q, char *name) */ if (q->mq_ops) { if (q->nr_hw_queues == 1) - e = elevator_get("mq-deadline", false); + e = elevator_get(q, "mq-deadline", false); if (!e) return 0; } else - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. 
" \ "Using noop.\n"); - e = elevator_get("noop", false); + e = elevator_get(q, "noop", false); } } @@ -905,7 +912,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, e->uses_mq)) { spin_unlock(&elv_list_lock); if (e->icq_cache) kmem_cache_destroy(e->icq_cache); @@ -1066,7 +1073,7 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name), true); + e = elevator_get(q, strstrip(elevator_name), true); if (!e) return -EINVAL; @@ -1076,15 +1083,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return 0; } - if (!e->uses_mq && q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - if (e->uses_mq && !q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - return elevator_switch(q, e); } -- cgit v1.2.3-58-ga151 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 23 +++++++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 77856bf29568..7bda083d5968 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,6 +83,16 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + /* * Return scheduler with name 'name' and with matching 'mq capability */ @@ -91,7 +101,7 @@ static struct elevator_type *elevator_find(const char *name, bool mq) struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -922,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1077,8 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } @@ -1114,6 +1123,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; if (!queue_is_rq_based(q)) @@ -1126,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) 
spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e..6df8b14f1f6a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3-58-ga151 From 4d740bc9f0319229410d11e445017f47e425dbe0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 09:47:20 -0600 Subject: mq-deadline: add 'deadline' as a name alias The scheduler framework now supports looking up the appropriate scheduler with the {name,mq} tupple. We can register mq-deadline with the alias of 'deadline', so that switching to 'deadline' will do the right thing based on the type of driver attached to it. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/mq-deadline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad4331edd..0179e484ec98 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", + .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); -- cgit v1.2.3-58-ga151 From 4e9b6f20828ac880dbc1fa2fdbafae779473d1af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Oct 2017 10:00:48 -0700 Subject: block: Fix a race between blk_cleanup_queue() and timeout handling Make sure that if the timeout timer fires after a queue has been marked "dying" that the affected requests are finished. 
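The ordering the fix depends on, assuming the two hunks below are applied (the function name here is illustrative; blk_sync_queue() is the real entry point):

static void sketch_sync_timeouts(struct request_queue *q)
{
	del_timer_sync(&q->timeout);		/* stop the timer, so no new timeout work is queued */
	cancel_work_sync(&q->timeout_work);	/* and wait out any work it already queued */
}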
Reported-by: chenxiang (M) Fixes: commit 287922eb0b18 ("block: defer timeouts to a workqueue") Signed-off-by: Bart Van Assche Tested-by: chenxiang (M) Cc: Christoph Hellwig Cc: Keith Busch Cc: Hannes Reinecke Cc: Ming Lei Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-timeout.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e8e149ccc86b..bb4fce694a60 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,6 +333,7 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; @@ -844,6 +845,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index e3e9c9771d36..764ecf9aeb30 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) struct request *rq, *tmp; int next_set = 0; - if (blk_queue_enter(q, true)) - return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); - blk_queue_exit(q); } /** -- cgit v1.2.3-58-ga151 From 5e3d02bbafad38975099b5848f5ebadedcf7bb7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:25 +0800 Subject: blk-mq-sched: dispatch from scheduler IFF progress is made in ->dispatch When the hw queue is busy, we shouldn't take requests from the scheduler queue any more, otherwise it is difficult to do IO merge. This patch fixes the awful IO performance on some SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber is used by not taking requests if hw queue is busy. 
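Reduced to its core, with dispatch_from_elevator() as a made-up stand-in for the scheduler drain loop, the new policy reads roughly as:

static void sketch_dispatch_policy(struct blk_mq_hw_ctx *hctx,
				   struct request_queue *q,
				   struct list_head *rq_list,
				   bool has_sched_dispatch)
{
	/* Flush what is already on ->dispatch first; only if the driver
	 * accepted all of it do we pull more out of the elevator, so
	 * requests stay in the scheduler where they can still be merged. */
	bool do_sched_dispatch = blk_mq_dispatch_rq_list(q, rq_list);

	if (do_sched_dispatch && has_sched_dispatch)
		dispatch_from_elevator(hctx);
}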
Reviewed-by: Omar Sandoval Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c..eca011fdfa0e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; + bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, &rq_list); + do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* - * We want to dispatch from the scheduler if we had no work left - * on the dispatch list, OR if we did have work but weren't able - * to make progress. + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ - if (!did_work && has_sched_dispatch) { + if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; -- cgit v1.2.3-58-ga151 From caf8eb0d604a0eaeb8111eb4d36853a6d08eebe7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:26 +0800 Subject: blk-mq-sched: move actual dispatching into one helper So that it becomes easy to support to dispatch from sw queue in the following patch. No functional change. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Suggested-by: Christoph Hellwig # for simplifying dispatch logic Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eca011fdfa0e..be29ba849408 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + LIST_HEAD(rq_list); + + do { + struct request *rq = e->type->ops.mq.dispatch_request(hctx); + + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(q, &rq_list)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. 
+ * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); - } else if (!has_sched_dispatch) { + if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) + blk_mq_do_dispatch_sched(hctx); + } else if (has_sched_dispatch) { + blk_mq_do_dispatch_sched(hctx); + } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } - - /* - * We want to dispatch from the scheduler if there was nothing - * on the dispatch list or we were able to dispatch from the - * dispatch list. - */ - if (do_sched_dispatch && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); - } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.3-58-ga151 From 63ba8e31c3ac6393b07c6e18538814a730478766 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:28 +0800 Subject: block: kyber: check if there are requests in ctx in kyber_has_work() There may be request in sw queue, and not fetched to domain queue yet, so check it in kyber_has_work(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index db5bfc6342d3..b4df317c2916 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -649,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) if (!list_empty_careful(&khd->rqs[i])) return true; } - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } #define KYBER_LAT_SHOW_STORE(op) \ -- cgit v1.2.3-58-ga151 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. 
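A driver-side illustration of the new hooks; this patch only adds the infrastructure, and struct mydev, its fields and mydev_queue_rq() are made-up names. A per-queue depth limit could be expressed roughly as:

struct mydev {
	atomic_t	inflight;
	int		queue_depth;
	/* ... */
};

static blk_status_t mydev_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct mydev *dev = hctx->queue->queuedata;

	/* Refuse the budget when the per-device depth is exhausted; the
	 * request then stays in the I/O scheduler and can still merge. */
	if (atomic_inc_return(&dev->inflight) > dev->queue_depth) {
		atomic_dec(&dev->inflight);
		return BLK_STS_RESOURCE;
	}
	return BLK_STS_OK;
}

static void mydev_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct mydev *dev = hctx->queue->queuedata;

	atomic_dec(&dev->inflight);
}

static const struct blk_mq_ops mydev_mq_ops = {
	.queue_rq	= mydev_queue_rq,
	.get_budget	= mydev_get_budget,
	.put_budget	= mydev_put_budget,
};

Once ->queue_rq() has run, releasing the budget on completion is the driver's responsibility, matching the comment added to blk_mq_ops below.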
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++----- block/blk-mq.h | 20 +++++++++++++++++- include/linux/blk-mq.h | 11 ++++++++++ 5 files changed, 114 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index be29ba849408..8e525e66a0d9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); do { - struct request *rq = e->type->ops.mq.dispatch_request(hctx); + struct request *rq; + blk_status_t ret; - if (!rq) + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return false; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +/* return true if hw queue need to be run again */ +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); + bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; + return false; hctx->run++; @@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false) && + has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_queue = blk_mq_do_dispatch_sched(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); + blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_queue && !blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { + blk_mq_sched_mark_restart_hctx(hctx); + return true; + } + + return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c197..2434061cc5b7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +bool blk_mq_sched_dispatch_requests(struct 
blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40cba1b1978f..dcb467369999 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) return true; } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq; @@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_dispatch_wait_add(hctx)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); break; + } /* * It's possible that a tag was freed in the window * between the allocation failure and adding the * hardware queue to the wait queue. */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + break; + } + } + + if (!got_budget) { + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) break; + if (ret != BLK_STS_OK) + goto fail_rq; } list_del_init(&rq->queuelist); @@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) break; } + fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } + + if (run_queue) + blk_mq_run_hw_queue(hctx, true); } /* @@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) { + blk_mq_put_driver_tag(rq); + goto insert; + } else if (ret != BLK_STS_OK) + goto fail_rq; + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: + fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; @@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth 
to %u\n", BLK_MQ_MAX_DEPTH); diff --git a/block/blk-mq.h b/block/blk-mq.h index ef15b3414da5..e413b732374e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, @@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline blk_status_t blk_mq_get_dispatch_budget( + struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return BLK_STS_OK; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..901457df3d64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3-58-ga151 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. 
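To make the effect concrete: with a .cmd_per_lun of 3 and, say, a few dozen requests sitting in the sw queues, the old path flushed them all into hctx->dispatch, where everything the LUN cannot yet take is no longer mergeable; with this patch at most a budget's worth is dequeued per pass and the rest stay put. The per-request loop, shown in full in the hunk below and condensed here for illustration (error handling omitted), has this shape:

	do {
		if (blk_mq_get_dispatch_budget(hctx) == BLK_STS_RESOURCE)
			break;				/* device is full: stop dequeueing */
		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);	/* nothing to send after all */
			break;
		}
		list_add(&rq->queuelist, &rq_list);
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);	/* round-robin across sw queues */
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));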
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-mq.c | 39 ++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 ++ 4 files changed, 114 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 8e525e66a0d9..df8581bb0a37 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) return false; } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + blk_status_t ret; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + + return false; +} + /* return true if hw queue need to be run again */ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { @@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false) && - has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); + else + run_queue = blk_mq_do_dispatch_ctx(hctx); + } } else if (has_sched_dispatch) { run_queue = blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. 
+ */ + run_queue = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb467369999..097ca3ece716 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index e413b732374e..522b420dedc0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64..e5e6becd57d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3-58-ga151 From 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:29 +0800 Subject: blk-mq: don't handle TAG_SHARED in restart Now restart is used in the following cases, and TAG_SHARED is for SCSI only. 1) .get_budget() returns BLK_STS_RESOURCE - if resource in target/host level isn't satisfied, this SCSI device will be added in shost->starved_list, and the whole queue will be rerun (via SCSI's built-in RESTART) in scsi_end_request() after any request initiated from this host/targe is completed. Forget to mention, host level resource can't be an issue for blk-mq at all. - the same is true if resource in the queue level isn't satisfied. - if there isn't outstanding request on this queue, then SCSI's RESTART can't work(blk-mq's can't work too), and the queue will be run after SCSI_QUEUE_DELAY, and finally all starved sdevs will be handled by SCSI's RESTART when this request is finished 2) scsi_dispatch_cmd() returns BLK_STS_RESOURCE - if there isn't onprogressing request on this queue, the queue will be run after SCSI_QUEUE_DELAY - otherwise, SCSI's RESTART covers the rerun. 
3) blk_mq_get_driver_tag() failed - BLK_MQ_S_TAG_WAITING covers the cross-queue RESTART for driver allocation. In one word, SCSI's built-in RESTART is enough to cover the queue rerun, and we don't need to pay special attention to TAG_SHARED wrt. restart. In my test on scsi_debug(8 luns), this patch improves IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Aslo Roman Pen reported the current RESTART is very expensive especialy when there are lots of LUNs attached in one host, such as in his test, RESTART causes half of IOPS be cut. Fixes: https://marc.info/?l=linux-kernel&m=150832216727524&w=2 Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++------------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index df8581bb0a37..daab27feb653 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,25 +68,17 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; + return; - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return true; + return; } - - return false; } /* return true if hctx need to run again */ @@ -385,68 +377,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. 
- */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. Requeue will re-invoke us from a context -- cgit v1.2.3-58-ga151 From 1f460b63d4b37f504d8d0affc2cd492eb005ea97 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:30 +0800 Subject: blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE SCSI restarts its queue in scsi_end_request() automatically, so we don't need to handle this case in blk-mq. Especailly any request won't be dequeued in this case, we needn't to worry about IO hang caused by restart vs. dispatch. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 45 ++++++++++++++++++++------------------------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 8 ++------ 3 files changed, 23 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index daab27feb653..7775f6b12fa9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,8 +81,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) } } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; @@ -98,7 +102,7 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { @@ -116,8 +120,6 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); - - return false; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -131,8 +133,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, return hctx->ctxs[idx]; } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. 
+ */ +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); @@ -147,7 +153,7 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { @@ -171,22 +177,19 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); - - return false; } /* return true if hw queue need to be run again */ -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); - bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return false; + return; hctx->run++; @@ -218,12 +221,12 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); else - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); } else if (q->mq_ops->get_budget) { /* * If we need to get budget before queuing request, we @@ -233,19 +236,11 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * TODO: get more budgets, and dequeue more requests in * one time. 
*/ - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } - - if (run_queue && !blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { - blk_mq_sched_mark_restart_hctx(hctx); - return true; - } - - return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 2434061cc5b7..9267d0b7c197 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 097ca3ece716..e4d2490f4e7e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1226,7 +1226,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1243,18 +1242,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } - - if (run_queue) - blk_mq_run_hw_queue(hctx, true); } /* -- cgit v1.2.3-58-ga151 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 35 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index bb4fce694a60..96aaa3a419a3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2243,6 +2243,40 @@ out: } EXPORT_SYMBOL(generic_make_request); +/** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This function behaves like generic_make_request(), but does not protect + * against recursion. Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). 
+ */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9..eda3d25c0f68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3-58-ga151 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helpers allows to bounce steal the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 96aaa3a419a3..68cfe6780a9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2769,6 +2769,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f68..fddda6a1f9b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions. 
* -- cgit v1.2.3-58-ga151 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 4 ---- include/linux/genhd.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb0..1174d24e405e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -649,10 +649,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) return; } disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..5c0ed5db33c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3-58-ga151 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 68 +++++++++++++++++++++++++++++++++++++-------------- include/linux/genhd.h | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 1174d24e405e..835e907e6e01 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense. 
*/ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,18 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -669,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -701,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -712,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + blk_unregister_region(disk_devt(disk), disk->minors); + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + } part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -781,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1024,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 
1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1061,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1085,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2..93aae3476f58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3-58-ga151 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++++++ block/blk-mq.c | 14 +++++--------- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 +++- mm/page_io.c | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 68cfe6780a9b..395bfb10d658 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index e4d2490f4e7e..95ea5889b825 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -2499,6 +2500,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... 
@@ -2961,20 +2964,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2992,7 +2989,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc4532..db632818777d 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/block_dev.c b/fs/block_dev.c index 07ddccd17801..4afa4d5ff969 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -402,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..d2bc339cb1e9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -486,7 +486,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index 8194d30bdca0..4241bac905b1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1049,7 +1049,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b5..225617dd0a3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; make_request_fn 
*make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index 21502d341a67..ff04de630c46 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -407,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3-58-ga151 From c2e82a234873d905260a3bd0aca9577e85619bda Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 22 Sep 2017 23:36:28 +0800 Subject: blk-mq: fix nr_requests wrong value when modify it from sysfs if blk-mq use "none" io scheduler, nr_request get a wrong value when input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get the smaller one min(nr, set->queue_depth), and then q->nr_request get a wrong value. Reproduce: echo none > /sys/block/nvme0n1/queue/scheduler echo 1000000 > /sys/block/nvme0n1/queue/nr_requests cat /sys/block/nvme0n1/queue/nr_requests 1000000 Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ea5889b825..9eea67ce82d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2743,8 +2743,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * queue depth. This is similar to what the old code would do. */ if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, - min(nr, set->queue_depth), + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, -- cgit v1.2.3-58-ga151 From 21e768b442bb587123ea924620e74d6e5655d717 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Oct 2017 16:32:26 -0700 Subject: blk-mq: Make blk_mq_get_request() error path less confusing blk_mq_get_tag() can modify data->ctx. This means that in the error path of blk_mq_get_request() data->ctx should be passed to blk_mq_put_ctx() instead of local_ctx. Note: since blk_mq_put_ctx() ignores its argument, this patch does not change any functionality. 
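A point worth spelling out: blk_mq_get_tag() can retarget data->ctx (for instance after sleeping for a tag and waking up on another CPU), so any reference the caller took has to be dropped through data->ctx rather than through a stale local copy. A minimal sketch of the idiom, with hypothetical ctx_get()/ctx_put()/try_get_tag() helpers standing in for the real blk-mq internals:

        bool put_ctx_on_error = false;

        if (!data->ctx) {
                data->ctx = ctx_get();          /* reference taken here */
                put_ctx_on_error = true;
        }

        if (try_get_tag(data) < 0) {            /* may have replaced data->ctx */
                if (put_ctx_on_error) {
                        ctx_put(data->ctx);     /* release what is held now */
                        data->ctx = NULL;
                }
                return NULL;
        }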
References: commit 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eea67ce82d9..13cdccef543c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -337,12 +337,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - struct blk_mq_ctx *local_ctx = NULL; + bool put_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; - if (likely(!data->ctx)) - data->ctx = local_ctx = blk_mq_get_ctx(q); + if (likely(!data->ctx)) { + data->ctx = blk_mq_get_ctx(q); + put_ctx_on_error = true; + } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -361,8 +363,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (local_ctx) { - blk_mq_put_ctx(local_ctx); + if (put_ctx_on_error) { + blk_mq_put_ctx(data->ctx); data->ctx = NULL; } blk_queue_exit(q); -- cgit v1.2.3-58-ga151 From e4f36b249b4d4e7599f1cf0c8fb50f196e52677e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Oct 2017 16:45:23 +0200 Subject: block: fix peeking requests during PM We need to look for an active PM request until the next softbarrier instead of looking for the first non-PM request. Otherwise any cause of request reordering might starve the PM request(s). Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 395bfb10d658..223f32826e62 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2511,20 +2511,22 @@ void blk_account_io_done(struct request *req) * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ -static struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) - return NULL; - else - return rq; + switch (rq->q->rpm_status) { + case RPM_RESUMING: + case RPM_SUSPENDING: + return rq->rq_flags & RQF_PM; + case RPM_SUSPENDED: + return false; + } + + return true; } #else -static inline struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - return rq; + return true; } #endif @@ -2572,9 +2574,12 @@ static struct request *elv_next_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; + list_for_each_entry(rq, &q->queue_head, queuelist) { + if (blk_pm_allow_request(rq)) + return rq; + + if (rq->rq_flags & RQF_SOFTBARRIER) + break; } /* @@ -2625,10 +2630,6 @@ struct request *blk_peek_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while ((rq = elv_next_request(q)) != NULL) { - rq = blk_pm_peek_request(q, rq); - if (!rq) - break; - if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the 
device driver -- cgit v1.2.3-58-ga151 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++------------ block/blk-mq.c | 17 ++++------------- block/blk-mq.h | 5 ++--- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-mq.h | 2 +- 5 files changed, 12 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7775f6b12fa9..13a27d4d1671 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,23 +94,18 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (e->type->ops.mq.has_work && !e->type->ops.mq.has_work(hctx)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* @@ -146,22 +141,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 13cdccef543c..c9fa4b294664 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1137,13 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget) { - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) - break; - if (ret != BLK_STS_OK) - goto fail_rq; - } + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; list_del_init(&rq->queuelist); @@ -1170,7 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, break; } - fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1642,12 +1636,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) { + if (!blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); goto insert; - } else if (ret != BLK_STS_OK) - goto fail_rq; + } new_cookie = request_to_qc_t(hctx, rq); @@ -1665,7 +1657,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: - fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; diff --git a/block/blk-mq.h b/block/blk-mq.h index 522b420dedc0..f97aceff76e9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -147,14 
+147,13 @@ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) q->mq_ops->put_budget(hctx); } -static inline blk_status_t blk_mq_get_dispatch_budget( - struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return BLK_STS_OK; + return true; } #endif diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207..286ea983c9e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 +1955,22 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd..674641527da7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3-58-ga151 From e84010732225c4c7c3464ee1169d395751c3adfa Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 17 Oct 2017 23:56:21 +0800 Subject: blkcg: add sanity check for blkcg policy operations blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs, otherwise policy would register fail. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7ec676043b1..4117524ca45b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; -- cgit v1.2.3-58-ga151 From 6d6f167ce74158903e7fc20dfbecf89c71aa1c00 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 2 Nov 2017 23:24:32 +0800 Subject: blk-mq: put the driver tag of nxt rq before first one is requeued When freeing the driver tag of the next rq with an I/O scheduler configured, we get the first entry of the list. However, this can race with requeue of a request, and we end up getting the wrong request from the head of the list. Free the driver tag of next rq before the failed one is requeued in the failure branch of queue_rq callback. 
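Condensed from the hunks below, the fix is purely a matter of ordering inside the BLK_STS_RESOURCE branch: the driver tag borrowed for the lookahead request must be released while that request is still unambiguously the first entry of the list, before the failed request is put back at its head:

        ret = q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_STS_RESOURCE) {
                /* nxt is still the first entry at this point ... */
                if (!list_empty(list)) {
                        nxt = list_first_entry(list, struct request, queuelist);
                        blk_mq_put_driver_tag(nxt);
                }
                /* ... only now is the failed request re-added at the head */
                list_add(&rq->queuelist, list);
                __blk_mq_requeue_request(rq);
                break;
        }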
Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9fa4b294664..6eacc1dea8b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1094,7 +1094,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq, *nxt; int errors, queued; if (list_empty(list)) @@ -1151,14 +1151,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (list_empty(list)) bd.last = true; else { - struct request *nxt; - nxt = list_first_entry(list, struct request, queuelist); bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); } ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE) { + /* + * If an I/O scheduler has been configured and we got a + * driver tag for the next request already, free it again. + */ + if (!list_empty(list)) { + nxt = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(nxt); + } blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); @@ -1181,13 +1187,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { - /* - * If an I/O scheduler has been configured and we got a driver - * tag for the next request already, free it again. - */ - rq = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(rq); - spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.2.3-58-ga151 From 9c71c83c857e7a84a5be5a56ea88da7098f51db8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:33 +0800 Subject: blk-flush: don't run queue for requests bypassing flush blk_insert_flush() should only insert request since run queue always follows it. In case of bypassing flush, we don't need to run queue because every blk_insert_flush() follows one run queue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec8cfef..81bd1a843043 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false, false); + blk_mq_sched_insert_request(rq, false, false, false, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3-58-ga151 From b0850297c749ea79a5717d597931366b3d7f4b09 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:34 +0800 Subject: block: pass 'run_queue' to blk_mq_request_bypass_insert Block flush need this function without running the queue, so add a parameter controlling whether we run it or not. 
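A condensed view of how the new flag ends up being used in this series (the full hunks are below and in the following patch):

        /* request stacking (blk_insert_cloned_request): bypass the
         * scheduler and kick the hardware queue right away */
        blk_mq_request_bypass_insert(rq, true);

        /* flush machinery (blk_insert_flush path): the caller already
         * runs the queue, so do not run it again here */
        blk_mq_request_bypass_insert(rq, false);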
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++-- block/blk-mq.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 223f32826e62..b8d1aa2d1008 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2397,7 +2397,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq); + blk_mq_request_bypass_insert(rq, true); return BLK_STS_OK; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6eacc1dea8b7..021562bd5d2c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1492,7 +1492,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq) +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); @@ -1501,7 +1501,8 @@ void blk_mq_request_bypass_insert(struct request *rq) list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - blk_mq_run_hw_queue(hctx, false); + if (run_queue) + blk_mq_run_hw_queue(hctx, false); } void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, diff --git a/block/blk-mq.h b/block/blk-mq.h index f97aceff76e9..9fffec0ad8e9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -56,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq); +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -- cgit v1.2.3-58-ga151 From 598906f814280762157629ba8833bf5cb11def74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:35 +0800 Subject: blk-flush: use blk_mq_request_bypass_insert() In the following patch, we will use RQF_FLUSH_SEQ to decide: 1) if the flag isn't set, the flush rq need to be inserted via blk_insert_flush() 2) otherwise, the flush rq need to be dispatched directly since it is in flush machinery now. So we use blk_mq_request_bypass_insert() for requests of bypassing flush machinery, just like the legacy path did. 
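The resulting insert-time decision can be sketched as follows; the real check lands in blk_mq_sched_insert_request() in the a6a252e6 patch further down:

        if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
                /* not yet in the flush machinery: go through blk_insert_flush() */
                blk_insert_flush(rq);
        } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
                /* already sequenced by the flush machinery: add it directly
                 * to the hctx ->dispatch list */
        }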
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 81bd1a843043..a9773d2075ac 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, false, false, false); + blk_mq_request_bypass_insert(rq, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3-58-ga151 From a6a252e6491443c1c18eab7e254daee63d4a7a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:36 +0800 Subject: blk-mq-sched: decide how to handle flush rq via RQF_FLUSH_SEQ In case of IO scheduler we always pre-allocate one driver tag before calling blk_insert_flush(), and flush request will be marked as RQF_FLUSH_SEQ once it is in flush machinery. So if RQF_FLUSH_SEQ isn't set, we call blk_insert_flush() to handle the request, otherwise the flush request is dispatched to ->dispatch list directly. This is a preparation patch for not preallocating a driver tag for flush requests, and for not treating flush requests as a special case. This is similar to what the legacy path does. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 13a27d4d1671..e7094f44afaf 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -345,21 +345,23 @@ void blk_mq_sched_request_inserted(struct request *rq) EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, struct request *rq) { - if (rq->tag == -1) { + /* dispatch flush rq directly */ + if (rq->rq_flags & RQF_FLUSH_SEQ) { + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; + } + + if (has_sched) { rq->rq_flags |= RQF_SORTED; - return false; + WARN_ON(rq->tag != -1); } - /* - * If we already have a real request tag, send directly to - * the dispatch list. 
- */ - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); - return true; + return false; } /* @@ -385,12 +387,13 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + /* flush rq in flush machinery need to be dispatched directly */ + if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { blk_mq_sched_insert_flush(hctx, rq, can_block); return; } - if (e && blk_mq_sched_bypass_insert(hctx, rq)) + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; if (e && e->type->ops.mq.insert_requests) { @@ -428,7 +431,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, list_for_each_entry_safe(rq, next, list, queuelist) { if (WARN_ON_ONCE(rq->tag != -1)) { list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, rq); + blk_mq_sched_bypass_insert(hctx, true, rq); } } } -- cgit v1.2.3-58-ga151 From 244c65a3ccaa06fd15cc940315606674d3108b2f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 12:39:57 -0600 Subject: blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h We need this helper to put the driver tag for flush rq, since we will not share tag in the flush request sequence in the following patch in case that I/O scheduler is applied. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 -------------------------------- block/blk-mq.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 021562bd5d2c..14f6886fbec8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -996,38 +996,6 @@ done: return rq->tag != -1; } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = -1; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - -static void blk_mq_put_driver_tag(struct request *rq) -{ - struct blk_mq_hw_ctx *hctx; - - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); -} - /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a diff --git a/block/blk-mq.h b/block/blk-mq.h index 9fffec0ad8e9..2502f40ccdc0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,6 +2,7 @@ #define INT_BLK_MQ_H #include "blk-stat.h" +#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -156,4 +157,36 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) return true; } +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static inline void 
blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + #endif -- cgit v1.2.3-58-ga151 From 923218f6166a84688973acdc39094f3bee1e9ad4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:38 +0800 Subject: blk-mq: don't allocate driver tag upfront for flush rq The idea behind it is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and that causes an IO hang. And get/put driver tag is actually noop for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for flush rq. It works just fine to follow the same approach as normal requests: allocate driver tag for each rq just before calling ->queue_rq(). One driver visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in legacy path. Then flush rq need not be treated specially wrt. get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 35 ++++++++++++++++++++++++++--------- block/blk-mq-sched.c | 42 +++++------------------------------------- block/blk-mq.c | 41 ++++++----------------------------------- 3 files changed, 37 insertions(+), 81 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index a9773d2075ac..f17170675917 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. 
*/ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e7094f44afaf..01a43fed6b8c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } - if (has_sched) { + if (has_sched) rq->rq_flags |= RQF_SORTED; - WARN_ON(rq->tag != -1); - } return false; } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. - */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + blk_insert_flush(rq); + goto run; } + WARN_ON(e && (rq->tag != -1)); + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; @@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). 
- */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, true, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 14f6886fbec8..c501cbd0de93 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -996,30 +998,6 @@ done: return rq->tag != -1; } -/* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. - */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. @@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, nxt = list_first_entry(list, struct request, queuelist); blk_mq_put_driver_tag(nxt); } - blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; -- cgit v1.2.3-58-ga151 From 05b79413946d8b2b58999ea1ae844b6fc3c54f61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 10:38:29 -0700 Subject: Revert "blk-mq: don't handle TAG_SHARED in restart" This reverts commit 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b. We have cases that aren't covered 100% in the drivers, so for now we have to retain the shared tag restart loops. 
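The loop being restored matters when several request queues share one tag set (BLK_MQ_F_TAG_SHARED): a tag freed on one queue may be exactly what a starved hardware queue of a sibling queue is waiting for. The walk visits the other queues in the tag list first and then the local queue's hardware queues starting at hctx->queue_num + 1, so restarts are spread round-robin instead of always favouring the same hctx. In outline (full hunk below):

        if (set->flags & BLK_MQ_F_TAG_SHARED) {
                /* walk every queue sharing this tag set, round-robin,
                 * and restart the first hctx found with
                 * BLK_MQ_S_SCHED_RESTART set */
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }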
Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 01a43fed6b8c..6f4bdb8209f7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,17 +68,25 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; + return false; - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return; + return true; } + + return false; } /* @@ -362,6 +370,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. + * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) + +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. + */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; + } +done: + rcu_read_unlock(); + } else { + blk_mq_sched_restart_hctx(hctx); + } +} + void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { -- cgit v1.2.3-58-ga151 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. 
In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 11 +++++------ arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/blk-settings.c | 2 +- include/linux/bio.h | 12 ------------ 4 files changed, 8 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01c0a03407cc..86927029a52d 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the bio_kmap_irq and friends -routines as appropriate. A driver could also use the blk_queue_bounce() -routine on its own to bounce highmem i/o to low memory for specific requests -if so desired. +kmaps as needed on such occasions as appropriate. A driver could also use +the blk_queue_bounce() routine on its own to bounce highmem i/o to low +memory for specific requests if so desired. iii. The i/o scheduler algorithm itself can be replaced/set as appropriate @@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to -temporarily map a bio into the virtual address space. +(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map +a bio into the virtual address space. 8. Prior/Related/Impacted patches diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index c45b90bb9339..eacf1e433518 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = __bio_kmap_atomic(bio, iter); + char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - __bio_kunmap_atomic(buffer); + kunmap_atomic(buffer) } bio_endio(bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c52..48ebe6be07b7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. 
**/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f..d4eec19a6d3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.3-58-ga151 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 18 ++++++++---------- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7..452f94f1c5d4 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. 
*/ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd85..edcfff974527 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eab..92fd870e1315 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int diff --git a/include/linux/blkdev.h 
b/include/linux/blkdev.h index 225617dd0a3f..1437ef4d8037 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.3-58-ga151 From 0c6af1ccd5fd9ac640aef01c8de0043837451a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 8 Nov 2017 09:11:22 +0800 Subject: blk-mq: put driver tag if dispatch budget can't be got We have to put the driver tag if dispatch budget can't be got, otherwise it might cause IO deadlock, especially in case that size of tags is very small. Fixes: de1482974080(blk-mq: introduce .get_budget and .put_budget in blk_mq_ops) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c501cbd0de93..3d759bb8a5bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1080,8 +1080,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); break; + } list_del_init(&rq->queuelist); -- cgit v1.2.3-58-ga151 From aba7afc5671c23beade64d10caf86e24a9105dab Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Nov 2017 10:23:45 -0800 Subject: blk-mq: Avoid that request queue removal can trigger list corruption Avoid that removal of a request queue sporadically triggers the following warning: list_del corruption. next->prev should be ffff8807d649b970, but was 6b6b6b6b6b6b6b6b WARNING: CPU: 3 PID: 342 at lib/list_debug.c:56 __list_del_entry_valid+0x92/0xa0 Call Trace: process_one_work+0x11b/0x660 worker_thread+0x3d/0x3b0 kthread+0x129/0x140 ret_from_fork+0x27/0x40 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b8d1aa2d1008..5e81dcf4690a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,6 +339,7 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { -- cgit v1.2.3-58-ga151 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. 
That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 85 ++++++++++++++++++++++++++++---------------------- include/linux/blk-mq.h | 5 ++- 3 files changed, 50 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..fed8165973a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ done: return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. 
*/ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. 
*/ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3-58-ga151 From 055f6e18e08f5b7fd98171fce857a0bad87a919d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 10:49:53 -0800 Subject: block: Make q_usage_counter also track legacy requests This patch makes it possible to pause request allocation for the legacy block layer by calling blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). Signed-off-by: Ming Lei [ bvanassche: Combined two patches into one, edited a comment and made sure REQ_NOWAIT is handled properly in blk_old_get_request() ] Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++++++++ block/blk-mq.c | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5e81dcf4690a..a4362849059a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -612,6 +612,9 @@ void blk_set_queue_dying(struct request_queue *q) } spin_unlock_irq(q->queue_lock); } + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -1398,16 +1401,22 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, gfp_t gfp_mask) { struct request *rq; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || + (op & REQ_NOWAIT)); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1579,6 +1588,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1860,8 +1870,10 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. 
*/ + blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; diff --git a/block/blk-mq.c b/block/blk-mq.c index fed8165973a3..7173d4bd64af 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -126,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -256,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); - - /* - * If we are called because the queue has now been marked as - * dying, we need to ensure that processes currently waiting on - * the queue are notified as well. - */ - wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3-58-ga151 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL into __GFP_DIRECT_RECLAIM. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 50 +++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 +++ 2 files changed, 38 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a4362849059a..0f7093dfc010 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1160,7 +1160,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLQ_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1170,7 +1170,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1179,6 +1179,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1339,7 +1341,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. 
@@ -1349,7 +1351,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1361,7 +1363,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1370,7 +1372,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1397,10 +1399,13 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, unsigned int flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1413,7 +1418,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1427,25 +1432,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + unsigned int flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1871,7 +1891,7 @@ get_rq: * Returns with the queue unlocked. 
*/ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d8037..1af5ddd0f631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3-58-ga151 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 0f7093dfc010..17eed16a6e04 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1263,6 +1263,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1444,7 +1446,7 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7173d4bd64af..e21876778cec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352c..82b56609736a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.3-58-ga151 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the 
next patch to let the block layer core know whether or not a SCSI request queue has been quiesced. A quiesced SCSI queue namely only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 6 ++++++ 3 files changed, 37 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17eed16a6e04..edc276899116 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,6 +348,36 @@ void blk_sync_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_sync_queue); +/** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. + */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bb7f08415203..e4f2bb936e66 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -74,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f631..2147e2381a22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3-58-ga151 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. 
* The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why SCSI devices are quiesced while the hibernation image is being written or restored. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests from being submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++-------- block/blk-mq.c | 4 ++-- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++------------ fs/block_dev.c | 4 ++-- include/linux/blkdev.h | 2 +- include/scsi/scsi_device.h | 1 + 6 files changed, 70 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index edc276899116..29b08428ae45 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q) spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, unsigned int flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen.
+ */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || - (op & REQ_NOWAIT)); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio) if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - if (unlikely(blk_queue_enter(q, nowait))) { + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else diff --git a/block/blk-mq.c b/block/blk-mq.c index e21876778cec..211bc8a3e2cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebe..f907e2f8c1dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+ */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4afa4d5ff969..04973f484422 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22..402c9d536ae1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708..6f0f1e242e23 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -219,6 +219,7 @@ struct scsi_device { unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; + struct task_struct *quiesced_by; unsigned long sdev_data[0]; } __attribute__((aligned(sizeof(unsigned long)))); -- cgit v1.2.3-58-ga151 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. 
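(As an illustration of the kind of misuse sparse can now catch, a sketch only, not taken from the patch; sketch_alloc() is a hypothetical caller.)

#include <linux/blk-mq.h>

static struct request *sketch_alloc(struct request_queue *q)
{
	/* fine: the flag constants carry the __bitwise restricted type */
	struct request *rq = blk_mq_alloc_request(q, REQ_OP_READ,
						  BLK_MQ_REQ_NOWAIT);

	/*
	 * sparse would warn on the call below: a bare integer is not a
	 * blk_mq_req_flags_t, so either the named constant above or an
	 * explicit (__force blk_mq_req_flags_t) cast is required.
	 *
	 * rq = blk_mq_alloc_request(q, REQ_OP_READ, 1);
	 */
	return rq;
}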
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 8 files changed, 30 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 29b08428ae45..7c54c195e79e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -801,7 +801,7 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ -int blk_queue_enter(struct request_queue *q, unsigned int flags) +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool preempt = flags & BLK_MQ_REQ_PREEMPT; @@ -1225,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1408,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1458,7 +1458,7 @@ retry: /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, unsigned int flags) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : @@ -1495,7 +1495,7 @@ static struct request *blk_old_get_request(struct request_queue *q, * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct request *req; @@ -2291,7 +2291,7 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 211bc8a3e2cc..bfe24a5b62a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -383,7 +383,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -409,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 2502f40ccdc0..99a19c5523e2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -111,7 +111,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa..878d5c09d15b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b7..23b8504ace7f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736a..b326208277ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 
<< 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb..13ccfc9b210a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae1..e80ea1d31343 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.3-58-ga151 From 17eac0996341cfd28b4093554e662fd82aa1c237 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: block: create 'slaves' and 'holders' entries for hidden gendisks When creating nvme multipath devices we should populate the 'slaves' and 'holders' directorys properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 835e907e6e01..3de1671631bf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,14 +585,14 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (disk->flags & GENHD_FL_HIDDEN) { dev_set_uevent_suppress(ddev, 0); return; } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - /* No minors to use for partitions */ if (!disk_part_scan_enabled(disk)) goto exit; @@ -728,11 +728,11 @@ void del_gendisk(struct gendisk *disk) WARN_ON(1); } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - } + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; -- cgit v1.2.3-58-ga151 From f0fba398fec65ff5a2e1bf8ae62718ec0450abaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Nov 2017 17:53:53 +0000 Subject: block: avoid null pointer dereference on null disk It is possible that the pointer disk can be null and hence we can get a null pointer deference when accessing disk->flags. Add a null pointer check to avoid the dereference. Detected by CoverityScan, CID#1461133 ("Explicit null dereferenced") Fixes: 8ddcd653257c ("block: introduce GENHD_FL_HIDDEN") Reviewed-by: Christoph Hellwig Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3de1671631bf..997e598f3b86 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -798,7 +798,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { put_disk(disk); disk = NULL; } -- cgit v1.2.3-58-ga151 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This results in a lot of extra and pointless queue runs, potentially, on flush requests and (much worse) on tag starvation situations. This is observable just looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it. 
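(Illustrative sketch only, not from the patch: with the check folded in, a caller no longer open-codes the pending test and can use the new return value to learn whether the queue was actually run; sketch_kick_hctx() is hypothetical.)

#include <linux/blk-mq.h>

static bool sketch_kick_hctx(struct blk_mq_hw_ctx *hctx)
{
	/*
	 * Before: if (blk_mq_hctx_has_pending(hctx))
	 *                 blk_mq_run_hw_queue(hctx, true);
	 *
	 * After: the run helper performs the pending check itself and
	 * returns true only if the hardware queue had work and was run.
	 */
	return blk_mq_run_hw_queue(hctx, true);
}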
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 +------ block/blk-mq.c | 18 +++++++++++------- block/blk-mq.h | 2 -- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 6f4bdb8209f7..c117bd8fd1f6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,12 +81,7 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - - return false; + return blk_mq_run_hw_queue(hctx, true); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index bfe24a5b62a3..a2a4271f5ab8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -61,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -1253,9 +1253,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1265,8 +1270,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); diff --git a/block/blk-mq.h b/block/blk-mq.h index 99a19c5523e2..dcf379a892dd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -26,14 +26,12 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee..eb1e2cdffb31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void 
blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3-58-ga151 From f906a6a0f42684715b552ccff9282b22cfe2b5a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 16:10:13 -0700 Subject: blk-mq: improve tag waiting setup for non-shared tags If we run out of driver tags, we currently treat shared and non-shared tags the same - both cases hook into the tag waitqueue. This is a bit more costly than it needs to be on unshared tags, since we have to both grab the hctx lock, and the waitqueue lock (and disable interrupts). For the non-shared case, we can simply mark the queue as needing a restart. Split blk_mq_dispatch_wait_add() to account for both cases, and rename it to blk_mq_mark_tag_wait() to better reflect what it does now. Without this patch, shared and non-shared performance is about the same with 4 fio thread hammering on a single null_blk device (~410K, at 75% sys). With the patch, the shared case is the same, but the non-shared tags case runs at 431K at 71% sys. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 81 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a2a4271f5ab8..3295859f419d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1006,44 +1006,68 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, - struct request *rq) +/* + * Mark us waiting for a tag. For shared tags, this involves hooking us into + * the tag wakeups. For non-shared tags, we can simply mark us nedeing a + * restart. For both caes, take care to check the condition again after + * marking us as waiting. + */ +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, + struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - wait_queue_entry_t *wait = &this_hctx->dispatch_wait; + bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; + wait_queue_entry_t *wait; + bool ret; - if (!list_empty_careful(&wait->entry)) - return false; + if (!shared_tags) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + } else { + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + } /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ - if (!blk_mq_get_driver_tag(rq, hctx, false)) { + ret = blk_mq_get_driver_tag(rq, hctx, false); + + if (!shared_tags) { + /* + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. 
+ */ + return ret; + } else { + if (!ret) { + spin_unlock(&this_hctx->lock); + return false; + } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); spin_unlock(&this_hctx->lock); - return false; + return true; } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); - return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, @@ -1076,10 +1100,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); - no_tag = true; + /* + * For non-shared tags, the RESTART check + * will suffice. + */ + if (hctx->flags & BLK_MQ_F_TAG_SHARED) + no_tag = true; break; } } -- cgit v1.2.3-58-ga151 From ff821d271415f17f1a704600420e92ebb9bfb32c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 22:05:12 -0700 Subject: blk-mq: fixup some comment typos and lengths Various typos and/or spelling errors in comments. Fixes a few > 80 char lines as well. Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3295859f419d..b600463791ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -707,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, /* * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertation from the workqueue. + * request head insertion from the workqueue. */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); @@ -1137,7 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE) { /* * If an I/O scheduler has been configured and we got a - * driver tag for the next request already, free it again. + * driver tag for the next request already, free it + * again. */ if (!list_empty(list)) { nxt = list_first_entry(list, struct request, queuelist); @@ -2299,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_lock(&set->tag_list_lock); - /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ - if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + /* + * Check to see if we're transitioning to shared (from 1 to 2 queues). + */ + if (!list_empty(&set->tag_list) && + !(set->flags & BLK_MQ_F_TAG_SHARED)) { set->flags |= BLK_MQ_F_TAG_SHARED; /* update existing queue */ blk_mq_update_tag_set_depth(set, true); @@ -2532,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q) /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe - * we should change hctx numa_node according to new topology (this - * involves free and re-allocate memory, worthy doing?) + * we should change hctx numa_node according to the new topology (this + * involves freeing and re-allocating memory, worth doing?) */ - blk_mq_map_swqueue(q); blk_mq_sysfs_register(q); -- cgit v1.2.3-58-ga151
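To make the shared/non-shared split in blk_mq_mark_tag_wait() above easier to follow, here is a standalone sketch of the same decision logic with stand-in names (hctx, mark_tag_wait, get_driver_tag, restart_flag, on_waitqueue). It is a simplified model, not the kernel implementation: locking, the sbitmap waitqueue and the hctx repointing are all elided. What it keeps is the shape of the patch: non-shared tags only set a restart flag, shared tags hook into a waitqueue, and both paths retry the tag allocation once after marking themselves.

/*
 * Standalone sketch with stand-in types, not the kernel code: models the
 * shared vs. non-shared tag handling added by blk_mq_mark_tag_wait().
 */
#include <stdbool.h>
#include <stdio.h>

struct hctx {
	bool shared_tags;	/* stand-in for BLK_MQ_F_TAG_SHARED */
	bool restart_flag;	/* stand-in for BLK_MQ_S_SCHED_RESTART */
	bool on_waitqueue;	/* stand-in for "dispatch_wait is queued" */
	int free_tags;		/* crude model of the driver tag space */
};

/* Stand-in for blk_mq_get_driver_tag(): succeeds while tags remain. */
static bool get_driver_tag(struct hctx *h)
{
	if (h->free_tags > 0) {
		h->free_tags--;
		return true;
	}
	return false;
}

/* Returns true if a tag was obtained after all, false if dispatch must stop. */
static bool mark_tag_wait(struct hctx *h)
{
	bool got;

	if (!h->shared_tags) {
		/* Cheap path: just note that a queue re-run is needed. */
		h->restart_flag = true;
	} else {
		/* Costly path: hook into the tag waitqueue (locking elided). */
		if (h->on_waitqueue)
			return false;
		h->on_waitqueue = true;
	}

	/*
	 * A tag may have been freed between the allocation failure and the
	 * marking above, so check once more in both cases.
	 */
	got = get_driver_tag(h);

	if (!h->shared_tags)
		return got;		/* leave the restart flag set */

	if (got)
		h->on_waitqueue = false; /* hand the wakeup to someone else */
	return got;
}

int main(void)
{
	struct hctx unshared = { .shared_tags = false, .free_tags = 0 };
	struct hctx shared = { .shared_tags = true, .free_tags = 0 };
	bool got_unshared, got_shared;

	got_unshared = mark_tag_wait(&unshared);
	printf("unshared: got=%d restart=%d\n", got_unshared, unshared.restart_flag);

	got_shared = mark_tag_wait(&shared);
	printf("shared:   got=%d queued=%d\n", got_shared, shared.on_waitqueue);
	return 0;
}

Run with no free tags, the non-shared context comes back with got=0 and restart=1 (a later queue run will suffice), while the shared context comes back with got=0 and stays queued on the waitqueue; this mirrors why the dispatch-loop hunk above only sets no_tag when BLK_MQ_F_TAG_SHARED is set.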