From a22c00be90de188d36f4772ef7b268aa48d7010d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Nov 2021 06:56:09 -0600 Subject: block: assign correct tag before doing prefetch of request Ensure that current tag is correctly assigned before attempting to prefetch the first cacheline of the request. Fixes: 92aff191cc5b ("block: prefetch request to be initialized") Reported-and-tested-by: syzbot+cd20829ac44b92bf6ed0@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 221d1b7d10d6..4787d5b74aa3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -405,8 +405,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; - prefetch(tags->static_rqs[tag]); tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); rq_list_add(data->cached_rq, rq); -- cgit v1.2.3-58-ga151 From b22809092c70099f4d8c3b6f3d34c5bc89b300ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Nov 2021 13:40:12 -0600 Subject: block: replace always false argument with 'false' A previous commit fixed up the condition for doing direct issue, but that left the 'from_schedule' argument dead inside the branch. Replace it with 'false'. Fixes: ff1552232b36 ("blk-mq: don't issue request directly in case that current is to be blocked") Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4787d5b74aa3..8aed6cea3a34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2227,7 +2227,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, from_schedule); + blk_mq_plug_issue_direct(plug, false); if (rq_list_empty(plug->mq_list)) return; } -- cgit v1.2.3-58-ga151 From 781dd830ec4f4d56b99d5d0c64bacda4c3ee3cfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Nov 2021 08:34:09 -0600 Subject: block: move RQF_ELV setting into allocators It's not safe to do this before blk_queue_enter(), as the scheduler state could have changed in between. Hence move the RQF_ELV setting into the allocators, where we know the queue is already entered. Suggested-by: Ming Lei Reported-by: Yi Zhang Reported-by: Steffen Maier Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8aed6cea3a34..00263d896843 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -419,7 +419,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; @@ -431,7 +430,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; - if (e) { + if (q->elevator) { + struct elevator_queue *e = q->elevator; + + data->rq_flags |= RQF_ELV; + /* * Flush/passthrough requests are special and go directly to the * dispatch list. 
Don't include reserved tags in the @@ -447,7 +450,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!e) + if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); /* @@ -490,7 +493,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, .q = q, .flags = flags, .cmd_flags = op, - .rq_flags = q->elevator ? RQF_ELV : 0, .nr_tags = 1, }; struct request *rq; @@ -520,7 +522,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .q = q, .flags = flags, .cmd_flags = op, - .rq_flags = q->elevator ? RQF_ELV : 0, .nr_tags = 1, }; u64 alloc_time_ns = 0; @@ -561,6 +562,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!q->elevator) blk_mq_tag_busy(data.hctx); + else + data.rq_flags |= RQF_ELV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); @@ -2515,7 +2518,6 @@ void blk_mq_submit_bio(struct bio *bio) .q = q, .nr_tags = 1, .cmd_flags = bio->bi_opf, - .rq_flags = q->elevator ? RQF_ELV : 0, }; if (plug) { -- cgit v1.2.3-58-ga151 From 3b87c6ea671a18fb77709240d658f4201904f8e4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Nov 2021 23:36:19 +0800 Subject: blk-mq: update hctx->nr_active in blk_mq_end_request_batch() In case of shared tags and none io sched, batched completion still may be run into, and hctx->nr_active is accounted when getting driver tag, so it has to be updated in blk_mq_end_request_batch(). Otherwise, hctx->nr_active may become same with queue depth, then hctx_may_queue() always return false, then io hang is caused. Fixes the issue by updating the counter in batched way. Reported-by: Shinichiro Kawasaki Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211102153619.3627505-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++++++ block/blk-mq.h | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 00263d896843..c68aa0a332e1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -818,6 +818,13 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; + /* + * All requests should have been marked as RQF_MQ_INFLIGHT, so + * update hctx->nr_active in batch + */ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, nr_tags); + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 28859fc5faee..cb0b5482ca5e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -225,12 +225,18 @@ static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) atomic_inc(&hctx->nr_active); } -static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) { if (blk_mq_is_shared_tags(hctx->flags)) - atomic_dec(&hctx->queue->nr_active_requests_shared_tags); + atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else - atomic_dec(&hctx->nr_active); + atomic_sub(val, &hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_sub_active_requests(hctx, 1); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3-58-ga151 From 
c5fc7b93173661336b9cc4e32fd5082a95e12b94 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:49:07 -0600 Subject: block: have plug stored requests hold references to the queue Requests that were stored in the cache deliberately didn't hold an enter reference to the queue, instead we grabbed one every time we pulled a request out of there. That made for awkward logic on freeing the remainder of the cached list, if needed, where we had to artificially raise the queue usage count before each free. Grab references up front for cached plug requests. That's safer, and also more efficient. Fixes: 47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +++++++- block/blk-mq.c | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index fd389a16013c..35a87c06276e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1643,7 +1643,13 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) flush_plug_callbacks(plug, from_schedule); if (!rq_list_empty(plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); - if (unlikely(!from_schedule && plug->cached_rq)) + /* + * Unconditionally flush out cached requests, even if the unplug + * event came from schedule. Since we know hold references to the + * queue for cached requests, we don't want a blocked task holding + * up a queue freeze/quiesce event. + */ + if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); } diff --git a/block/blk-mq.c b/block/blk-mq.c index c68aa0a332e1..5498454c2164 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -410,7 +410,10 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); rq_list_add(data->cached_rq, rq); + nr++; } + /* caller already holds a reference, add for remainder */ + percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rq); @@ -630,10 +633,8 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { - percpu_ref_get(&rq->q->q_usage_counter); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) blk_mq_free_request(rq); - } } static void req_bio_endio(struct request *rq, struct bio *bio, -- cgit v1.2.3-58-ga151 From 71539717c10521114403d27e171c9cbe35dcd900 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:52:45 -0600 Subject: block: split request allocation components into helpers This is in preparation for a fix, but serves as a cleanup as well moving the cached vs regular alloc logic out of blk_mq_submit_bio(). 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 71 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 23 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5498454c2164..dcb413297a96 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2478,6 +2478,51 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static struct request *blk_mq_get_new_requests(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio) +{ + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + }; + struct request *rq; + + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; + } + + rq = __blk_mq_alloc_requests(&data); + if (rq) + return rq; + + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + return NULL; +} + +static inline struct request *blk_mq_get_request(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio) +{ + if (plug) { + struct request *rq; + + rq = rq_list_peek(&plug->cached_rq); + if (rq) { + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + return rq; + } + } + + return blk_mq_get_new_requests(q, plug, bio); +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2518,29 +2563,9 @@ void blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); plug = blk_mq_plug(q, bio); - if (plug && plug->cached_rq) { - rq = rq_list_pop(&plug->cached_rq); - INIT_LIST_HEAD(&rq->queuelist); - } else { - struct blk_mq_alloc_data data = { - .q = q, - .nr_tags = 1, - .cmd_flags = bio->bi_opf, - }; - - if (plug) { - data.nr_tags = plug->nr_ios; - plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; - } - rq = __blk_mq_alloc_requests(&data); - if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; - } - } + rq = blk_mq_get_request(q, plug, bio); + if (unlikely(!rq)) + goto queue_exit; trace_block_getrq(bio); -- cgit v1.2.3-58-ga151 From 900e080752025f0016128f07c9ed4c50eba3654b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:47:09 -0600 Subject: block: move queue enter logic into blk_mq_submit_bio() Retain the old logic for the fops based submit, but for our internal blk_mq_submit_bio(), move the queue entering logic into the core function itself. We need to be a bit careful if going into the scheduler, as a scheduler or queue mappings can arbitrarily change before we have entered the queue. Have the bio scheduler mapping do that separately, it's a very cheap operation compared to actually doing merging locking and lookups. Reviewed-by: Christoph Hellwig [axboe: update to check merge post submit_bio_checks() doing remap...] 
Signed-off-by: Jens Axboe --- block/blk-core.c | 25 +++++++++++----------- block/blk-mq-sched.c | 13 +++++++++--- block/blk-mq.c | 60 +++++++++++++++++++++++++++++++++++----------------- block/blk.h | 1 + 4 files changed, 65 insertions(+), 34 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index 9ca3ddd154d4..4366056e14c4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -744,7 +744,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) +noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); @@ -862,22 +862,23 @@ end_io: return false; } -static void __submit_bio(struct bio *bio) +static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (unlikely(bio_queue_enter(bio) != 0)) return; + if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); +} - if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) - goto queue_exit; - if (!disk->fops->submit_bio) { +static void __submit_bio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); - return; - } - disk->fops->submit_bio(bio); -queue_exit: - blk_queue_exit(disk->queue); + else + __submit_bio_fops(disk, bio); } /* diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4a6789e4398b..4be652fa38e7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -370,15 +370,20 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(q, bio, nr_segs); + if (bio_queue_enter(bio)) + return false; + + if (e && e->type->ops.bio_merge) { + ret = e->type->ops.bio_merge(q, bio, nr_segs); + goto out_put; + } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) - return false; + goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); @@ -391,6 +396,8 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, ret = true; spin_unlock(&ctx->lock); +out_put: + blk_queue_exit(q); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb413297a96..5fe40c85a308 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2478,9 +2478,23 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static bool blk_attempt_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, bool *same_queue_rq) +{ + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + return true; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + return true; + } + return false; +} + static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) { struct blk_mq_alloc_data data = { .q = q, @@ -2489,6 +2503,15 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; + if (unlikely(bio_queue_enter(bio))) + return NULL; + if (unlikely(!submit_bio_checks(bio))) + goto put_exit; + if 
(blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + goto put_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2502,25 +2525,34 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +put_exit: + blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_request(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) { if (plug) { struct request *rq; rq = rq_list_peek(&plug->cached_rq); if (rq) { + if (unlikely(!submit_bio_checks(bio))) + return NULL; + if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + return NULL; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); + rq_qos_throttle(q, bio); return rq; } } - return blk_mq_get_new_requests(q, plug, bio); + return blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); } /** @@ -2546,26 +2578,20 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) - goto queue_exit; - - if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { - if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; - } - - rq_qos_throttle(q, bio); + return; plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio); + rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); if (unlikely(!rq)) - goto queue_exit; + return; trace_block_getrq(bio); @@ -2646,10 +2672,6 @@ void blk_mq_submit_bio(struct bio *bio) /* Default case. */ blk_mq_sched_insert_request(rq, false, true, true); } - - return; -queue_exit: - blk_queue_exit(q); } static size_t order_to_size(unsigned int order) diff --git a/block/blk.h b/block/blk.h index 814d9632d43e..b4fed2033e48 100644 --- a/block/blk.h +++ b/block/blk.h @@ -56,6 +56,7 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); +bool submit_bio_checks(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { -- cgit v1.2.3-58-ga151 From 10c47870155b5d9a8597eff3345d244e2fe1847f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Nov 2021 11:54:47 -0600 Subject: block: ensure cached plug request matches the current queue If we're driving multiple devices, we could have pre-populated the cache for a different device. Ensure that the empty request matches the current queue. 
Fixes: 47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5fe40c85a308..bbe1fb2dd58d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2540,7 +2540,7 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, struct request *rq; rq = rq_list_peek(&plug->cached_rq); - if (rq) { + if (rq && rq->q == q) { if (unlikely(!submit_bio_checks(bio))) return NULL; if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) -- cgit v1.2.3-58-ga151 From a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 8 Nov 2021 15:40:19 +0800 Subject: blk-mq: don't free tags if the tag_set is used by other device in queue initialztion We got UAF report on v5.10 as follows: [ 1446.674930] ================================================================== [ 1446.675970] BUG: KASAN: use-after-free in blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.676902] Read of size 8 at addr ffff8880185afd10 by task kworker/1:2/12348 [ 1446.677851] [ 1446.678073] CPU: 1 PID: 12348 Comm: kworker/1:2 Not tainted 5.10.0-10177-gc9c81b1e346a #2 [ 1446.679168] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1446.680692] Workqueue: kthrotld blk_throtl_dispatch_work_fn [ 1446.681448] Call Trace: [ 1446.681800] dump_stack+0x9b/0xce [ 1446.682916] print_address_description.constprop.6+0x3e/0x60 [ 1446.685999] kasan_report.cold.9+0x22/0x3a [ 1446.687186] blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.687785] blk_mq_dispatch_rq_list+0x21a/0x1d40 [ 1446.692576] __blk_mq_do_dispatch_sched+0x394/0x830 [ 1446.695758] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [ 1446.698279] blk_mq_sched_dispatch_requests+0xdf/0x140 [ 1446.698967] __blk_mq_run_hw_queue+0xc0/0x270 [ 1446.699561] __blk_mq_delay_run_hw_queue+0x4cc/0x550 [ 1446.701407] blk_mq_run_hw_queue+0x13b/0x2b0 [ 1446.702593] blk_mq_sched_insert_requests+0x1de/0x390 [ 1446.703309] blk_mq_flush_plug_list+0x4b4/0x760 [ 1446.705408] blk_flush_plug_list+0x2c5/0x480 [ 1446.708471] blk_finish_plug+0x55/0xa0 [ 1446.708980] blk_throtl_dispatch_work_fn+0x23b/0x2e0 [ 1446.711236] process_one_work+0x6d4/0xfe0 [ 1446.711778] worker_thread+0x91/0xc80 [ 1446.713400] kthread+0x32d/0x3f0 [ 1446.714362] ret_from_fork+0x1f/0x30 [ 1446.714846] [ 1446.715062] Allocated by task 1: [ 1446.715509] kasan_save_stack+0x19/0x40 [ 1446.716026] __kasan_kmalloc.constprop.1+0xc1/0xd0 [ 1446.716673] blk_mq_init_tags+0x6d/0x330 [ 1446.717207] blk_mq_alloc_rq_map+0x50/0x1c0 [ 1446.717769] __blk_mq_alloc_map_and_request+0xe5/0x320 [ 1446.718459] blk_mq_alloc_tag_set+0x679/0xdc0 [ 1446.719050] scsi_add_host_with_dma.cold.3+0xa0/0x5db [ 1446.719736] virtscsi_probe+0x7bf/0xbd0 [ 1446.720265] virtio_dev_probe+0x402/0x6c0 [ 1446.720808] really_probe+0x276/0xde0 [ 1446.721320] driver_probe_device+0x267/0x3d0 [ 1446.721892] device_driver_attach+0xfe/0x140 [ 1446.722491] __driver_attach+0x13a/0x2c0 [ 1446.723037] bus_for_each_dev+0x146/0x1c0 [ 1446.723603] bus_add_driver+0x3fc/0x680 [ 1446.724145] driver_register+0x1c0/0x400 [ 1446.724693] init+0xa2/0xe8 [ 1446.725091] do_one_initcall+0x9e/0x310 [ 1446.725626] kernel_init_freeable+0xc56/0xcb9 [ 1446.726231] kernel_init+0x11/0x198 [ 1446.726714] ret_from_fork+0x1f/0x30 [ 1446.727212] [ 1446.727433] Freed by task 26992: [ 1446.727882] 
kasan_save_stack+0x19/0x40 [ 1446.728420] kasan_set_track+0x1c/0x30 [ 1446.728943] kasan_set_free_info+0x1b/0x30 [ 1446.729517] __kasan_slab_free+0x111/0x160 [ 1446.730084] kfree+0xb8/0x520 [ 1446.730507] blk_mq_free_map_and_requests+0x10b/0x1b0 [ 1446.731206] blk_mq_realloc_hw_ctxs+0x8cb/0x15b0 [ 1446.731844] blk_mq_init_allocated_queue+0x374/0x1380 [ 1446.732540] blk_mq_init_queue_data+0x7f/0xd0 [ 1446.733155] scsi_mq_alloc_queue+0x45/0x170 [ 1446.733730] scsi_alloc_sdev+0x73c/0xb20 [ 1446.734281] scsi_probe_and_add_lun+0x9a6/0x2d90 [ 1446.734916] __scsi_scan_target+0x208/0xc50 [ 1446.735500] scsi_scan_channel.part.3+0x113/0x170 [ 1446.736149] scsi_scan_host_selected+0x25a/0x360 [ 1446.736783] store_scan+0x290/0x2d0 [ 1446.737275] dev_attr_store+0x55/0x80 [ 1446.737782] sysfs_kf_write+0x132/0x190 [ 1446.738313] kernfs_fop_write_iter+0x319/0x4b0 [ 1446.738921] new_sync_write+0x40e/0x5c0 [ 1446.739429] vfs_write+0x519/0x720 [ 1446.739877] ksys_write+0xf8/0x1f0 [ 1446.740332] do_syscall_64+0x2d/0x40 [ 1446.740802] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1446.741462] [ 1446.741670] The buggy address belongs to the object at ffff8880185afd00 [ 1446.741670] which belongs to the cache kmalloc-256 of size 256 [ 1446.743276] The buggy address is located 16 bytes inside of [ 1446.743276] 256-byte region [ffff8880185afd00, ffff8880185afe00) [ 1446.744765] The buggy address belongs to the page: [ 1446.745416] page:ffffea0000616b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x185ac [ 1446.746694] head:ffffea0000616b00 order:2 compound_mapcount:0 compound_pincount:0 [ 1446.747719] flags: 0x1fffff80010200(slab|head) [ 1446.748337] raw: 001fffff80010200 ffffea00006a3208 ffffea000061bf08 ffff88801004f240 [ 1446.749404] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 [ 1446.750455] page dumped because: kasan: bad access detected [ 1446.751227] [ 1446.751445] Memory state around the buggy address: [ 1446.752102] ffff8880185afc00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.753090] ffff8880185afc80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.754079] >ffff8880185afd00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.755065] ^ [ 1446.755589] ffff8880185afd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.756574] ffff8880185afe00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.757566] ================================================================== Flag 'BLK_MQ_F_TAG_QUEUE_SHARED' will be set if the second device on the same host initializes it's queue successfully. However, if the second device failed to allocate memory in blk_mq_alloc_and_init_hctx() from blk_mq_realloc_hw_ctxs() from blk_mq_init_allocated_queue(), __blk_mq_free_map_and_rqs() will be called on error path, and if 'BLK_MQ_TAG_HCTX_SHARED' is not set, 'tag_set->tags' will be freed while it's still used by the first device. To fix this issue we move release newly allocated hardware context from blk_mq_realloc_hw_ctxs to __blk_mq_update_nr_hw_queues. As there is needn't to release hardware context in blk_mq_init_allocated_queue. 
Fixes: 868f2f0b7206 ("blk-mq: dynamic h/w context count") Signed-off-by: Ye Bin Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20211108074019.1058843-1-yebin10@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index bbe1fb2dd58d..5a9cd9fe8da3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3657,7 +3657,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - __blk_mq_free_map_and_rqs(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -4165,8 +4164,13 @@ fallback: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); + set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; -- cgit v1.2.3-58-ga151 From 9ef4d0209cbadb63656a7aa29fde49c27ab2b9bf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:41 +0800 Subject: blk-mq: add one API for waiting until quiesce is done Some drivers(NVMe, SCSI) need to call quiesce and unquiesce in pair, but it is hard to switch to this style, so these drivers need one atomic flag for helping to balance quiesce and unquiesce. When quiesce is in-progress, the driver still needs to wait until the quiesce is done, so add API of blk_mq_wait_quiesce_done() for these drivers. Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211109071144.181581-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 28 ++++++++++++++++++++-------- include/linux/blk-mq.h | 1 + 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a9cd9fe8da3..d3e5fcbc943b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -251,22 +251,18 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q) EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** - * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished + * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @q: request queue. * - * Note: this function does not prevent that the struct request end_io() - * callback function is invoked. Once this function is returned, we make - * sure no dispatch can happen until the queue is unquiesced via - * blk_mq_unquiesce_queue(). + * Note: it is driver's responsibility for making sure that quiesce has + * been started. */ -void blk_mq_quiesce_queue(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned int i; bool rcu = false; - blk_mq_quiesce_queue_nowait(q); - queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(hctx->srcu); @@ -276,6 +272,22 @@ void blk_mq_quiesce_queue(struct request_queue *q) if (rcu) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); + +/** + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. 
Once this function is returned, we make + * sure no dispatch can happen until the queue is unquiesced via + * blk_mq_unquiesce_queue(). + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + blk_mq_quiesce_queue_nowait(q); + blk_mq_wait_quiesce_done(q); +} EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b4039fdf1b04..d53ee59ba131 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -803,6 +803,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit v1.2.3-58-ga151
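
The blk_mq_wait_quiesce_done() helper added in the last patch above is aimed at drivers (the commit message names NVMe and SCSI) that start quiesce with blk_mq_quiesce_queue_nowait() behind an atomic flag and still need every caller to wait until dispatch has actually stopped. Below is a minimal sketch of that pairing; struct my_drv, the MY_DRV_QUIESCED bit and the helper names are illustrative assumptions, and only the three blk_mq_*() calls come from the patches above.

    #include <linux/blk-mq.h>
    #include <linux/bitops.h>

    /* Hypothetical per-device state; only the flag bit matters here. */
    struct my_drv {
            struct request_queue    *queue;
            unsigned long           flags;
    };
    #define MY_DRV_QUIESCED 0       /* bit in ->flags */

    /*
     * Start the quiesce only once, but always wait for it to finish, so a
     * caller that raced with another quiescing path still returns with
     * dispatch guaranteed to be stopped.
     */
    static void my_drv_quiesce(struct my_drv *drv)
    {
            if (!test_and_set_bit(MY_DRV_QUIESCED, &drv->flags))
                    blk_mq_quiesce_queue_nowait(drv->queue);
            blk_mq_wait_quiesce_done(drv->queue);
    }

    /* Unquiesce exactly once, keeping the pair balanced. */
    static void my_drv_unquiesce(struct my_drv *drv)
    {
            if (test_and_clear_bit(MY_DRV_QUIESCED, &drv->flags))
                    blk_mq_unquiesce_queue(drv->queue);
    }

Callers that want the old blocking behaviour can keep using blk_mq_quiesce_queue(), which after this patch is simply the nowait start followed by blk_mq_wait_quiesce_done().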