From ffe81d45322cc3cb140f0db080a4727ea284661e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 4 Dec 2018 20:06:48 -0700
Subject: blk-mq: fix corruption with direct issue

If we attempt a direct issue to a SCSI device, and it returns BUSY, then
we queue the request up normally. However, the SCSI layer may have
already set up SG tables etc. for this particular command. If we later
merge with this request, then the old tables are no longer valid. Once
we issue the IO, we only read/write the original part of the request,
not the new state of it. This causes data corruption, and is most often
noticed with the file system complaining about the just-read data being
invalid:

[  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: comm dpkg-query: bad extra_isize 24937 (inode size 256)

because most of it is garbage...

This doesn't happen from the normal issue path, as we will simply defer
the request to the hardware queue dispatch list if we fail. Once it's on
the dispatch list, we never merge with it.

Fix this from the direct issue path by flagging the request as
REQ_NOMERGE so we don't change the size of it before issue.

See also: https://bugzilla.kernel.org/show_bug.cgi?id=201685

Tested-by: Guenter Roeck
Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case of 'none'")
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a..3262d83b9e07 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1715,6 +1715,15 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 		break;
 	case BLK_STS_RESOURCE:
 	case BLK_STS_DEV_RESOURCE:
+		/*
+		 * If direct dispatch fails, we cannot allow any merging on
+		 * this IO. Drivers (like SCSI) may have set up permanent state
+		 * for this request, like SG tables and mappings, and if we
+		 * merge to it later on then we'll still only do IO to the
+		 * original part.
+		 */
+		rq->cmd_flags |= REQ_NOMERGE;
+
 		blk_mq_update_dispatch_busy(hctx, true);
 		__blk_mq_requeue_request(rq);
 		break;
@@ -1727,6 +1736,18 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * Don't allow direct dispatch of anything but regular reads/writes,
+ * as some of the other commands can potentially share request space
+ * with data we need for the IO scheduler. If we attempt a direct dispatch
+ * on those and fail, we can't safely add it to the scheduler afterwards
+ * without potentially overwriting data that the driver has already written.
+ */
+static bool blk_rq_can_direct_dispatch(struct request *rq)
+{
+	return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
+}
+
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
 						blk_qc_t *cookie,
@@ -1748,7 +1769,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	if (q->elevator && !bypass_insert)
+	if (!blk_rq_can_direct_dispatch(rq) || (q->elevator && !bypass_insert))
 		goto insert;
 
 	if (!blk_mq_get_dispatch_budget(hctx))
@@ -1810,6 +1831,9 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq = list_first_entry(list, struct request,
 				queuelist);
 
+		if (!blk_rq_can_direct_dispatch(rq))
+			break;
+
 		list_del_init(&rq->queuelist);
 		ret = blk_mq_request_issue_directly(rq);
 		if (ret != BLK_STS_OK) {
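The corruption is easier to see in miniature. Below is a small, self-contained C sketch of the hazard this patch closes. It is not kernel code: struct toy_request, driver_prep(), try_merge() and the TOY_REQ_NOMERGE flag are invented stand-ins. A "driver" sizes its SG state when the request is prepped; if the block layer later merges more data into the request, the transfer still covers only the prepped part, which is exactly the corruption described in the commit message. Flagging the request non-mergeable after a failed direct issue, as the patch does with REQ_NOMERGE, keeps the request from growing past its prepped state.

/*
 * Toy model (not kernel code) of the corruption: the "driver" snapshots
 * the request size when it preps its SG state; a later merge grows the
 * request, but the transfer still covers only the snapshot.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_REQ_NOMERGE	(1u << 0)	/* stand-in for REQ_NOMERGE */

struct toy_request {
	unsigned int cmd_flags;
	unsigned int nr_bytes;		/* current size of the request */
	unsigned int prepped_bytes;	/* "SG table" size fixed at prep */
};

/* Driver prep: builds state sized for the request as it is right now. */
static void driver_prep(struct toy_request *rq)
{
	rq->prepped_bytes = rq->nr_bytes;
}

/* Back-merge more data into the request, unless merging is disallowed. */
static bool try_merge(struct toy_request *rq, unsigned int bio_bytes)
{
	if (rq->cmd_flags & TOY_REQ_NOMERGE)
		return false;
	rq->nr_bytes += bio_bytes;
	return true;
}

int main(void)
{
	struct toy_request rq = { .nr_bytes = 4096 };

	driver_prep(&rq);		/* direct issue: SG state set up */

	/* ->queue_rq() returned BUSY; the fix flags the request so the
	 * requeued request can never grow past its prepped state: */
	rq.cmd_flags |= TOY_REQ_NOMERGE;

	if (!try_merge(&rq, 4096))
		printf("merge refused, request stays at %u bytes\n",
		       rq.nr_bytes);

	/* Without the flag the merge succeeds, nr_bytes becomes 8192, but
	 * only prepped_bytes get transferred: the merged tail is garbage. */
	printf("transferred %u of %u bytes\n", rq.prepped_bytes, rq.nr_bytes);
	return 0;
}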
From c616cbee97aed4bc6178f148a7240206dcdb85a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 6 Dec 2018 22:17:44 -0700
Subject: blk-mq: punt failed direct issue to dispatch list

After the direct dispatch corruption fix, we permanently disallow direct
dispatch of non-read/write requests. This works fine from the normal IO
path, as those requests will be retried like any other failed direct
dispatch request. But for the blk_insert_cloned_request() that only DM
uses to bypass the bottom-level scheduler, we always first attempt
direct dispatch. For some types of requests, that's now a permanent
failure, and no amount of retrying will make that succeed. This results
in a livelock.

Instead of making special cases for what we can direct issue, and now
having to deal with DM solving the livelock while still retaining a BUSY
condition feedback loop, always just add a request that has been through
->queue_rq() to the hardware queue dispatch list. Such requests are safe
to insert there, as no merging can take place on the dispatch list.
Additionally, even if requests have prepped driver data, we no longer
rely on that data not sharing space in the request structure in order to
safely add them to the IO scheduler lists.

This basically reverts ffe81d45322c and is based on a patch from Ming,
but with the list insert case covered as well.

Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
Cc: stable@vger.kernel.org
Suggested-by: Ming Lei
Reported-by: Bart Van Assche
Tested-by: Ming Lei
Acked-by: Mike Snitzer
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 33 +++++----------------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3262d83b9e07..6a7566244de3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1715,15 +1715,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 		break;
 	case BLK_STS_RESOURCE:
 	case BLK_STS_DEV_RESOURCE:
-		/*
-		 * If direct dispatch fails, we cannot allow any merging on
-		 * this IO. Drivers (like SCSI) may have set up permanent state
-		 * for this request, like SG tables and mappings, and if we
-		 * merge to it later on then we'll still only do IO to the
-		 * original part.
-		 */
-		rq->cmd_flags |= REQ_NOMERGE;
-
 		blk_mq_update_dispatch_busy(hctx, true);
 		__blk_mq_requeue_request(rq);
 		break;
@@ -1736,18 +1727,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-/*
- * Don't allow direct dispatch of anything but regular reads/writes,
- * as some of the other commands can potentially share request space
- * with data we need for the IO scheduler. If we attempt a direct dispatch
- * on those and fail, we can't safely add it to the scheduler afterwards
- * without potentially overwriting data that the driver has already written.
- */
-static bool blk_rq_can_direct_dispatch(struct request *rq)
-{
-	return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
-}
-
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
 						blk_qc_t *cookie,
@@ -1769,7 +1748,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	if (!blk_rq_can_direct_dispatch(rq) || (q->elevator && !bypass_insert))
+	if (q->elevator && !bypass_insert)
 		goto insert;
 
 	if (!blk_mq_get_dispatch_budget(hctx))
@@ -1785,7 +1764,7 @@ insert:
 	if (bypass_insert)
 		return BLK_STS_RESOURCE;
 
-	blk_mq_sched_insert_request(rq, false, run_queue, false);
+	blk_mq_request_bypass_insert(rq, run_queue);
 	return BLK_STS_OK;
 }
 
@@ -1801,7 +1780,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-		blk_mq_sched_insert_request(rq, false, true, false);
+		blk_mq_request_bypass_insert(rq, true);
 	else if (ret != BLK_STS_OK)
 		blk_mq_end_request(rq, ret);
 
@@ -1831,15 +1810,13 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq = list_first_entry(list, struct request,
 				queuelist);
 
-		if (!blk_rq_can_direct_dispatch(rq))
-			break;
-
 		list_del_init(&rq->queuelist);
 		ret = blk_mq_request_issue_directly(rq);
 		if (ret != BLK_STS_OK) {
 			if (ret == BLK_STS_RESOURCE ||
 					ret == BLK_STS_DEV_RESOURCE) {
-				list_add(&rq->queuelist, list);
+				blk_mq_request_bypass_insert(rq,
+							list_empty(list));
 				break;
 			}
 			blk_mq_end_request(rq, ret);
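To see why punting to the dispatch list resolves the livelock, here is a comparable toy sketch, again not kernel code: the toy_request struct, old_direct_issue(), new_direct_issue() and run_hw_queue() are invented for illustration. old_direct_issue() models the post-ffe81d45322c behavior of permanently refusing non-read/write ops, so a caller like blk_insert_cloned_request() that keeps retrying never makes progress. new_direct_issue() models this patch's approach: a failed request is parked on the hardware dispatch list, where nothing merges with it and the next queue run completes it.

/*
 * Toy model (not kernel code) contrasting the two strategies once a
 * direct issue fails: permanently refusing some ops (livelock for a
 * caller that keeps retrying) vs. punting the request to the hardware
 * dispatch list for the next queue run to pick up, merge-free.
 */
#include <stdbool.h>
#include <stdio.h>

enum toy_op { TOY_READ, TOY_WRITE, TOY_FLUSH };

struct toy_request {
	enum toy_op op;
	bool issued;
};

/* Old scheme: direct dispatch refuses non-read/write ops forever. */
static bool old_direct_issue(struct toy_request *rq)
{
	if (rq->op != TOY_READ && rq->op != TOY_WRITE)
		return false;	/* every retry fails the same way */
	rq->issued = true;
	return true;
}

/* New scheme: a failed ->queue_rq() lands on the dispatch list ... */
static struct toy_request *dispatch_list[8];
static int dispatch_len;

static void new_direct_issue(struct toy_request *rq)
{
	/* pretend the driver returned BLK_STS_RESOURCE */
	dispatch_list[dispatch_len++] = rq;
}

/* ... which the next hardware queue run drains, with no merging. */
static void run_hw_queue(void)
{
	for (int i = 0; i < dispatch_len; i++)
		dispatch_list[i]->issued = true;
	dispatch_len = 0;
}

int main(void)
{
	struct toy_request flush = { .op = TOY_FLUSH };

	for (int tries = 0; tries < 3; tries++)
		old_direct_issue(&flush);	/* never succeeds: livelock */
	printf("old scheme issued: %d\n", flush.issued);

	new_direct_issue(&flush);		/* punt to dispatch list */
	run_hw_queue();				/* next queue run drains it */
	printf("new scheme issued: %d\n", flush.issued);
	return 0;
}

The design point of the patch is that the dispatch list gives BUSY requests a merge-free parking spot that every request type can use, so no per-op special casing, and no DM-side livelock handling, is needed.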