blk-throttle: support prioritized processing of metadata

Currently, blk-throttle handles all IO in FIFO order, hence if data IO is
throttled and then meta IO is dispatched, the meta IO will have to wait
for the data IO, causing priority inversion problems.

This patch adds support for dispatching metadata first and then paying off
the resulting debt while throttling data.

Test script: use cgroup v1 to throttle the root cgroup, then create a new
dir and file while writeback is throttled:

test() {
  mkdir /mnt/test/xxx
  touch /mnt/test/xxx/1
  sync /mnt/test/xxx
  sync /mnt/test/xxx
}

mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0
mount /dev/nvme0n1 /mnt/test

echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device
dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none &
sleep 4
time test
echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device

sleep 1
umount /dev/nvme0n1

Test result: time cost for creating new dir and file
before this patch: 14s
after this patch: 0.1s

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20240903135149.271857-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Yu Kuai <yukuai3@huawei.com> 2024-09-03 21:51:49 +0800
committer: Jens Axboe <axboe@kernel.dk> 2024-09-10 16:31:41 -0600
commit: 29390bb5661d49d10424ad8e915230de1f7074c9 (patch)
tree: 3b96ae1c52e1bd9b2730f08af00f6255d0e8b7e3 /block
parent: 3bf73e6283ef0bae4e27dad62309e50e3bf7ee88 (diff)
1 files changed, 43 insertions, 22 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index eb859c44c9f3..9c5bbd261724 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1595,6 +1595,22 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 	spin_unlock_irq(&q->queue_lock);
 }
 
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+	/* throtl is FIFO - if bios are already queued, should queue */
+	if (tg->service_queue.nr_queued[rw])
+		return false;
+
+	return tg_may_dispatch(tg, bio, NULL);
+}
+
+static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+	if (!bio_flagged(bio, BIO_BPS_THROTTLED))
+		tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
+	tg->carryover_ios[rw]--;
+}
+
 bool __blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1611,29 +1627,34 @@ bool __blk_throtl_bio(struct bio *bio)
 	sq = &tg->service_queue;
 
 	while (true) {
-		/* throtl is FIFO - if bios are already queued, should queue */
-		if (sq->nr_queued[rw])
+		if (tg_within_limit(tg, bio, rw)) {
+			/* within limits, let's charge and dispatch directly */
+			throtl_charge_bio(tg, bio);
+
+			/*
+			 * We need to trim slice even when bios are not being
+			 * queued otherwise it might happen that a bio is not
+			 * queued for a long time and slice keeps on extending
+			 * and trim is not called for a long time. Now if limits
+			 * are reduced suddenly we take into account all the IO
+			 * dispatched so far at new low rate and newly queued
+			 * IO gets a really long dispatch time.
+			 *
+			 * So keep on trimming slice even if bio is not queued.
+			 */
+			throtl_trim_slice(tg, rw);
+		} else if (bio_issue_as_root_blkg(bio)) {
+			/*
+			 * IOs which may cause priority inversions are
+			 * dispatched directly, even if they're over limit.
+			 * Debts are handled by carryover_bytes/ios while
+			 * calculating wait time.
+			 */
+			tg_dispatch_in_debt(tg, bio, rw);
+		} else {
+			/* if above limits, break to queue */
 			break;
-
-		/* if above limits, break to queue */
-		if (!tg_may_dispatch(tg, bio, NULL))
-			break;
-
-		/* within limits, let's charge and dispatch directly */
-		throtl_charge_bio(tg, bio);
-
-		/*
-		 * We need to trim slice even when bios are not being queued
-		 * otherwise it might happen that a bio is not queued for
-		 * a long time and slice keeps on extending and trim is not
-		 * called for a long time. Now if limits are reduced suddenly
-		 * we take into account all the IO dispatched so far at new
-		 * low rate and * newly queued IO gets a really long dispatch
-		 * time.
-		 *
-		 * So keep on trimming slice even if bio is not queued.
-		 */
-		throtl_trim_slice(tg, rw);
+		}
 
 		/*
 		 * @bio passed through this layer without being throttled.
author	Yu Kuai <yukuai3@huawei.com>	2024-09-03 21:51:49 +0800
committer	Jens Axboe <axboe@kernel.dk>	2024-09-10 16:31:41 -0600
commit	29390bb5661d49d10424ad8e915230de1f7074c9 (patch)
tree	3b96ae1c52e1bd9b2730f08af00f6255d0e8b7e3 /block
parent	3bf73e6283ef0bae4e27dad62309e50e3bf7ee88 (diff)