mm/filemap: optimize filemap folio adding

Instead of doing multiple tree walks, do one optimism range check with lock hold, and exit if raced with another insertion. If a shadow exists, check it with a new xas_get_order helper before releasing the lock to avoid redundant tree walks for getting its order. Drop the lock and do the allocation only if a split is needed. In the best case, it only need to walk the tree once. If it needs to alloc and split, 3 walks are issued (One for first ranged conflict check and order retrieving, one for the second check after allocation, one for the insert after split). Testing with 4K pages, in an 8G cgroup, with 16G brd as block device: echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap --rw=randread --time_based \ --ramp_time=30s --runtime=5m --group_reporting Before: bw ( MiB/s): min= 1027, max= 3520, per=100.00%, avg=2445.02, stdev=18.90, samples=8691 iops : min=263001, max=901288, avg=625924.36, stdev=4837.28, samples=8691 After (+7.3%): bw ( MiB/s): min= 493, max= 3947, per=100.00%, avg=2625.56, stdev=25.74, samples=8651 iops : min=126454, max=1010681, avg=672142.61, stdev=6590.48, samples=8651 Test result with THP (do a THP randread then switch to 4K page in hope it issues a lot of splitting): echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap -thp=1 --readonly \ --rw=randread --time_based --ramp_time=30s --runtime=10m \ --group_reporting fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap \ --rw=randread --time_based --runtime=5s --group_reporting Before: bw ( KiB/s): min= 4141, max=14202, per=100.00%, avg=7935.51, stdev=96.85, samples=18976 iops : min= 1029, max= 3548, avg=1979.52, stdev=24.23, samples=18976· READ: bw=4545B/s (4545B/s), 4545B/s-4545B/s (4545B/s-4545B/s), io=64.0KiB (65.5kB), run=14419-14419msec After (+12.5%): bw ( KiB/s): min= 4611, max=15370, per=100.00%, avg=8928.74, stdev=105.17, samples=19146 iops : min= 1151, max= 3842, avg=2231.27, stdev=26.29, samples=19146 READ: bw=4635B/s (4635B/s), 4635B/s-4635B/s (4635B/s-4635B/s), io=64.0KiB (65.5kB), run=14137-14137msec The performance is better for both 4K (+7.5%) and THP (+12.5%) cached read. Link: https://lkml.kernel.org/r/20240415171857.19244-5-ryncsn@gmail.com Signed-off-by: Kairui Song <kasong@tencent.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Kairui Song <kasong@tencent.com> 2024-04-16 01:18:56 +0800
committer: Andrew Morton <akpm@linux-foundation.org> 2024-04-25 20:56:09 -0700
commit: 6758c1128ceb45d1a35298912b974eb4895b7dd9 (patch)
tree: aeee300a90f92bab8165cc2a4bd9c1a6b095ce4f /lib/test_xarray.c
parent: a4864671ca0bf51c8e78242951741df52c06766f (diff)
1 files changed, 59 insertions, 0 deletions
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 0efde8f93490..8732a311f613 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -2017,6 +2017,64 @@ static noinline void check_xas_get_order(struct xarray *xa)
 	}
 }
 
+static noinline void check_xas_conflict_get_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+
+	void *entry;
+	int only_once;
+	unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+	unsigned int order;
+	unsigned long i, j, k;
+
+	for (order = 0; order < max_order; order++) {
+		for (i = 0; i < 10; i++) {
+			xas_set_order(&xas, i << order, order);
+			do {
+				xas_lock(&xas);
+				xas_store(&xas, xa_mk_value(i));
+				xas_unlock(&xas);
+			} while (xas_nomem(&xas, GFP_KERNEL));
+
+			/*
+			 * Ensure xas_get_order works with xas_for_each_conflict.
+			 */
+			j = i << order;
+			for (k = 0; k < order; k++) {
+				only_once = 0;
+				xas_set_order(&xas, j + (1 << k), k);
+				xas_lock(&xas);
+				xas_for_each_conflict(&xas, entry) {
+					XA_BUG_ON(xa, entry != xa_mk_value(i));
+					XA_BUG_ON(xa, xas_get_order(&xas) != order);
+					only_once++;
+				}
+				XA_BUG_ON(xa, only_once != 1);
+				xas_unlock(&xas);
+			}
+
+			if (order < max_order - 1) {
+				only_once = 0;
+				xas_set_order(&xas, (i & ~1UL) << order, order + 1);
+				xas_lock(&xas);
+				xas_for_each_conflict(&xas, entry) {
+					XA_BUG_ON(xa, entry != xa_mk_value(i));
+					XA_BUG_ON(xa, xas_get_order(&xas) != order);
+					only_once++;
+				}
+				XA_BUG_ON(xa, only_once != 1);
+				xas_unlock(&xas);
+			}
+
+			xas_set_order(&xas, i << order, order);
+			xas_lock(&xas);
+			xas_store(&xas, NULL);
+			xas_unlock(&xas);
+		}
+	}
+}
+
+
 static noinline void check_destroy(struct xarray *xa)
 {
 	unsigned long index;
@@ -2069,6 +2127,7 @@ static int xarray_checks(void)
 	check_multi_store_advanced(&array);
 	check_get_order(&array);
 	check_xas_get_order(&array);
+	check_xas_conflict_get_order(&array);
 	check_xa_alloc();
 	check_find(&array);
 	check_find_entry(&array);
author	Kairui Song <kasong@tencent.com>	2024-04-16 01:18:56 +0800
committer	Andrew Morton <akpm@linux-foundation.org>	2024-04-25 20:56:09 -0700
commit	6758c1128ceb45d1a35298912b974eb4895b7dd9 (patch)
tree	aeee300a90f92bab8165cc2a4bd9c1a6b095ce4f /lib/test_xarray.c
parent	a4864671ca0bf51c8e78242951741df52c06766f (diff)