From 23955622ff8d231bcc9650b3d06583f117a6e3ba Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 10 Jul 2017 15:47:11 -0700
Subject: swap: add block io poll in swapin path

For fast flash disk, async IO could introduce overhead because of
context switch.  block-mq now supports IO poll, which improves
performance and latency a lot.  swapin is a good place to use this
technique, because the task is waiting for the swapin page to continue
execution.

In my virtual machine, directly read 4k data from a NVMe with iopoll is
about 60% better than that without poll.  With iopoll support in swapin
patch, my microbenchmark (a task does random memory write) is about
10%~25% faster.  CPU utilization increases a lot though, 2x and even 3x
CPU utilization.  This will depend on disk speed.

While iopoll in swapin isn't intended for all usage cases, it's a win
for latency sensistive workloads with high speed swap disk.  block layer
has knob to control poll in runtime.  If poll isn't enabled in block
layer, there should be no noticeable change in swapin.

I got a chance to run the same test in a NVMe with DRAM as the media.
In simple fio IO test, blkpoll boosts 50% performance in single thread
test and ~20% in 8 threads test.  So this is the base line.  In above
swap test, blkpoll boosts ~27% performance in single thread test.
blkpoll uses 2x CPU time though.

If we enable hybid polling, the performance gain has very slight drop
but CPU time is only 50% worse than that without blkpoll.  Also we can
adjust parameter of hybid poll, with it, the CPU time penality is
reduced further.  In 8 threads test, blkpoll doesn't help though.  The
performance is similar to that without blkpoll, but cpu utilization is
similar too.  There is lock contention in swap path.  The cpu time
spending on blkpoll isn't high.  So overall, blkpoll swapin isn't worse
than that without it.

The swapin readahead might read several pages in in the same time and
form a big IO request.  Since the IO will take longer time, it doesn't
make sense to do poll, so the patch only does iopoll for single page
swapin.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/070c3c3e40b711e7b1390002c991e86a-b5408f0@7511894063d3764ff01ea8111f5a004d7dd700ed078797c204a24e620ddb965c
Signed-off-by: Shaohua Li <shli@fb.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/madvise.c')

diff --git a/mm/madvise.c b/mm/madvise.c
index 25b78ee4fc2c..8eda1841c576 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 			continue;
 
 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-								vma, index);
+							vma, index, false);
 		if (page)
 			put_page(page);
 	}
@@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 		}
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-								NULL, 0);
+							NULL, 0, false);
 		if (page)
 			put_page(page);
 	}
-- 
cgit v1.2.3-58-ga151


From 230ca982ba69ae63294017a3800800ad79d5f003 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Mon, 10 Jul 2017 15:49:02 -0700
Subject: userfaultfd: non-cooperative: add madvise() event for MADV_FREE
 request

MADV_FREE is identical to MADV_DONTNEED from the point of view of uffd
monitor.  The monitor has to stop handling #PF events in the range being
freed.  We are reusing userfaultfd_remove callback along with the logic
required to re-get and re-validate the VMA which may change or disappear
because userfaultfd_remove releases mmap_sem.

Link: http://lkml.kernel.org/r/1497876311-18615-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'mm/madvise.c')

diff --git a/mm/madvise.c b/mm/madvise.c
index 8eda1841c576..9976852f1e1c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
 
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
-		return -EINVAL;
-
 	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
 		return -EINVAL;
@@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 	return 0;
 }
 
-static long madvise_free(struct vm_area_struct *vma,
-			     struct vm_area_struct **prev,
-			     unsigned long start, unsigned long end)
-{
-	*prev = vma;
-	return madvise_free_single_vma(vma, start, end);
-}
-
 /*
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
@@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma,
  * An interface that causes the system to free clean pages and flush
  * dirty pages is already available as msync(MS_INVALIDATE).
  */
-static long madvise_dontneed(struct vm_area_struct *vma,
-			     struct vm_area_struct **prev,
-			     unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+					unsigned long start, unsigned long end)
+{
+	zap_page_range(vma, start, end - start);
+	return 0;
+}
+
+static long madvise_dontneed_free(struct vm_area_struct *vma,
+				  struct vm_area_struct **prev,
+				  unsigned long start, unsigned long end,
+				  int behavior)
 {
 	*prev = vma;
 	if (!can_madv_dontneed_vma(vma))
@@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 			 * is also < vma->vm_end. If start <
 			 * vma->vm_start it means an hole materialized
 			 * in the user address space within the
-			 * virtual range passed to MADV_DONTNEED.
+			 * virtual range passed to MADV_DONTNEED
+			 * or MADV_FREE.
 			 */
 			return -ENOMEM;
 		}
@@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 			 * Don't fail if end > vma->vm_end. If the old
 			 * vma was splitted while the mmap_sem was
 			 * released the effect of the concurrent
-			 * operation may not cause MADV_DONTNEED to
+			 * operation may not cause madvise() to
 			 * have an undefined result. There may be an
 			 * adjacent next vma that we'll walk
 			 * next. userfaultfd_remove() will generate an
@@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 		}
 		VM_WARN_ON(start >= end);
 	}
-	zap_page_range(vma, start, end - start);
-	return 0;
+
+	if (behavior == MADV_DONTNEED)
+		return madvise_dontneed_single_vma(vma, start, end);
+	else if (behavior == MADV_FREE)
+		return madvise_free_single_vma(vma, start, end);
+	else
+		return -EINVAL;
 }
 
 /*
@@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_FREE:
-		return madvise_free(vma, prev, start, end);
 	case MADV_DONTNEED:
-		return madvise_dontneed(vma, prev, start, end);
+		return madvise_dontneed_free(vma, prev, start, end, behavior);
 	default:
 		return madvise_behavior(vma, prev, start, end, behavior);
 	}
-- 
cgit v1.2.3-58-ga151