From 5b65c4677a57a1d4414212f9995aa0e46a21ff80 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Sat, 9 Sep 2017 00:56:03 +0300
Subject: mm, x86/mm: Fix performance regression in get_user_pages_fast()

The 0-day test bot found a performance regression that was tracked down to
switching x86 to the generic get_user_pages_fast() implementation:

  http://lkml.kernel.org/r/20170710024020.GA26389@yexl-desktop

The regression was caused by the fact that we now use local_irq_save() +
local_irq_restore() in get_user_pages_fast() to disable interrupts.
In the x86 implementation, local_irq_disable() + local_irq_enable() was
used instead.

The fix is to make get_user_pages_fast() use local_irq_disable(), leaving
local_irq_save() for __get_user_pages_fast(), which can be called with
interrupts disabled.

Numbers for pinning a gigabyte of memory, one page at a time, 20 repeats:

  Before: Average: 14.91 ms, stddev: 0.45 ms
  After:  Average: 10.76 ms, stddev: 0.18 ms

Signed-off-by: Kirill A. Shutemov
Cc: Andrew Morton
Cc: Huang Ying
Cc: Jonathan Corbet
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Thorsten Leemhuis
Cc: linux-mm@kvack.org
Fixes: e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation")
Link: http://lkml.kernel.org/r/20170908215603.9189-3-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar
---
 mm/gup.c | 97 ++++++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 58 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..4a789f1c6a27 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1618,6 +1618,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
         return 1;
 }
 
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+                int write, struct page **pages, int *nr)
+{
+        unsigned long next;
+        pgd_t *pgdp;
+
+        pgdp = pgd_offset(current->mm, addr);
+        do {
+                pgd_t pgd = READ_ONCE(*pgdp);
+
+                next = pgd_addr_end(addr, end);
+                if (pgd_none(pgd))
+                        return;
+                if (unlikely(pgd_huge(pgd))) {
+                        if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+                                          pages, nr))
+                                return;
+                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+                                         PGDIR_SHIFT, next, write, pages, nr))
+                                return;
+                } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+                        return;
+        } while (pgdp++, addr = next, addr != end);
+}
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+        unsigned long len, end;
+
+        len = (unsigned long) nr_pages << PAGE_SHIFT;
+        end = start + len;
+        return end >= start;
+}
+#endif
+
 /*
  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
  * the regular GUP. It will only return non-negative values.
@@ -1625,10 +1666,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                           struct page **pages)
 {
-        struct mm_struct *mm = current->mm;
         unsigned long addr, len, end;
-        unsigned long next, flags;
-        pgd_t *pgdp;
+        unsigned long flags;
         int nr = 0;
 
         start &= PAGE_MASK;
@@ -1652,45 +1691,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
          * block IPIs that come from THPs splitting.
          */
 
-        local_irq_save(flags);
-        pgdp = pgd_offset(mm, addr);
-        do {
-                pgd_t pgd = READ_ONCE(*pgdp);
-
-                next = pgd_addr_end(addr, end);
-                if (pgd_none(pgd))
-                        break;
-                if (unlikely(pgd_huge(pgd))) {
-                        if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
-                                          pages, &nr))
-                                break;
-                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
-                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
-                                         PGDIR_SHIFT, next, write, pages, &nr))
-                                break;
-                } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
-                        break;
-        } while (pgdp++, addr = next, addr != end);
-        local_irq_restore(flags);
+        if (gup_fast_permitted(start, nr_pages, write)) {
+                local_irq_save(flags);
+                gup_pgd_range(addr, end, write, pages, &nr);
+                local_irq_restore(flags);
+        }
 
         return nr;
 }
 
-#ifndef gup_fast_permitted
-/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
- * we need to fall back to the slow version:
- */
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
-{
-        unsigned long len, end;
-
-        len = (unsigned long) nr_pages << PAGE_SHIFT;
-        end = start + len;
-        return end >= start;
-}
-#endif
-
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:      starting user address
@@ -1710,12 +1719,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages)
 {
+        unsigned long addr, len, end;
         int nr = 0, ret = 0;
 
         start &= PAGE_MASK;
+        addr = start;
+        len = (unsigned long) nr_pages << PAGE_SHIFT;
+        end = start + len;
+
+        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                                        (void __user *)start, len)))
+                return 0;
 
         if (gup_fast_permitted(start, nr_pages, write)) {
-                nr = __get_user_pages_fast(start, nr_pages, write, pages);
+                local_irq_disable();
+                gup_pgd_range(addr, end, write, pages, &nr);
+                local_irq_enable();
                 ret = nr;
         }
 
--
cgit v1.2.3-58-ga151
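
Why the change helps: on bare-metal x86, local_irq_save() has to read EFLAGS (pushf)
and local_irq_restore() has to write it back (popf), while local_irq_disable() and
local_irq_enable() are plain cli/sti, and popf is comparatively slow. The sketch below
is illustration only, not part of the patch; the helper names pin_range_nested() and
pin_range_plain() are made up, while gup_pgd_range() and the local_irq_*() calls are
the primitives actually used in the diff above. The nested form is what
__get_user_pages_fast() keeps, since it may be entered with interrupts already
disabled; the plain form is what get_user_pages_fast() switches to.

/*
 * Illustration only: two hypothetical wrappers that would sit in mm/gup.c
 * next to gup_pgd_range(), contrasting the masking patterns compared in
 * the commit message.
 */
static void pin_range_nested(unsigned long addr, unsigned long end,
                             int write, struct page **pages, int *nr)
{
        unsigned long flags;

        /*
         * Save the current IRQ state before masking: safe even if the
         * caller already runs with interrupts disabled, but on x86 it
         * costs a pushf here and a popf in the restore below.
         */
        local_irq_save(flags);
        gup_pgd_range(addr, end, write, pages, nr);
        local_irq_restore(flags);
}

static void pin_range_plain(unsigned long addr, unsigned long end,
                            int write, struct page **pages, int *nr)
{
        /* Caller guarantees IRQs are enabled, so plain cli/sti is enough. */
        local_irq_disable();
        gup_pgd_range(addr, end, write, pages, nr);
        local_irq_enable();
}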

From 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Fri, 29 Sep 2017 17:08:16 +0300
Subject: mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y

Size of the mem_section[] array depends on the size of the physical address space.

In preparation for boot-time switching between paging modes on x86-64,
we need to make the allocation of mem_section[] dynamic, because otherwise
we waste a lot of RAM: with CONFIG_NODES_SHIFT=10, mem_section[] size is
32kB for 4-level paging and 2MB for 5-level paging mode.

The patch allocates the array on the first call to
sparse_memory_present_with_active_regions().

Signed-off-by: Kirill A. Shutemov
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Cyrill Gorcunov
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar
---
 include/linux/mmzone.h |  6 +++++-
 mm/page_alloc.c        | 10 ++++++++++
 mm/sparse.c            | 17 +++++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c8f89417740b..e796edf1296f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1150,13 +1150,17 @@ struct mem_section {
 #define SECTION_ROOT_MASK       (SECTIONS_PER_ROOT - 1)
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-extern struct mem_section *mem_section[NR_SECTION_ROOTS];
+extern struct mem_section **mem_section;
 #else
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
 
 static inline struct mem_section *__nr_to_section(unsigned long nr)
 {
+#ifdef CONFIG_SPARSEMEM_EXTREME
+        if (!mem_section)
+                return NULL;
+#endif
         if (!mem_section[SECTION_NR_TO_ROOT(nr)])
                 return NULL;
         return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..8dfd13f724d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5646,6 +5646,16 @@ void __init sparse_memory_present_with_active_regions(int nid)
         unsigned long start_pfn, end_pfn;
         int i, this_nid;
 
+#ifdef CONFIG_SPARSEMEM_EXTREME
+        if (!mem_section) {
+                unsigned long size, align;
+
+                size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+                align = 1 << (INTERNODE_CACHE_SHIFT);
+                mem_section = memblock_virt_alloc(size, align);
+        }
+#endif
+
         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
                 memory_present(this_nid, start_pfn, end_pfn);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 83b3bf6461af..b00a97398795 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -22,8 +22,7 @@
  * 1) mem_section       - memory sections, mem_map's for valid memory
  */
 #ifdef CONFIG_SPARSEMEM_EXTREME
-struct mem_section *mem_section[NR_SECTION_ROOTS]
-        ____cacheline_internodealigned_in_smp;
+struct mem_section **mem_section;
 #else
 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
         ____cacheline_internodealigned_in_smp;
@@ -100,7 +99,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 int __section_nr(struct mem_section* ms)
 {
         unsigned long root_nr;
-        struct mem_section* root;
+        struct mem_section *root = NULL;
 
         for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
                 root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
@@ -111,7 +110,7 @@ int __section_nr(struct mem_section* ms)
                 break;
         }
 
-        VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+        VM_BUG_ON(!root);
 
         return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
@@ -329,11 +328,17 @@ again:
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 {
         unsigned long usemap_snr, pgdat_snr;
-        static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
-        static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+        static unsigned long old_usemap_snr;
+        static unsigned long old_pgdat_snr;
         struct pglist_data *pgdat = NODE_DATA(nid);
         int usemap_nid;
 
+        /* First call */
+        if (!old_usemap_snr) {
+                old_usemap_snr = NR_MEM_SECTIONS;
+                old_pgdat_snr = NR_MEM_SECTIONS;
+        }
+
         usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
         pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
         if (usemap_snr == pgdat_snr)
--
cgit v1.2.3-58-ga151
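
As a back-of-the-envelope check of the 32kB vs 2MB figures above, the standalone
userspace sketch below redoes the arithmetic. All constants in it are assumptions,
not taken from the patch: x86-64 uses 128 MiB sections (SECTION_SIZE_BITS of 27),
MAX_PHYSMEM_BITS is 46 with 4-level and 52 with 5-level paging, SECTIONS_PER_ROOT
works out to 128 (a 4 kB page divided by a 32-byte struct mem_section), and the
static SPARSEMEM_EXTREME array being replaced holds one 8-byte pointer per root.

#include <stdio.h>

int main(void)
{
        /* Assumed x86-64 values; see the note above. */
        const unsigned long section_size_bits = 27;     /* 128 MiB per memory section */
        const unsigned long sections_per_root = 128;    /* PAGE_SIZE / sizeof(struct mem_section) */
        const unsigned long phys_bits[2] = { 46, 52 };  /* 4-level vs 5-level paging */

        for (int i = 0; i < 2; i++) {
                unsigned long nr_sections = 1UL << (phys_bits[i] - section_size_bits);
                unsigned long nr_roots = nr_sections / sections_per_root;

                /* The static root array is NR_SECTION_ROOTS pointers of 8 bytes each. */
                printf("%lu physical bits: %lu roots -> %lu KiB of root pointers\n",
                       phys_bits[i], nr_roots,
                       (unsigned long)(nr_roots * sizeof(void *) / 1024));
        }
        return 0;
}

It prints 32 KiB for the 46-bit case and 2048 KiB for the 52-bit case, matching the
sizes quoted in the commit message.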

From 629a359bdb0e0652a8227b4ff3125431995fec6e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Tue, 7 Nov 2017 11:33:37 +0300
Subject: mm/sparsemem: Fix ARM64 boot crash when CONFIG_SPARSEMEM_EXTREME=y

Since commit:

  83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y")

we allocate the mem_section array dynamically in
sparse_memory_present_with_active_regions(), but some architectures, like
arm64, don't call the routine to initialize sparsemem.

Let's move the initialization into memory_present(); it should cover all
architectures.

Reported-and-tested-by: Sudeep Holla
Tested-by: Bjorn Andersson
Signed-off-by: Kirill A. Shutemov
Acked-by: Will Deacon
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-mm@kvack.org
Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y")
Link: http://lkml.kernel.org/r/20171107083337.89952-1-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 10 ----------
 mm/sparse.c     | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8dfd13f724d9..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5646,16 +5646,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
         unsigned long start_pfn, end_pfn;
         int i, this_nid;
 
-#ifdef CONFIG_SPARSEMEM_EXTREME
-        if (!mem_section) {
-                unsigned long size, align;
-
-                size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
-                align = 1 << (INTERNODE_CACHE_SHIFT);
-                mem_section = memblock_virt_alloc(size, align);
-        }
-#endif
-
         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
                 memory_present(this_nid, start_pfn, end_pfn);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index b00a97398795..d294148ba395 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -206,6 +206,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
         unsigned long pfn;
 
+#ifdef CONFIG_SPARSEMEM_EXTREME
+        if (unlikely(!mem_section)) {
+                unsigned long size, align;
+
+                size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+                align = 1 << (INTERNODE_CACHE_SHIFT);
+                mem_section = memblock_virt_alloc(size, align);
+        }
+#endif
+
         start &= PAGE_SECTION_MASK;
         mminit_validate_memmodel_limits(&start, &end);
         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
--
cgit v1.2.3-58-ga151