From 91cdcd8d624bfdf05e8db9d572759516f3c786b8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 12 Mar 2024 11:34:11 -0400 Subject: mm: zswap: optimize zswap pool size tracking Profiling the munmap() of a zswapped memory region shows 60% of the total cycles currently going into updating the zswap_pool_total_size. There are three consumers of this counter: - store, to enforce the globally configured pool limit - meminfo & debugfs, to report the size to the user - shrink, to determine the batch size for each cycle Instead of aggregating everytime an entry enters or exits the zswap pool, aggregate the value from the zpools on-demand: - Stores aggregate the counter anyway upon success. Aggregating to check the limit instead is the same amount of work. - Meminfo & debugfs might benefit somewhat from a pre-aggregated counter, but aren't exactly hotpaths. - Shrinking can aggregate once for every cycle instead of doing it for every freed entry. As the shrinker might work on tens or hundreds of objects per scan cycle, this is a large reduction in aggregations. The paths that benefit dramatically are swapin, swapoff, and unmaps. There could be millions of pages being processed until somebody asks for the pool size again. This eliminates the pool size updates from those paths entirely. Top profile entries for a 24G range munmap(), before: 38.54% zswap-unmap [kernel.kallsyms] [k] zs_zpool_total_size 12.51% zswap-unmap [kernel.kallsyms] [k] zpool_get_total_size 9.10% zswap-unmap [kernel.kallsyms] [k] zswap_update_total_size 2.95% zswap-unmap [kernel.kallsyms] [k] obj_cgroup_uncharge_zswap 2.88% zswap-unmap [kernel.kallsyms] [k] __slab_free 2.86% zswap-unmap [kernel.kallsyms] [k] xas_store and after: 7.70% zswap-unmap [kernel.kallsyms] [k] __slab_free 7.16% zswap-unmap [kernel.kallsyms] [k] obj_cgroup_uncharge_zswap 6.74% zswap-unmap [kernel.kallsyms] [k] xas_store It was also briefly considered to move to a single atomic in zswap that is updated by the backends, since zswap only cares about the sum of all pools anyway. However, zram directly needs per-pool information out of zsmalloc. To keep the backend from having to update two atomics every time, I opted for the lazy aggregation instead for now. Link: https://lkml.kernel.org/r/20240312153901.3441-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 45af9a989d40..245171d9164b 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP - seq_printf(m, "Zswap: %8lu kB\n", - (unsigned long)(zswap_pool_total_size >> 10)); + show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); -- cgit v1.2.3-58-ga151 From dee3d0bef2b00772be430425832ead6aa9d707f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 17:10:32 +0000 Subject: proc: rewrite stable_page_flags() Reduce the usage of PageFlag tests and reduce the number of compound_head() calls. For multi-page folios, we'll now show all pages as having the flags that apply to them, e.g. if it's dirty, all pages will have the dirty flag set instead of just the head page. The mapped flag is still per page, as is the hwpoison flag. [willy@infradead.org: fix up some bits vs masks] Link: https://lkml.kernel.org/r/20240403173112.1450721-1-willy@infradead.org [willy@infradead.org: fix warnings] Link: https://lkml.kernel.org/r/ZhBPtCYfSuFuUMEz@casper.infradead.org Link: https://lkml.kernel.org/r/20240326171045.410737-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Svetly Todorov Signed-off-by: Andrew Morton --- fs/proc/page.c | 69 ++++++++++++++++++++++-------------------- include/linux/huge_mm.h | 4 +-- include/linux/page-flags.h | 2 +- tools/cgroup/memcg_slabinfo.py | 5 ++- 4 files changed, 42 insertions(+), 38 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/page.c b/fs/proc/page.c index 9223856c934b..05120263af2a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) return ((kflags >> kbit) & 1) << ubit; } -u64 stable_page_flags(struct page *page) +u64 stable_page_flags(const struct page *page) { - u64 k; - u64 u; + const struct folio *folio; + unsigned long k; + unsigned long mapping; + bool is_anon; + u64 u = 0; /* * pseudo flag: KPF_NOPAGE @@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page) */ if (!page) return 1 << KPF_NOPAGE; + folio = page_folio(page); - k = page->flags; - u = 0; + k = folio->flags; + mapping = (unsigned long)folio->mapping; + is_anon = mapping & PAGE_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages */ if (page_mapped(page)) u |= 1 << KPF_MMAP; - if (PageAnon(page)) + if (is_anon) { u |= 1 << KPF_ANON; - if (PageKsm(page)) - u |= 1 << KPF_KSM; + if (mapping & PAGE_MAPPING_KSM) + u |= 1 << KPF_KSM; + } /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; - if (PageTail(page)) + if (page == &folio->page) + u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); + else u |= 1 << KPF_COMPOUND_TAIL; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; /* - * PageTransCompound can be true for non-huge compound pages (slab - * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * We need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - if (PageLRU(head) || PageAnon(head)) + else if (folio_test_large(folio)) { + if ((k & (1 << PG_lru)) || is_anon) u |= 1 << KPF_THP; - else if (is_huge_zero_page(head)) { + else if (is_huge_zero_page(&folio->page)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; } } else if (is_zero_pfn(page_to_pfn(page))) u |= 1 << KPF_ZERO_PAGE; - /* * Caveats on high order pages: PG_buddy and PG_slab will only be set * on the head page. @@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; + if (folio_test_slab(folio)) + u |= 1 << KPF_SLAB; - if (page_is_idle(page)) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) + u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); +#else + if (folio_test_idle(folio)) u |= 1 << KPF_IDLE; +#endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(page)) - u |= 1 << KPF_SLAB; - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); @@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - if (PageSwapCache(page)) +#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache)) + if ((k & SWAPCACHE) == SWAPCACHE) u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); @@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); #ifdef CONFIG_MEMORY_FAILURE - u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + if (u & (1 << KPF_HUGE)) + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + else + u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED @@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; @@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ - ppage = pfn_to_online_page(pfn); + struct page *page = pfn_to_online_page(pfn); - if (put_user(stable_page_flags(ppage), out)) { + if (put_user(stable_page_flags(page), out)) { ret = -EFAULT; break; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7576025db55d..1540a1481daf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -351,7 +351,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; -static inline bool is_huge_zero_page(struct page *page) +static inline bool is_huge_zero_page(const struct page *page) { return READ_ONCE(huge_zero_page) == page; } @@ -480,7 +480,7 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } -static inline bool is_huge_zero_page(struct page *page) +static inline bool is_huge_zero_page(const struct page *page) { return false; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index eaecf544039f..888353c209c0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -734,7 +734,7 @@ static __always_inline bool PageKsm(const struct page *page) TESTPAGEFLAG_FALSE(Ksm, ksm) #endif -u64 stable_page_flags(struct page *page); +u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1d3a90d93fe2..270c28a0d098 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -146,12 +146,11 @@ def detect_kernel_config(): def for_each_slab(prog): - PGSlab = 1 << prog.constant('PG_slab') - PGHead = 1 << prog.constant('PG_head') + PGSlab = ~prog.constant('PG_slab') for page in for_each_page(prog): try: - if page.flags.value_() & PGSlab: + if page.page_type.value_() == PGSlab: yield cast('struct slab *', page) except FaultError: pass -- cgit v1.2.3-58-ga151 From 5beaee54a324ba1fe307e341ec825d5d099f4091 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 20:28:22 +0000 Subject: mm: add is_huge_zero_folio() This is the folio equivalent of is_huge_zero_page(). It doesn't add any efficiency, but it does prevent the caller from passing a tail page and getting confused when the predicate returns false. Link: https://lkml.kernel.org/r/20240326202833.523759-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- fs/proc/page.c | 2 +- include/linux/huge_mm.h | 10 ++++++++++ mm/huge_memory.c | 6 +++--- mm/mempolicy.c | 2 +- mm/swap.c | 2 +- mm/swap_state.c | 2 +- mm/userfaultfd.c | 2 +- 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/page.c b/fs/proc/page.c index 05120263af2a..2fb64bdb64eb 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -155,7 +155,7 @@ u64 stable_page_flags(const struct page *page) else if (folio_test_large(folio)) { if ((k & (1 << PG_lru)) || is_anon) u |= 1 << KPF_THP; - else if (is_huge_zero_page(&folio->page)) { + else if (is_huge_zero_folio(folio)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1540a1481daf..600c6008262b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -356,6 +356,11 @@ static inline bool is_huge_zero_page(const struct page *page) return READ_ONCE(huge_zero_page) == page; } +static inline bool is_huge_zero_folio(const struct folio *folio) +{ + return READ_ONCE(huge_zero_page) == &folio->page; +} + static inline bool is_huge_zero_pmd(pmd_t pmd) { return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); @@ -485,6 +490,11 @@ static inline bool is_huge_zero_page(const struct page *page) return false; } +static inline bool is_huge_zero_folio(const struct folio *folio) +{ + return false; +} + static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 75ad971ca45e..5c043c7b5062 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -789,12 +789,12 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) } #endif -static inline bool is_transparent_hugepage(struct folio *folio) +static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) return false; - return is_huge_zero_page(&folio->page) || + return is_huge_zero_folio(folio) || folio_test_large_rmappable(folio); } @@ -3085,7 +3085,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, } - is_hzp = is_huge_zero_page(&folio->page); + is_hzp = is_huge_zero_folio(folio); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 913cff5da5a3..5743028a63a5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -510,7 +510,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) return; } folio = pfn_folio(pmd_pfn(*pmd)); - if (is_huge_zero_page(&folio->page)) { + if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } diff --git a/mm/swap.c b/mm/swap.c index 500a09a48dfd..f72364e92d5f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -985,7 +985,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; - if (is_huge_zero_page(&folio->page)) + if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index bfc7e8c58a6d..2deac23633cd 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -301,7 +301,7 @@ void free_page_and_swap_cache(struct page *page) struct folio *folio = page_folio(page); free_swap_cache(folio); - if (!is_huge_zero_page(page)) + if (!is_huge_zero_folio(folio)) folio_put(folio); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3c3539c573e7..a0ec14553fbe 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1664,7 +1664,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, !pmd_none(dst_pmdval)) { struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); - if (!folio || (!is_huge_zero_page(&folio->page) && + if (!folio || (!is_huge_zero_folio(folio) && !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; -- cgit v1.2.3-58-ga151 From 5def1e0f476da713141269d86815aba7c2817cb5 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:43 -0700 Subject: proc: refactor pde_get_unmapped_area as prep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Cover a guard gap corner case", v4. In working on x86’s shadow stack feature, I came across some limitations around the kernel’s handling of guard gaps. AFAICT these limitations are not too important for the traditional stack usage of guard gaps, but have bigger impact on shadow stack’s usage. And now in addition to x86, we have two other architectures implementing shadow stack like features that plan to use guard gaps. I wanted to see about addressing them, but I have not worked on mmap() placement related code before, so would greatly appreciate if people could take a look and point me in the right direction. The nature of the limitations of concern is as follows. In order to ensure guard gaps between mappings, mmap() would need to consider two things: 1. That the new mapping isn’t placed in an any existing mapping’s guard gap. 2. That the new mapping isn’t placed such that any existing mappings are not in *its* guard gaps Currently mmap never considers (2), and (1) is not considered in some situations. When not passing an address hint, or passing one without MAP_FIXED_NOREPLACE, (1) is enforced. With MAP_FIXED_NOREPLACE, (1) is not enforced. With MAP_FIXED, (1) is not considered, but this seems to be expected since MAP_FIXED can already clobber existing mappings. For MAP_FIXED_NOREPLACE I would have guessed it should respect the guard gaps of existing mappings, but it is probably a little ambiguous. In this series I just tried to add enforcement of (2) for the normal (no address hint) case and only for the newer shadow stack memory (not stacks). The reason is that with the no-address-hint situation, landing next to a guard gap could come up naturally and so be more influencable by attackers such that two shadow stacks could be adjacent without a guard gap. Where as the address-hint scenarios would require more control - being able to call mmap() with specific arguments. As for why not just fix the other corner cases anyway, I thought it might have some greater possibility of affecting existing apps. This patch (of 14): Future changes will perform a treewide change to remove the indirect branch that is involved in calling mm->get_unmapped_area(). After doing this, the function will no longer be able to be handled as a function pointer. To make the treewide change diff cleaner and easier to review, refactor pde_get_unmapped_area() such that mm->get_unmapped_area() is called without being stored in a local function pointer. With this in refactoring, follow on changes will be able to simply replace the call site with a future function that calls it directly. Link: https://lkml.kernel.org/r/20240326021656.202649-1-rick.p.edgecombe@intel.com Link: https://lkml.kernel.org/r/20240326021656.202649-2-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Dave Hansen Cc: Deepak Gupta Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mark Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexei Starovoitov Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- fs/proc/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dcd513dccf55..75396a24fd8c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -451,15 +451,12 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo unsigned long len, unsigned long pgoff, unsigned long flags) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + if (pde->proc_ops->proc_get_unmapped_area) + return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); - get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + return current->mm->get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif - if (get_area) - return get_area(file, orig_addr, len, pgoff, flags); return orig_addr; } -- cgit v1.2.3-58-ga151 From 529ce23a764f25d172198b4c6ba90f1e2ad17f93 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:44 -0700 Subject: mm: switch mm->get_unmapped_area() to a flag The mm_struct contains a function pointer *get_unmapped_area(), which is set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() during the initialization of the mm. Since the function pointer only ever points to two functions that are named the same across all arch's, a function pointer is not really required. In addition future changes will want to add versions of the functions that take additional arguments. So to save a pointers worth of bytes in mm_struct, and prevent adding additional function pointers to mm_struct in future changes, remove it and keep the information about which get_unmapped_area() to use in a flag. Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by mmf_init_flags(). Most MM flags get clobbered on fork. In the pre-existing behavior mm->get_unmapped_area() would get copied to the new mm in dup_mm(), so not clobbering the flag preserves the existing behavior around inheriting the topdown-ness. Introduce a helper, mm_get_unmapped_area(), to easily convert code that refers to the old function pointer to instead select and call either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the flag. Then drop the mm->get_unmapped_area() function pointer. Leave the get_unmapped_area() pointer in struct file_operations alone. The main purpose of this change is to reorganize in preparation for future changes, but it also converts the calls of mm->get_unmapped_area() from indirect branches into a direct ones. The stress-ng bigheap benchmark calls realloc a lot, which calls through get_unmapped_area() in the kernel. On x86, the change yielded a ~1% improvement there on a retpoline config. In testing a few x86 configs, removing the pointer unfortunately didn't result in any actual size reductions in the compiled layout of mm_struct. But depending on compiler or arch alignment requirements, the change could shrink the size of mm_struct. Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Acked-by: Dave Hansen Acked-by: Liam R. Howlett Reviewed-by: Kirill A. Shutemov Acked-by: Alexei Starovoitov Cc: Dan Williams Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/s390/mm/hugetlbpage.c | 2 +- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 15 ++++++--------- arch/sparc/mm/hugetlbpage.c | 2 +- arch/x86/kernel/cpu/sgx/driver.c | 2 +- arch/x86/mm/hugetlbpage.c | 2 +- arch/x86/mm/mmap.c | 4 ++-- drivers/char/mem.c | 2 +- drivers/dax/device.c | 6 +++--- fs/hugetlbfs/inode.c | 4 ++-- fs/proc/inode.c | 3 ++- fs/ramfs/file-mmu.c | 2 +- include/linux/mm_types.h | 6 +----- include/linux/sched/coredump.h | 5 ++++- include/linux/sched/mm.h | 5 +++++ io_uring/io_uring.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/debug.c | 6 ------ mm/huge_memory.c | 9 ++++----- mm/mmap.c | 21 ++++++++++++++++++--- mm/shmem.c | 11 +++++------ mm/util.c | 6 +++--- 23 files changed, 66 insertions(+), 57 deletions(-) (limited to 'fs/proc') diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ca43b6fce71c..7d948e243f4b 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -318,7 +318,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, goto check_asce_limit; } - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index b14fc0887654..6b2e4436ad4a 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -185,10 +185,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 1e9a9e016237..1dbf7211666e 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -218,14 +218,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long align_goal, addr = -ENOMEM; - unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return get_area(NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -238,7 +234,8 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); + addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); break; @@ -256,7 +253,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. */ if (addr & ~PAGE_MASK) - addr = get_area(NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); return addr; } @@ -292,7 +289,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap == RLIM_INFINITY || sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -303,7 +300,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap = (task_size / 6 * 5); mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 8ed5bdf95d25..c23012e3a353 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -123,7 +123,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, (!vma || addr + len <= vm_start_gap(vma))) return addr; } - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 262f5fb18d74..22b65a5f5ec6 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index dab6db288e5b..06ca9a60bac2 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -115,7 +115,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } get_unmapped_area: - if (mm->get_unmapped_area == arch_get_unmapped_area) + if (!test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c90c20904a60..a2cabb1c81e1 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); else - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 3c6670cf905f..9b80e622ae80 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -544,7 +544,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, } /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */ - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); #else return -ENOSYS; #endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 93ebedc5ec8c..47c126d37b59 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -329,14 +329,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, + pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6502c7e776d1..3dee18bf47ed 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -249,11 +249,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } /* - * Use mm->get_unmapped_area value as a hint to use topdown routine. + * Use MMF_TOPDOWN flag as a hint to use topdown routine. * If architectures have special needs, they should define their own * version of hugetlb_get_unmapped_area. */ - if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) + if (test_bit(MMF_TOPDOWN, &mm->flags)) return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); return hugetlb_get_unmapped_area_bottomup(file, addr, len, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 75396a24fd8c..d19434e2a58e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -455,8 +455,9 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif + return orig_addr; } diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index c7a1aa3c882b..b45c7edc3225 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fee6561feb7e..fa0d6995706f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,11 +777,7 @@ struct mm_struct { } ____cacheline_aligned_in_smp; struct maple_tree mm_mt; -#ifdef CONFIG_MMU - unsigned long (*get_unmapped_area) (struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); -#endif + unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 02f5090ffea2..e62ff805cfc9 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -92,9 +92,12 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_VM_MERGE_ANY 30 #define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_TOPDOWN 31 /* mm searches top down by default */ +#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) + #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK) + MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) static inline unsigned long mmf_init_flags(unsigned long flags) { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index b6543f9d78d6..ed1caa26c8be 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Routines for handling mm_structs @@ -186,6 +187,10 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); + unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c170a2b8d2cf..d5edfb8444d7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3525,7 +3525,7 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, #else addr = 0UL; #endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..16dbf4f6b77f 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..9cb89e875f0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -980,7 +980,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/debug.c b/mm/debug.c index e8a96b8b7197..b71186f1fb0b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -177,9 +177,6 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { pr_emerg("mm %px task_size %lu\n" -#ifdef CONFIG_MMU - "get_unmapped_area %px\n" -#endif "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" @@ -205,9 +202,6 @@ void dump_mm(const struct mm_struct *mm) "def_flags: %#lx(%pGv)\n", mm, mm->task_size, -#ifdef CONFIG_MMU - mm->get_unmapped_area, -#endif mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 157cee64850c..7fdb2275a1e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -816,8 +816,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = current->mm->get_unmapped_area(filp, addr, len_pad, - off >> PAGE_SHIFT, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len_pad, + off >> PAGE_SHIFT, flags); /* * The failure might be due to length padding. The caller will retry @@ -835,8 +835,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && - !off_sub) + if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) return ret + size; ret += off_sub; @@ -853,7 +852,7 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, if (ret) return ret; - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/mm/mmap.c b/mm/mmap.c index 77a625e13ec1..dfb8a518e8c9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,7 +1812,8 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); + unsigned long, unsigned long, unsigned long) + = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) @@ -1822,7 +1823,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1841,7 +1841,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (!file) pgoff = 0; - addr = get_area(file, addr, len, pgoff, flags); + if (get_area) + addr = get_area(file, addr, len, pgoff, flags); + else + addr = mm_get_unmapped_area(current->mm, file, addr, len, + pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; @@ -1856,6 +1860,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +mm_get_unmapped_area(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(mm_get_unmapped_area); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. diff --git a/mm/shmem.c b/mm/shmem.c index 98985179f495..fa2a0ed97507 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2267,8 +2267,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; @@ -2278,8 +2276,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, + flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2336,7 +2334,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, + inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -4801,7 +4800,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif diff --git a/mm/util.c b/mm/util.c index a9e911b22b99..c9e519e6811f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -469,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } #endif -- cgit v1.2.3-58-ga151 From 03aa577f3b2809bcaeb9a544307c2d997a972965 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 2 Apr 2024 21:12:48 +0100 Subject: proc: convert clear_refs_pte_range to use a folio Patch series "Remove page_idle and page_young wrappers". There are only a couple of places left using the page wrappers for idle & young tracking. Convert the two users in proc and then we can remove the wrappers. That enables the further simplification of autogenerating the definitions when CONFIG_PAGE_IDLE_FLAG is disabled. This patch (of 4): Replaces four calls to compound_head() with two. Link: https://lkml.kernel.org/r/20240402201252.917342-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240402201252.917342-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 23fbab954c20..b94101cd2706 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1161,7 +1161,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -1173,12 +1173,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pmd_present(*pmd)) goto out; - page = pmd_page(*pmd); + folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; @@ -1200,14 +1200,14 @@ out: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); -- cgit v1.2.3-58-ga151 From 6c977f36dc36baec8ad15faccbc6a08315eaa195 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 2 Apr 2024 21:12:49 +0100 Subject: proc: convert smaps_account() to use a folio Replace seven calls to compound_head() with one. Link: https://lkml.kernel.org/r/20240402201252.917342-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b94101cd2706..e8d1008a838d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -444,6 +444,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool migration) { + struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -451,27 +452,28 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * First accumulate quantities that depend only on |size| and the type * of the compound page. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { mss->anonymous += size; - if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + if (!folio_test_swapbacked(folio) && !dirty && + !folio_test_dirty(folio)) mss->lazyfree += size; } - if (PageKsm(page)) + if (folio_test_ksm(folio)) mss->ksm += size; mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || page_is_young(page) || PageReferenced(page)) + if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * - * page_count(page) == 1 guarantees the page is mapped exactly once. + * refcount == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate - * page_count(). + * the refcount. * * The page_mapcount() is called to get a snapshot of the mapcount. * Without holding the page lock this snapshot can be slightly wrong as @@ -480,7 +482,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * especially for migration entries. Treat regular migration entries * as mapcount == 1. */ - if ((page_count(page) == 1) || migration) { + if ((folio_ref_count(folio) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; -- cgit v1.2.3-58-ga151 From f1dc623fa0d37eeb02b31264dd2ce2011fec0c9b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Apr 2024 18:14:52 +0100 Subject: proc: convert gather_stats to use a folio Patch series "Use folio APIs in procfs". We're down to very few users of the PageFoo macros, with proc being a major user. After this patchset and another patchset I have for khugepaged, we can get rid of PageActive, PageReadahead and PageSwapBacked. This patchset has the usual advantages in its own right of removing hidden calls to compound_head(). We have the page table lock, so the mapcount & refcount are stable and there can't be any races with folios suddenly becoming tail pages. This patch (of 4): Replaces six calls to compound_head() with one. Shrinks the function from 5054 bytes to 1756 bytes in an allmodconfig build. Link: https://lkml.kernel.org/r/20240403171456.1445117-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240403171456.1445117-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christian Brauner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e8d1008a838d..5260a2788f74 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2549,28 +2549,29 @@ struct numa_maps_private { static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { + struct folio *folio = page_folio(page); int count = page_mapcount(page); md->pages += nr_pages; - if (pte_dirty || PageDirty(page)) + if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) md->swapcache += nr_pages; - if (PageActive(page) || PageUnevictable(page)) + if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) md->writeback += nr_pages; - if (PageAnon(page)) + if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)] += nr_pages; + md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, -- cgit v1.2.3-58-ga151 From cfc96da432fe5c6b514bdbf60ee036cad39a65af Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Apr 2024 18:14:53 +0100 Subject: proc: convert smaps_page_accumulate to use a folio Replaces three calls to compound_head() with one. Shrinks the function from 2614 bytes to 1112 bytes in an allmodconfig build. Link: https://lkml.kernel.org/r/20240403171456.1445117-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christian Brauner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5260a2788f74..2a3133dd47b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -414,11 +414,12 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, struct page *page, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { + struct folio *folio = page_folio(page); mss->pss += pss; - if (PageAnon(page)) + if (folio_test_anon(folio)) mss->pss_anon += pss; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; @@ -426,7 +427,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, if (locked) mss->pss_locked += pss; - if (dirty || PageDirty(page)) { + if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; -- cgit v1.2.3-58-ga151 From 27bb0a70e5248683fd940f738a7a53066fb470e5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Apr 2024 18:14:54 +0100 Subject: proc: pass a folio to smaps_page_accumulate() Both callers already have a folio; pass it in instead of doing the conversion each time. Link: https://lkml.kernel.org/r/20240403171456.1445117-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christian Brauner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2a3133dd47b1..6d4f60bc8824 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -411,10 +411,9 @@ struct mem_size_stats { }; static void smaps_page_accumulate(struct mem_size_stats *mss, - struct page *page, unsigned long size, unsigned long pss, + struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { - struct folio *folio = page_folio(page); mss->pss += pss; if (folio_test_anon(folio)) @@ -484,8 +483,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * as mapcount == 1. */ if ((folio_ref_count(folio) == 1) || migration) { - smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, - locked, true); + smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, + dirty, locked, true); return; } for (i = 0; i < nr; i++, page++) { @@ -493,8 +492,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (mapcount >= 2) pss /= mapcount; - smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, - mapcount < 2); + smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, + dirty, locked, mapcount < 2); } } -- cgit v1.2.3-58-ga151 From 039d26d10d62b71e242cde1092b042ae1fee498d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Apr 2024 18:14:55 +0100 Subject: proc: convert smaps_pmd_entry to use a folio Replace two calls to compound_head() with one. Link: https://lkml.kernel.org/r/20240403171456.1445117-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christian Brauner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6d4f60bc8824..8ff79bd427ec 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -578,6 +578,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + struct folio *folio; bool migration = false; if (pmd_present(*pmd)) { @@ -592,11 +593,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } if (IS_ERR_OR_NULL(page)) return; - if (PageAnon(page)) + folio = page_folio(page); + if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; - else if (is_zone_device_page(page)) + else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; -- cgit v1.2.3-58-ga151 From 88e4e47c12836cf6875f73fca87baaf20c6ca1a2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 17 Apr 2024 11:23:12 +0200 Subject: fs/proc/task_mmu: convert pagemap_hugetlb_range() to work on folios Patch series "fs/proc/task_mmu: convert hugetlb functions to work on folis". Let's convert two more functions, getting rid of two more page_mapcount() calls. This patch (of 2): Let's get rid of another page_mapcount() check and simply use folio_likely_mapped_shared(), which is precise for hugetlb folios. While at it, also check for PMD table sharing, like we do in smaps_hugetlb_range(). No functional change intended, except that we would now detect hugetlb folios shared via PMD table sharing correctly. Link: https://lkml.kernel.org/r/20240417092313.753919-1-david@redhat.com Link: https://lkml.kernel.org/r/20240417092313.753919-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8ff79bd427ec..cd6e45e0cde8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1578,12 +1578,13 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, pte = huge_ptep_get(ptep); if (pte_present(pte)) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - if (!PageAnon(page)) + if (!folio_test_anon(folio)) flags |= PM_FILE; - if (page_mapcount(page) == 1) + if (!folio_likely_mapped_shared(folio) && + !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) -- cgit v1.2.3-58-ga151 From 6401a2e6900843a77a27873c0529dea68f61193d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 17 Apr 2024 11:23:13 +0200 Subject: fs/proc/task_mmu: convert smaps_hugetlb_range() to work on folios Let's get rid of another page_mapcount() check and simply use folio_likely_mapped_shared(), which is precise for hugetlb folios. While at it, use huge_ptep_get() + pte_page() instead of ptep_get() + vm_normal_page(), just like we do in pagemap_hugetlb_range(). No functional change intended. Link: https://lkml.kernel.org/r/20240417092313.753919-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cd6e45e0cde8..f4259b7edfde 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,19 +730,20 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - struct page *page = NULL; - pte_t ptent = ptep_get(pte); + pte_t ptent = huge_ptep_get(pte); + struct folio *folio = NULL; if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); + folio = page_folio(pte_page(ptent)); } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) - page = pfn_swap_entry_to_page(swpent); + folio = pfn_swap_entry_folio(swpent); } - if (page) { - if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) + if (folio) { + if (folio_likely_mapped_shared(folio) || + hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); -- cgit v1.2.3-58-ga151 From e0ffb29bc54d86b9ab10ebafc66eb1b7229e0cd7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 25 Apr 2024 05:00:55 +0100 Subject: mm: simplify thp_vma_allowable_order Combine the three boolean arguments into one flags argument for readability. Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 29 +++++++++++++++-------------- mm/huge_memory.c | 7 +++++-- mm/khugepaged.c | 16 +++++++--------- mm/memory.c | 10 ++++++---- 5 files changed, 35 insertions(+), 31 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f4259b7edfde..81fbecfe5ff6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -871,8 +871,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, - true, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7cd07b83a3d0..c8d3ec116e29 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -81,8 +81,12 @@ extern struct kobj_attribute shmem_enabled_attr; */ #define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) -#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) +#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ +#define TVA_IN_PF (1 << 1) /* Page fault handler */ +#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ + +#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES #define HPAGE_PMD_SHIFT PMD_SHIFT @@ -218,17 +222,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @smaps: whether answer will be used for smaps file - * @in_pf: whether answer will be used by page fault handler - * @enforce_sysfs: whether sysfs config should be taken into account + * @tva_flags: Which TVA flags to honour * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -241,12 +243,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { /* Optimization to check if required orders are enabled early. */ - if (enforce_sysfs && vma_is_anonymous(vma)) { + if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -260,8 +262,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, - enforce_sysfs, orders); + return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } enum mthp_stat_item { @@ -428,8 +429,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eabf088cb444..aaa327bd5155 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -81,10 +81,13 @@ unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; /* Check the intersection of requested and supported orders. */ orders &= vma_is_anonymous(vma) ? THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cf518fc44098..774a97e6e2da 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -453,7 +453,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } @@ -900,6 +900,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; + unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -910,8 +911,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - cc->is_khugepaged, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1501,8 +1501,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2363,8 +2362,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - true, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, + TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; @@ -2701,8 +2700,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index c0bd4e0d5e7a..79d851be8ab2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4334,8 +4334,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -5438,7 +5438,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5472,7 +5473,8 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3-58-ga151