From 7716506adac4664793a9d6d3dfa31ffddfa98714 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:45 -0700 Subject: mm: introduce and use mapping_empty() Patch series "Remove nrexceptional tracking", v2. We actually use nrexceptional for very little these days. It's a minor pain to keep in sync with nrpages, but the pain becomes much bigger with the THP patches because we don't know how many indices a shadow entry occupies. It's easier to just remove it than keep it accurate. Also, we save 8 bytes per inode which is nothing to sneeze at; on my laptop, it would improve shmem_inode_cache from 22 to 23 objects per 16kB, and inode_cache from 26 to 27 objects. Combined, that saves a megabyte of memory from a combined usage of 25MB for both caches. Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save any memory for ext4. This patch (of 4): Instead of checking the two counters (nrpages and nrexceptional), we can just check whether i_pages is empty. Link: https://lkml.kernel.org/r/20201026151849.24232-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201026151849.24232-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- fs/dax.c | 2 +- fs/gfs2/glock.c | 3 +-- include/linux/pagemap.h | 5 +++++ mm/truncate.c | 18 +++--------------- 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a5244e08b6c8..9114e0a0e7b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) return; invalidate_bh_lrus(); diff --git a/fs/dax.c b/fs/dax.c index b3d27fdc6775..999f3f22aea3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 84c38103aa06..ea7fc5c641c7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawn(sdp)) - GLOCK_BUG_ON(gl, mapping->nrpages || - mapping->nrexceptional); + GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 469fa7ffcf96..a4bd41128bf3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,11 @@ struct pagevec; +static inline bool mapping_empty(struct address_space *mapping) +{ + return xa_empty(&mapping->i_pages); +} + /* * Bits in mapping->flags. */ diff --git a/mm/truncate.c b/mm/truncate.c index 455944264663..adb8d4107988 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -295,7 +295,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; /* Offsets within partial pages */ @@ -440,9 +440,6 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrexceptional; - unsigned long nrpages; - /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the @@ -452,16 +449,7 @@ void truncate_inode_pages_final(struct address_space *mapping) */ mapping_set_exiting(mapping); - /* - * When reclaim installs eviction entries, it increases - * nrexceptional first, then decreases nrpages. Make sure we see - * this in the right order or we might miss an entry. - */ - nrpages = mapping->nrpages; - smp_rmb(); - nrexceptional = mapping->nrexceptional; - - if (nrpages || nrexceptional) { + if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree @@ -633,7 +621,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; pagevec_init(&pvec); -- cgit v1.2.3-58-ga151 From 46be67b424efab933562a29ea8f1df0c20aa9959 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:48 -0700 Subject: mm: stop accounting shadow entries We no longer need to keep track of how many shadow entries are present in a mapping. This saves a few writes to the inode and memory barriers. Link: https://lkml.kernel.org/r/20201026151849.24232-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 13 ------------- mm/swap_state.c | 4 ---- mm/truncate.c | 1 - mm/workingset.c | 1 - 4 files changed, 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5be57ba01d33..d08ff1504e64 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping, page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } mapping->nrpages -= nr; } @@ -925,8 +914,6 @@ noinline int __add_to_page_cache_locked(struct page *page, if (xas_error(&xas)) goto unlock; - if (old) - mapping->nrexceptional--; mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ diff --git a/mm/swap_state.c b/mm/swap_state.c index fb7efa08fe57..3a1259c13f3b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, xas_store(&xas, page); xas_next(&xas); } - address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); @@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page, xas_next(&xas); } ClearPageSwapCache(page); - if (shadow) - address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); @@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xas_store(&xas, NULL); nr_shadows++; } - address_space->nrexceptional -= nr_shadows; xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ diff --git a/mm/truncate.c b/mm/truncate.c index adb8d4107988..95af244b112a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping, if (xas_load(&xas) != entry) return; xas_store(&xas, NULL); - mapping->nrexceptional--; } static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, diff --git a/mm/workingset.c b/mm/workingset.c index cd39902c1062..b7cdeca5a76d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); -- cgit v1.2.3-58-ga151 From 7f0e07fb0289519af7e726e4f7b7118f7ecc979b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:51 -0700 Subject: dax: account DAX entries as nrpages Simplify mapping_needs_writeback() by accounting DAX entries as pages instead of exceptional entries. Link: https://lkml.kernel.org/r/20201026151849.24232-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 6 +++--- mm/filemap.c | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 999f3f22aea3..69216241392f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -525,7 +525,7 @@ retry: dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } @@ -541,7 +541,7 @@ retry: dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; - mapping->nrexceptional++; + mapping->nrpages += 1UL << order; } out_unlock: @@ -661,7 +661,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: put_unlocked_entry(&xas, entry); diff --git a/mm/filemap.c b/mm/filemap.c index d08ff1504e64..ecc5f8a4c488 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -618,9 +618,6 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - if (dax_mapping(mapping)) - return mapping->nrexceptional; - return mapping->nrpages; } -- cgit v1.2.3-58-ga151 From 8bc3c481b3d0dcef2cf8e1b7c6b780af6725f7e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:54 -0700 Subject: mm: remove nrexceptional from inode We no longer track anything in nrexceptional, so remove it, saving 8 bytes per inode. Link: https://lkml.kernel.org/r/20201026151849.24232-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 9e192bea0630..af48d1b722f0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -529,7 +529,7 @@ void clear_inode(struct inode *inode) */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrexceptional); + BUG_ON(!mapping_empty(&inode->i_data)); xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 12766edee81f..acef282b97c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. - * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). @@ -463,7 +462,6 @@ struct address_space { struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; - unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; -- cgit v1.2.3-58-ga151 From 786b31121a2ce4309a81a7f36d63f02ca588839e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:32:57 -0700 Subject: mm: remove nrexceptional from inode: remove BUG_ON clear_inode()'s BUG_ON(!mapping_empty(&inode->i_data)) is unsafe: we know of two ways in which nodes can and do (on rare occasions) get left behind. Until those are fixed, do not BUG_ON() nor even WARN_ON(). Yes, this will then leak those nodes (or the next user of the struct inode may use them); but this has been happening for years, and the new BUG_ON(!mapping_empty) was only guilty of revealing that. A proper fix will follow, but no hurry. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2104292229380.16080@eggly.anvils Signed-off-by: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index af48d1b722f0..c93500d84264 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -529,7 +529,14 @@ void clear_inode(struct inode *inode) */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(!mapping_empty(&inode->i_data)); + /* + * Almost always, mapping_empty(&inode->i_data) here; but there are + * two known and long-standing ways in which nodes may get left behind + * (when deep radix-tree node allocation failed partway; or when THP + * collapse_file() failed). Until those two known cases are cleaned up, + * or a cleanup function is called here, do not BUG_ON(!mapping_empty), + * nor even WARN_ON(!mapping_empty). + */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); -- cgit v1.2.3-58-ga151 From aec44e0f0213e36d4f0868a80cdc5097a510f79d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:00 -0700 Subject: hugetlb: pass vma into huge_pte_alloc() and huge_pmd_share() Patch series "hugetlb: Disable huge pmd unshare for uffd-wp", v4. This series tries to disable huge pmd unshare of hugetlbfs backed memory for uffd-wp. Although uffd-wp of hugetlbfs is still during rfc stage, the idea of this series may be needed for multiple tasks (Axel's uffd minor fault series, and Mike's soft dirty series), so I picked it out from the larger series. This patch (of 4): It is a preparation work to be able to behave differently in the per architecture huge_pte_alloc() according to different VMA attributes. Pass it deeper into huge_pmd_share() so that we can avoid the find_vma() call. [peterx@redhat.com: build fix] Link: https://lkml.kernel.org/r/20210304164653.GB397383@xz-x1Link: https://lkml.kernel.org/r/20210218230633.15028-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218230633.15028-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 4 ++-- arch/ia64/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 4 ++-- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 3 ++- arch/s390/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 15 ++++++++------- mm/userfaultfd.c | 2 +- 11 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 55ecf6de9ff7..6e3bcffe2837 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; @@ -286,7 +286,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else if (sz == PMD_SIZE) { if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && pud_none(READ_ONCE(*pudp))) - ptep = huge_pmd_share(mm, addr, pudp); + ptep = huge_pmd_share(mm, vma, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); } else if (sz == (CONT_PMD_SIZE)) { diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index b331f94d20ac..f993cb36c062 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT; EXPORT_SYMBOL(hpage_shift); pte_t * -huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index b9f76f433617..7eaff5b07873 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -21,8 +21,8 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, - unsigned long sz) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index 43652de5f139..d1d3990b83f6 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d142b76d507d..9a75ba078e1b 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, * At this point we do the placement change only for BOOK3S 64. This would * possibly work on other subarchs. */ -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { pgd_t *pg; p4d_t *p4; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3b5a4d25ca9b..da36d13ffc16 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 220d7bc43d2b..999ab5916e69 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -21,7 +21,7 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index ad4b42f04988..04d8790f6c32 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); } unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..653ef322fac9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -161,7 +162,7 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6c72433bec1e..a02a651088d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3795,7 +3795,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc(dst, vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -4563,7 +4563,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -5370,9 +5370,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is * only required for subsequent processing. */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { - struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -5450,7 +5450,8 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #define want_pmd_share() (1) #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { return NULL; } @@ -5469,7 +5470,7 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; @@ -5488,7 +5489,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share() && pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9a3d451402d7..063cbb17e8d8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -290,7 +290,7 @@ retry: mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); -- cgit v1.2.3-58-ga151 From c1991e0705d143be773c984b006f2078aa9f2853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:04 -0700 Subject: hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled Huge pmd sharing could bring problem to userfaultfd. The thing is that userfaultfd is running its logic based on the special bits on page table entries, however the huge pmd sharing could potentially share page table entries for different address ranges. That could cause issues on either: - When sharing huge pmd page tables for an uffd write protected range, the newly mapped huge pmd range will also be write protected unexpectedly, or, - When we try to write protect a range of huge pmd shared range, we'll first do huge_pmd_unshare() in hugetlb_change_protection(), however that also means the UFFDIO_WRITEPROTECT could be silently skipped for the shared region, which could lead to data loss. While at it, a few other things are done altogether: - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because that's definitely something that arch code would like to use too - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE when trying to share huge pmd. Switch to the want_pmd_share() helper. - Move vma_shareable() from huge_pmd_share() into want_pmd_share(). [peterx@redhat.com: fix build with !ARCH_WANT_HUGE_PMD_SHARE] Link: https://lkml.kernel.org/r/20210310185359.88297-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218231202.15426-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Tested-by: Naresh Kamboju Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 +-- include/linux/hugetlb.h | 2 ++ include/linux/userfaultfd_k.h | 9 +++++++++ mm/hugetlb.c | 22 ++++++++++++++++------ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6e3bcffe2837..58987a98e179 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ ptep = pte_alloc_map(mm, pmdp, addr); } else if (sz == PMD_SIZE) { - if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && - pud_none(READ_ONCE(*pudp))) + if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp))) ptep = huge_pmd_share(mm, vma, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 653ef322fac9..88e93809a455 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1040,4 +1040,6 @@ static inline __init void hugetlb_cma_check(void) } #endif +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a8e5f3ea9bb2..c63ccdae3eab 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } +/* + * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp + * protect information is per pgtable entry. + */ +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a02a651088d3..91647e824015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5326,6 +5326,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) return false; } +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + return vma_shareable(vma, addr); +} + /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -5382,9 +5391,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) @@ -5448,7 +5454,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } -#define want_pmd_share() (1) + #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -5466,7 +5472,11 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } -#define want_pmd_share() (0) + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB @@ -5488,7 +5498,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); - if (want_pmd_share() && pud_none(*pud)) + if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); -- cgit v1.2.3-58-ga151 From 537cf30bba241ae88d5f4b0b6a5e66271b394852 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:08 -0700 Subject: mm/hugetlb: move flush_hugetlb_tlb_range() into hugetlb.h Prepare for it to be called outside of mm/hugetlb.c. Link: https://lkml.kernel.org/r/20210218231204.15474-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++++++ mm/hugetlb.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 88e93809a455..e43668144664 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1042,4 +1042,12 @@ static inline __init void hugetlb_cma_check(void) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 91647e824015..3868f3126534 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4996,14 +4996,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -/* - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can - * implement this. - */ -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { -- cgit v1.2.3-58-ga151 From 6dfeaff93be1a4cab4fb48dad7df326d05059a99 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:13 -0700 Subject: hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because userfaultfd-wp is always based on pgtable entries, so they cannot be shared. Walk the hugetlb range and unshare all such mappings if there is, right before UFFDIO_REGISTER will succeed and return to userspace. This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing is completely disabled for userfaultfd-wp registered range. Link: https://lkml.kernel.org/r/20210218231206.15524-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++++ include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0be8cdd4425a..e5ce3b4e6c3d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1449,6 +1450,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx.ctx = ctx; + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + skip: prev = vma; start = vma->vm_end; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e43668144664..0f5813522224 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -369,6 +370,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3868f3126534..e86d3abcc300 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5691,6 +5691,57 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. + */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address, start, end; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + start = ALIGN(vma->vm_start, PUD_SIZE); + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return; + + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + unsigned long tmp = address; + + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + /* We don't want 'address' to be changed */ + huge_pmd_unshare(mm, vma, &tmp, ptep); + spin_unlock(ptl); + } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/vm/mmu_notifier.rst. + */ + mmu_notifier_invalidate_range_end(&range); +} + #ifdef CONFIG_CMA static bool cma_reserve_called __initdata; -- cgit v1.2.3-58-ga151 From 6501fe5f162395ba6dfa6ac86be05f1c24c1a7e0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:16 -0700 Subject: mm/hugetlb: remove redundant reservation check condition in alloc_huge_page() vma_resv_map(vma) checks if a reserve map is associated with the vma. The routine vma_needs_reservation() will check vma_resv_map(vma) and return 1 if no reserv map is present. map_chg is set to the return value of vma_needs_reservation(). Therefore, !vma_resv_map(vma) is redundant in the expression: map_chg || avoid_reserve || !vma_resv_map(vma); Remove the redundant check. [Thanks Mike Kravetz for reshaping this commit message!] Link: https://lkml.kernel.org/r/20210301104726.45159-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e86d3abcc300..c0d4c7784a84 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2316,7 +2316,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); -- cgit v1.2.3-58-ga151 From 4bfb68a0858deae4c40ea585037a3261f0717b0a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:33:19 -0700 Subject: mm: generalize HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_SIZE_VARIABLE need not be defined for each individual platform subscribing it. Instead just make it generic. Link: https://lkml.kernel.org/r/1614914928-22039-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Michael Ellerman [powerpc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christophe Leroy Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 6 +----- arch/powerpc/Kconfig | 6 +----- mm/Kconfig | 7 +++++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 81e2b893b1e7..fd2cfc65fdc9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config IA64 select TTY select HAVE_ARCH_TRACEHOOK select HAVE_VIRT_CPU_ACCOUNTING + select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP @@ -82,11 +83,6 @@ config STACKTRACE_SUPPORT config GENERIC_LOCKBREAK def_bool n -config HUGETLB_PAGE_SIZE_VARIABLE - bool - depends on HUGETLB_PAGE - default y - config GENERIC_CALIBRATE_DELAY bool default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d060c002595..965278498dd7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -232,6 +232,7 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API @@ -416,11 +417,6 @@ config HIGHMEM source "kernel/Kconfig.hz" -config HUGETLB_PAGE_SIZE_VARIABLE - bool - depends on HUGETLB_PAGE && PPC_BOOK3S_64 - default y - config MATH_EMULATION bool "Math emulation" depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE diff --git a/mm/Kconfig b/mm/Kconfig index 3636da27c385..f4ceeaedb966 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,6 +273,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config HUGETLB_PAGE_SIZE_VARIABLE + def_bool n + help + Allows the pageblock_order value to be dynamic instead of just standard + HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available + on a platform. + config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA -- cgit v1.2.3-58-ga151 From 04adbc3f7bff403a97355531da0190a263d66ea5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:22 -0700 Subject: mm/hugetlb: use some helper functions to cleanup code Patch series "Some cleanups for hugetlb". This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE, use helper function and so on. I also collect some previous patches into this series in case they are forgotten. This patch (of 5): We could use pages_per_huge_page to get the number of pages per hugepage, use get_hstate_idx to calculate hstate index, and use hstate_is_gigantic to check if a hstate is gigantic to make code more succinct. Link: https://lkml.kernel.org/r/20210308112809.26107-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210308112809.26107-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- mm/hugetlb.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 701c82c36138..c262566f7c5d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1435,7 +1435,7 @@ static int get_hstate_idx(int page_size_log) if (!h) return -1; - return h - hstates; + return hstate_index(h); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c0d4c7784a84..87104dc78e59 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1273,7 +1273,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -3267,10 +3267,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; -- cgit v1.2.3-58-ga151 From 5af1ab1d24e0842e2ca72c1fd0833864f6fa458a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:25 -0700 Subject: mm/hugetlb: optimize the surplus state transfer code in move_hugetlb_state() We should not transfer the per-node surplus state when we do not cross the node in order to save some cpu cycles Link: https://lkml.kernel.org/r/20210308112809.26107-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87104dc78e59..c3d47af59137 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5682,6 +5682,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) SetHPageTemporary(oldpage); ClearHPageTemporary(newpage); + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. + */ + if (new_nid == old_nid) + return; spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; -- cgit v1.2.3-58-ga151 From 5c8ecb131a655e775287380428ac1c764c117ee6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:28 -0700 Subject: mm/hugetlb_cgroup: remove unnecessary VM_BUG_ON_PAGE in hugetlb_cgroup_migrate() !PageHuge(oldhpage) is implicitly checked in page_hstate() above, so we remove this explicit one. Link: https://lkml.kernel.org/r/20210308112809.26107-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 603a131e262d..726b85f4f303 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -784,7 +784,6 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); spin_lock(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); -- cgit v1.2.3-58-ga151 From d83e6c8a9b65876b0dcd11ca25e8c39bd7bb1a1c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:31 -0700 Subject: mm/hugetlb: simplify the code when alloc_huge_page() failed in hugetlb_no_page() Rework the error handling code when alloc_huge_page() failed to remove some duplicated code and simplify the code slightly. Link: https://lkml.kernel.org/r/20210308112809.26107-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c3d47af59137..c8f0a38588ba 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4395,13 +4395,10 @@ retry: * sure there really is no pte entry. */ ptl = huge_pte_lock(h, mm, ptep); - if (!huge_pte_none(huge_ptep_get(ptep))) { - ret = 0; - spin_unlock(ptl); - goto out; - } + ret = 0; + if (huge_pte_none(huge_ptep_get(ptep))) + ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); - ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- cgit v1.2.3-58-ga151 From d4241a049ac0049fe96b3dae0598092517dbf6bd Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:34 -0700 Subject: mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case The fault_mutex hashing overhead can be avoided in truncate_op case because page faults can not race with truncation in this routine. So calculate hash for fault_mutex only in !truncate_op case to save some cpu cycles. Link: https://lkml.kernel.org/r/20210308112809.26107-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c262566f7c5d..d81f52b87bd7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -482,10 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - u32 hash; + u32 hash = 0; index = page->index; - hash = hugetlb_fault_mutex_hash(mapping, index); if (!truncate_op) { /* * Only need to hold the fault mutex in the @@ -493,6 +492,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * page faults. Races are not possible in the * case of truncation. */ + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); } -- cgit v1.2.3-58-ga151 From 0edf61e5ee5c334f33bb7bf95d1b470f01ae9fec Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:37 -0700 Subject: khugepaged: remove unneeded return value of khugepaged_collapse_pte_mapped_thps() Patch series "Cleanup and fixup for khugepaged", v2. This series contains cleanups to remove unneeded return value, use helper function and so on. And there is one fix to correct the wrong result value for trace_mm_collapse_huge_page_isolate(). This patch (of 4): The return value of khugepaged_collapse_pte_mapped_thps() is never checked since it's introduced. We should remove such unneeded return value. Link: https://lkml.kernel.org/r/20210306032947.35921-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210306032947.35921-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Rik van Riel Cc: Ebru Akagunduz Cc: Dan Carpenter Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7d6cb912b05..d43812c5ce16 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1533,16 +1533,16 @@ abort: goto drop_hpage; } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return 0; + return; if (!mmap_write_trylock(mm)) - return -EBUSY; + return; if (unlikely(khugepaged_test_exit(mm))) goto out; @@ -1553,7 +1553,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); - return 0; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) @@ -2057,9 +2056,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, BUILD_BUG(); } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { - return 0; } #endif -- cgit v1.2.3-58-ga151 From 588d01f918d42d2d453d8cd5af6bf2c2e1072a47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:40 -0700 Subject: khugepaged: reuse the smp_wmb() inside __SetPageUptodate() smp_wmb() is needed to avoid the copy_huge_page writes to become visible after the set_pmd_at() write here. But we can reuse the smp_wmb() inside __SetPageUptodate() to remove this redundant one. Link: https://lkml.kernel.org/r/20210306032947.35921-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d43812c5ce16..287e7ecf978c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm, __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); + /* + * spin_lock() below is not the equivalent of smp_wmb(), but + * the smp_wmb() inside __SetPageUptodate() can be reused to + * avoid the copy_huge_page writes to become visible after + * the set_pmd_at() write. + */ __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. - */ - smp_wmb(); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); -- cgit v1.2.3-58-ga151 From 28ff0a3c421ca19f4c8b41f736ff388fd588e1a1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:43 -0700 Subject: khugepaged: use helper khugepaged_test_exit() in __khugepaged_enter() Commit 4d45e75a9955 ("mm: remove the now-unnecessary mmget_still_valid() hack") have made khugepaged_test_exit() suitable for check mm->mm_users against 0. Use this helper here. Link: https://lkml.kernel.org/r/20210306032947.35921-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 287e7ecf978c..e886a8618c33 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; -- cgit v1.2.3-58-ga151 From 74e579bf231a337ab3786d59e64bc94f45ca7b3f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:46 -0700 Subject: khugepaged: fix wrong result value for trace_mm_collapse_huge_page_isolate() In writable and !referenced case, the result value should be SCAN_LACK_REFERENCED_PAGE for trace_mm_collapse_huge_page_isolate() instead of default 0 (SCAN_FAIL) here. Link: https://lkml.kernel.org/r/20210306032947.35921-5-linmiaohe@huawei.com Fixes: 7d2eba0557c1 ("mm: add tracepoint for scanning pages") Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e886a8618c33..adf677246d86 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -716,17 +716,17 @@ next: if (pte_write(pteval)) writable = true; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { + + if (unlikely(!writable)) { result = SCAN_PAGE_RO; + } else if (unlikely(!referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; } - out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, -- cgit v1.2.3-58-ga151 From 8fd5eda4c7268b62f46b2ed76b96f9e41e128a47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:49 -0700 Subject: mm/huge_memory.c: remove unnecessary local variable ret2 There is no need to use a new local variable ret2 to get the return value of handle_userfault(). Use ret directly to make code more succinct. Link: https://lkml.kernel.org/r/20210210072409.60587-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae907a9c2050..bff92dea5ab3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -624,14 +624,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - vm_fault_t ret2; - spin_unlock(vmf->ptl); put_page(page); pte_free(vma->vm_mm, pgtable); - ret2 = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); - return ret2; + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; } entry = mk_huge_pmd(page, vma->vm_page_prot); -- cgit v1.2.3-58-ga151 From 71f9e58eb408db423e0e27b55e0de66fb3590296 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:52 -0700 Subject: mm/huge_memory.c: rework the function vma_adjust_trans_huge() Patch series "Some cleanups for huge_memory", v3. This series contains cleanups to rework some function logics to make it more readable, use helper function and so on. More details can be found in the respective changelogs. This patch (of 6): The current implementation of vma_adjust_trans_huge() contains some duplicated codes. Add helper function to get rid of these codes to make it more succinct. Link: https://lkml.kernel.org/r/20210318122722.13135-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210318122722.13135-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Zi Yan Cc: Matthew Wilcox Cc: William Kucharski Cc: Vlastimil Babka Cc: Peter Xu Cc: yuleixzhang Cc: Michel Lespinasse Cc: Aneesh Kumar K.V Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Yang Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bff92dea5ab3..ae16a82da823 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2301,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, __split_huge_pmd(vma, pmd, address, freeze, page); } +static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) +{ + /* + * If the new address isn't hpage aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && + range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), + ALIGN(address, HPAGE_PMD_SIZE))) + split_huge_pmd_address(vma, address, false, NULL); +} + void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { - /* - * If the new start address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (start & ~HPAGE_PMD_MASK && - (start & HPAGE_PMD_MASK) >= vma->vm_start && - (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start, false, NULL); + /* Check if we need to split start first. */ + split_huge_pmd_if_needed(vma, start); - /* - * If the new end address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (end & ~HPAGE_PMD_MASK && - (end & HPAGE_PMD_MASK) >= vma->vm_start && - (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end, false, NULL); + /* Check if we need to split end next. */ + split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't hpage aligned and it could previously - * contain an hugepage: check if we need to split an huge pmd. + * If we're also updating the vma->vm_next->vm_start, + * check if we need to split it. */ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; nstart += adjust_next; - if (nstart & ~HPAGE_PMD_MASK && - (nstart & HPAGE_PMD_MASK) >= next->vm_start && - (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart, false, NULL); + split_huge_pmd_if_needed(next, nstart); } } -- cgit v1.2.3-58-ga151 From aaa9705b4af3608fd759c9ba8d0003f7a83fb335 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:55 -0700 Subject: mm/huge_memory.c: make get_huge_zero_page() return bool It's guaranteed that huge_zero_page will not be NULL if huge_zero_refcount is increased successfully. When READ_ONCE(huge_zero_page) is returned, there must be a huge_zero_page and it can be replaced with returning 'true' when we do not care about the value of huge_zero_page. We can thus make it return bool to save READ_ONCE cpu cycles as the return value is just used to check if huge_zero_page exists. Link: https://lkml.kernel.org/r/20210318122722.13135-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae16a82da823..01b96c638e73 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -77,18 +77,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static struct page *get_huge_zero_page(void) +static bool get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return READ_ONCE(huge_zero_page); + return true; zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return NULL; + return false; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); @@ -101,7 +101,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return READ_ONCE(huge_zero_page); + return true; } static void put_huge_zero_page(void) -- cgit v1.2.3-58-ga151 From 6beb5e8bba972e15276a27555f2f4b834b248742 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:59 -0700 Subject: mm/huge_memory.c: rework the function do_huge_pmd_numa_page() slightly The current code that checks if migrating misplaced transhuge page is needed is pretty hard to follow. Rework it and add a comment to make its logic more clear and improve readability. Link: https://lkml.kernel.org/r/20210318122722.13135-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 01b96c638e73..23964adf5db2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1462,12 +1462,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == NUMA_NO_NODE) { - /* If the page was locked, there are no parallel migrations */ - if (page_locked) - goto clear_pmdnuma; - } - /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { page_nid = NUMA_NO_NODE; @@ -1476,6 +1470,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) spin_unlock(vmf->ptl); put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; + } else if (target_nid == NUMA_NO_NODE) { + /* There are no parallel migrations and page is in the right + * node. Clear the numa hinting info in this pmd. + */ + goto clear_pmdnuma; } /* -- cgit v1.2.3-58-ga151 From f6004e73ae955d0a44d66a5709ec5f98c07c733f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:02 -0700 Subject: mm/huge_memory.c: remove redundant PageCompound() check The !PageCompound() check limits the page must be head or tail while !PageHead() further limits it to page head only. So !PageHead() check is equivalent here. Link: https://lkml.kernel.org/r/20210318122722.13135-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23964adf5db2..52acc3954afd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1291,7 +1291,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) } page = pmd_page(orig_pmd); - VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page), page); /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { -- cgit v1.2.3-58-ga151 From d4afd60c24f87b6275b12ec3d67d8c2ad78cb075 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:05 -0700 Subject: mm/huge_memory.c: remove unused macro TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG Commit 4958e4d86ecb ("mm: thp: remove debug_cow switch") forgot to remove TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG macro. Remove it here. Link: https://lkml.kernel.org/r/20210318122722.13135-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ba973efcd369..9626fda5efce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,9 +87,6 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, -#ifdef CONFIG_DEBUG_VM - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, -#endif }; struct kobject; -- cgit v1.2.3-58-ga151 From a44f89dc6c5f8ba70240b81a570260d29d04bcb0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:08 -0700 Subject: mm/huge_memory.c: use helper function migration_entry_to_page() It's more recommended to use helper function migration_entry_to_page() to get the page via migration entry. We can also enjoy the PageLocked() check there. Link: https://lkml.kernel.org/r/20210318122722.13135-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 52acc3954afd..9b31ef84bf7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1693,7 +1693,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2101,7 +2101,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); -- cgit v1.2.3-58-ga151 From 89dc6a9682919dbd64213c630a71eedaa021d7e5 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Tue, 4 May 2021 18:34:12 -0700 Subject: mm/khugepaged.c: replace barrier() with READ_ONCE() for a selective variable READ_ONCE() is more selective and lightweight. It is more appropriate that using a READ_ONCE() for the certain variable to prevent the compiler from reordering. Link: https://lkml.kernel.org/r/20210323092730.247583-1-yanfei.xu@windriver.com Signed-off-by: Yanfei Xu Acked-by: Kirill A. Shutemov Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index adf677246d86..a40a29414762 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2202,11 +2202,9 @@ static void khugepaged_do_scan(void) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - lru_add_drain_all(); while (progress < pages) { -- cgit v1.2.3-58-ga151 From fef792a4fdb9b2d9d3d5c36aaa85f768f456a4d7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:15 -0700 Subject: khugepaged: use helper function range_in_vma() in collapse_pte_mapped_thp() Patch series "Cleanup for khugepaged". This series contains cleanups to remove unnecessary out label and meaningless !pte_present() check. Also use helper function to simplify the code. More details can be found in the respective changelogs. This patch (of 3): We could use helper function range_in_vma() to check whether the desired range is inside the vma to simplify the code. Link: https://lkml.kernel.org/r/20210325135647.64106-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325135647.64106-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a40a29414762..beaf06123b4b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1446,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) int i; if (!vma || !vma->vm_file || - vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; /* -- cgit v1.2.3-58-ga151 From 18d24a7cd9d3f35cfa8bed32a921a94159c78df0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:17 -0700 Subject: khugepaged: remove unnecessary out label in collapse_huge_page() The out label here is unneeded because it just goes to out_up_write label. Remove it to make code more concise. Link: https://lkml.kernel.org/r/20210325135647.64106-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index beaf06123b4b..276ac0099e19 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm, mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) - goto out; + goto out_up_write; /* check if the pmd is still valid */ if (mm_find_pmd(mm, address) != pmd) - goto out; + goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); result = SCAN_FAIL; - goto out; + goto out_up_write; } /* @@ -1215,8 +1215,6 @@ out_nolock: mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; -out: - goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, -- cgit v1.2.3-58-ga151 From 75f83783bfdf2ddb3ffbf79ba44d506fb5b5548f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:20 -0700 Subject: khugepaged: remove meaningless !pte_present() check in khugepaged_scan_pmd() We know it must meet the !is_swap_pte() and !pte_none() condition if we reach here. Since !is_swap_pte() indicates pte_none() or pte_present() is met, it's guaranteed that pte must be present here. Link: https://lkml.kernel.org/r/20210325135647.64106-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 276ac0099e19..a03569eda183 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1271,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small -- cgit v1.2.3-58-ga151 From fa6c02315f745f00b62c634b220c3fb5c3310258 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:23 -0700 Subject: mm: huge_memory: a new debugfs interface for splitting THP tests We did not have a direct user interface of splitting the compound page backing a THP and there is no need unless we want to expose the THP implementation details to users. Make /split_huge_pages accept a new command to do that. By writing ",," to /split_huge_pages, THPs within the given virtual address range from the process with the given pid are split. It is used to test split_huge_page function. In addition, a selftest program is added to tools/testing/selftests/vm to utilize the interface by splitting PMD THPs and PTE-mapped THPs. This does not change the old behavior, i.e., writing 1 to the interface to split all THPs in the system. Link: https://lkml.kernel.org/r/20210331235309.332292-1-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: David Rientjes Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mika Penttila Cc: Sandipan Das Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 155 ++++++++++- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/split_huge_page_test.c | 318 ++++++++++++++++++++++ 4 files changed, 467 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b31ef84bf7e..58be9fc8253d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2915,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = { }; #ifdef CONFIG_DEBUG_FS -static int split_huge_pages_set(void *data, u64 val) +static void split_huge_pages_all(void) { struct zone *zone; struct page *page; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; - if (val != 1) - return -EINVAL; - + pr_debug("Split all THPs\n"); for_each_populated_zone(zone) { max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { @@ -2948,15 +2947,155 @@ static int split_huge_pages_set(void *data, u64 val) unlock_page(page); next: put_page(page); + cond_resched(); } } - pr_info("%lu of %lu THP split\n", split, total); + pr_debug("%lu of %lu THP split\n", split, total); +} - return 0; +static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); +} + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. + */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = find_vma(mm, addr); + unsigned int follflags; + struct page *page; + + if (!vma || addr < vma->vm_start) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); + + if (IS_ERR(page)) + continue; + if (!page) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_huge_page(compound_head(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; } -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, - "%llu\n"); + +#define MAX_INPUT_BUF_SZ 255 + +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; static int __init split_huge_pages_debugfs(void) { diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 9a35c3f6a557..1f651e85ed60 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -22,3 +22,4 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests local_config.* +split_huge_page_test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 8b0cd421ebd3..73e1cc96d7c2 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += split_huge_page_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c new file mode 100644 index 000000000000..2c0c18e60c57 --- /dev/null +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via /split_huge_pages interface. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" +#define INPUT_MAX 80 + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + + +static uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_PATH, O_RDONLY); + if (fd == -1) { + perror("Open hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + perror("Read hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +{ + char input[INPUT_MAX]; + int ret; + + ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, + vaddr_end); + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static uint64_t check_huge(void *addr) +{ + uint64_t thp = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(SMAP_PATH, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, SMAP_PATH); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + /* + * Fetch the AnonHugePages: in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) { + printf("Reading smap error\n"); + exit(EXIT_FAILURE); + } + +err_out: + fclose(fp); + return thp; +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + thp_size = check_huge(one_page); + if (thp_size) { + printf("Still %ld kB AnonHugePages not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + + return 0; +} -- cgit v1.2.3-58-ga151 From fbe37501b2526a71d82b898671260524279c6765 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:26 -0700 Subject: mm: huge_memory: debugfs for file-backed THP split Further extend /split_huge_pages to accept ",," for file-backed THP split tests since tmpfs may have file backed by THP that mapped nowhere. Update selftest program to test file-backed THP split too. Link: https://lkml.kernel.org/r/20210331235309.332292-2-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Kirill A. Shutemov Reviewed-by: Yang Shi Cc: "Kirill A . Shutemov" Cc: Shuah Khan Cc: John Hubbard Cc: Sandipan Das Cc: David Hildenbrand Cc: Mika Penttila Cc: David Rientjes Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 90 ++++++++++++++++++++++- tools/testing/selftests/vm/split_huge_page_test.c | 82 +++++++++++++++++++-- 2 files changed, 166 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 58be9fc8253d..b3788683f7d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3050,6 +3050,65 @@ out: return ret; } +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; +} + #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, @@ -3057,7 +3116,8 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; @@ -3072,6 +3132,34 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + if (tok) { + strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = input_len; + + goto out; + } + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); if (ret == 1 && pid == 1) { split_huge_pages_all(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 2c0c18e60c57..1af16d2c2a0a 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -7,11 +7,13 @@ #define _GNU_SOURCE #include #include +#include #include #include #include #include #include +#include #include #include @@ -24,6 +26,9 @@ uint64_t pmd_pagesize; #define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +static void write_debugfs(const char *fmt, ...) { char input[INPUT_MAX]; int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); - ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, - vaddr_end); if (ret >= INPUT_MAX) { printf("%s: Debugfs input is too long\n", __func__); exit(EXIT_FAILURE); @@ -183,7 +191,8 @@ void split_pmd_thp(void) } /* split all THPs */ - write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); for (i = 0; i < len; i++) if (one_page[i] != (char)i) { @@ -274,7 +283,7 @@ void split_pte_mapped_thp(void) } /* split all remapped THPs */ - write_debugfs(getpid(), (uint64_t)pte_mapped, + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, (uint64_t)pte_mapped + pagesize * 4); /* smap does not show THPs after mremap, use kpageflags instead */ @@ -300,6 +309,68 @@ void split_pte_mapped_thp(void) close(kpageflags_fd); } +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc)); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + int main(int argc, char **argv) { if (geteuid() != 0) { @@ -313,6 +384,7 @@ int main(int argc, char **argv) split_pmd_thp(); split_pte_mapped_thp(); + split_file_backed_thp(); return 0; } -- cgit v1.2.3-58-ga151 From f84df0b7f1b603f6c99670bdf2f908f0b6a5ed59 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:30 -0700 Subject: mm/hugeltb: remove redundant VM_BUG_ON() in region_add() Patch series "Cleanup and fixup for hugetlb", v2. This series contains cleanups to remove redundant VM_BUG_ON() and simplify the return code. Also this handles the error case in hugetlb_fix_reserve_counts() correctly. More details can be found in the respective changelogs. This patch (of 5): The same VM_BUG_ON() check is already done in the callee. Remove this extra one to simplify the code slightly. Link: https://lkml.kernel.org/r/20210410072348.20437-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210410072348.20437-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Feilong Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c8f0a38588ba..ce9e18d45f52 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -553,7 +553,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } -- cgit v1.2.3-58-ga151 From bf3d12b9f7f9e7c4ae4aa94c6c81400d3bf688e6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:32 -0700 Subject: mm/hugeltb: simplify the return code of __vma_reservation_common() It's guaranteed that the vma is associated with a resv_map, i.e. either VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would have returned via !resv check above. So it's unneeded to check whether HPAGE_RESV_OWNER is set here. Simplify the return code to make it more clear. Link: https://lkml.kernel.org/r/20210410072348.20437-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ce9e18d45f52..d91bd780ba98 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2174,27 +2174,26 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. - * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, -- cgit v1.2.3-58-ga151 From dddf31a49a0eb858bba58876c3c67dd8ea81b800 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:35 -0700 Subject: mm/hugeltb: clarify (chg - freed) won't go negative in hugetlb_unreserve_pages() The resv_map could be NULL since this routine can be called in the evict inode path for all hugetlbfs inodes and we will have chg = 0 in this case. But (chg - freed) won't go negative as Mike pointed out: "If resv_map is NULL, then no hugetlb pages can be allocated/associated with the file. As a result, remove_inode_hugepages will never find any huge pages associated with the inode and the passed value 'freed' will always be zero." Add a comment clarifying this to make it clear and also avoid confusion. Link: https://lkml.kernel.org/r/20210410072348.20437-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d91bd780ba98..b1525ae1bf1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5267,6 +5267,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); -- cgit v1.2.3-58-ga151 From da56388c4397878a65b74f7fe97760f5aa7d316b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:38 -0700 Subject: mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts() A rare out of memory error would prevent removal of the reserve map region for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid dangling with incorrect counts. Unfortunately, hugepage_subpool_get_pages and hugetlb_acct_memory could possibly fail too. We should correctly handle these cases. Link: https://lkml.kernel.org/r/20210410072348.20437-5-linmiaohe@huawei.com Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1525ae1bf1f..8ea8e3d2f814 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -742,13 +742,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* -- cgit v1.2.3-58-ga151 From 15b8365363215da82cb019d3de0eb781c9e82564 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:41 -0700 Subject: mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages() The local variable pseudo_vma is not used anymore. Link: https://lkml.kernel.org/r/20210410072348.20437-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d81f52b87bd7..a2a42335e8fd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); const pgoff_t end = lend >> huge_page_shift(h); - struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - vma_init(&pseudo_vma, current->mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); next = start; while (next < end) { -- cgit v1.2.3-58-ga151 From 0ef7dcac998fefc4767b7f10eb3b6df150c38a4e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:44 -0700 Subject: mm/cma: change cma mutex to irq safe spinlock Patch series "make hugetlb put_page safe for all calling contexts", v5. This effort is the result a recent bug report [1]. Syzbot found a potential deadlock in the hugetlb put_page/free_huge_page_path. WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected Since the free_huge_page_path already has code to 'hand off' page free requests to a workqueue, a suggestion was proposed to make the in_irq() detection accurate by always enabling PREEMPT_COUNT [2]. The outcome of that discussion was that the hugetlb put_page path (free_huge_page) path should be properly fixed and safe for all calling contexts. [1] https://lore.kernel.org/linux-mm/000000000000f1c03b05bc43aadc@google.com/ [2] http://lkml.kernel.org/r/20210311021321.127500-1-mike.kravetz@oracle.com This patch (of 8): cma_release is currently a sleepable operatation because the bitmap manipulation is protected by cma->lock mutex. Hugetlb code which relies on cma_release for CMA backed (giga) hugetlb pages, however, needs to be irq safe. The lock doesn't protect any sleepable operation so it can be changed to a (irq aware) spin lock. The bitmap processing should be quite fast in typical case but if cma sizes grow to TB then we will likely need to replace the lock by a more optimized bitmap implementation. Link: https://lkml.kernel.org/r/20210409205254.242291-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20210409205254.242291-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Roman Gushchin Cc: Shakeel Butt Cc: Oscar Salvador Cc: Muchun Song Cc: David Rientjes Cc: Miaohe Lin Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: HORIGUCHI NAOYA Cc: "Aneesh Kumar K . V" Cc: Waiman Long Cc: Peter Xu Cc: Mina Almasry Cc: Hillf Danton Cc: Joonsoo Kim Cc: Barry Song Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 18 +++++++++--------- mm/cma.h | 2 +- mm/cma_debug.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 54eee2119822..acd6991f77a0 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -83,13 +82,14 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, unsigned int count) { unsigned long bitmap_no, bitmap_count; + unsigned long flags; bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); - mutex_lock(&cma->lock); + spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); - mutex_unlock(&cma->lock); + spin_unlock_irqrestore(&cma->lock, flags); } static void __init cma_activate_area(struct cma *cma) @@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma) pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); - mutex_init(&cma->lock); + spin_lock_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); @@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma) unsigned long nr_part, nr_total = 0; unsigned long nbits = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); pr_info("number of available pages: "); for (;;) { next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); @@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma) start = next_zero_bit + nr_zero; } pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); } #else static inline void cma_debug_show_areas(struct cma *cma) { } @@ -452,12 +452,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, return NULL; for (;;) { - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); break; } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * our exclusive use. If the migration fails we will take the * lock again and unmark it. */ - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, diff --git a/mm/cma.h b/mm/cma.h index 42ae082cb067..c46c5050aaa9 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -9,7 +9,7 @@ struct cma { unsigned long count; unsigned long *bitmap; unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; + spinlock_t lock; #ifdef CONFIG_CMA_DEBUGFS struct hlist_head mem_head; spinlock_t mem_head_lock; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index d5bf8aa34fdc..2e7704955f4f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val) struct cma *cma = data; unsigned long used; - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); /* pages counter is smaller than sizeof(int) */ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)used << cma->order_per_bit; return 0; @@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val) unsigned long start, end = 0; unsigned long bitmap_maxno = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= bitmap_maxno) @@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val) end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; return 0; -- cgit v1.2.3-58-ga151 From 262443c0421e832e5312d2b14e0a2640a9f064d7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:48 -0700 Subject: hugetlb: no need to drop hugetlb_lock to call cma_release Now that cma_release is non-blocking and irq safe, there is no need to drop hugetlb_lock before calling. Link: https://lkml.kernel.org/r/20210409205254.242291-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8ea8e3d2f814..a5636e1593c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1355,14 +1355,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). - */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } -- cgit v1.2.3-58-ga151 From 2938396771c8fd0870b5284319f9e78b4b552a79 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:52 -0700 Subject: hugetlb: add per-hstate mutex to synchronize user adjustments The helper routine hstate_next_node_to_alloc accesses and modifies the hstate variable next_nid_to_alloc. The helper is used by the routines alloc_pool_huge_page and adjust_pool_surplus. adjust_pool_surplus is called with hugetlb_lock held. However, alloc_pool_huge_page can not be called with the hugetlb lock held as it will call the page allocator. Two instances of alloc_pool_huge_page could be run in parallel or alloc_pool_huge_page could run in parallel with adjust_pool_surplus which may result in the variable next_nid_to_alloc becoming invalid for the caller and pages being allocated on the wrong node. Both alloc_pool_huge_page and adjust_pool_surplus are only called from the routine set_max_huge_pages after boot. set_max_huge_pages is only called as the reusult of a user writing to the proc/sysfs nr_hugepages, or nr_hugepages_mempolicy file to adjust the number of hugetlb pages. It makes little sense to allow multiple adjustment to the number of hugetlb pages in parallel. Add a mutex to the hstate and use it to only allow one hugetlb page adjustment at a time. This will synchronize modifications to the next_nid_to_alloc variable. Link: https://lkml.kernel.org/r/20210409205254.242291-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f5813522224..628639422c5d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -559,6 +559,7 @@ HPAGEFLAG(Freed, freed) #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { + struct mutex resize_lock; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a5636e1593c5..067fd29a9d51 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2621,6 +2621,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); spin_lock(&hugetlb_lock); /* @@ -2653,6 +2658,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2727,6 +2733,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -3214,6 +3221,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) -- cgit v1.2.3-58-ga151 From 6eb4e88a6d27022ea8aff424d47a0a5dfc9fcb34 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:55 -0700 Subject: hugetlb: create remove_hugetlb_page() to separate functionality The new remove_hugetlb_page() routine is designed to remove a hugetlb page from hugetlbfs processing. It will remove the page from the active or free list, update global counters and set the compound page destructor to NULL so that PageHuge() will return false for the 'page'. After this call, the 'page' can be treated as a normal compound page or a collection of base size pages. update_and_free_page no longer decrements h->nr_huge_pages{_node} as this is performed in remove_hugetlb_page. The only functionality performed by update_and_free_page is to free the base pages to the lower level allocators. update_and_free_page is typically called after remove_hugetlb_page. remove_hugetlb_page is to be called with the hugetlb_lock held. Creating this routine and separating functionality is in preparation for restructuring code to reduce lock hold times. This commit should not introduce any changes to functionality. Link: https://lkml.kernel.org/r/20210409205254.242291-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 65 +++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 067fd29a9d51..9a263a1c200e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1333,6 +1333,41 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. + */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1341,8 +1376,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1350,10 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); @@ -1421,15 +1450,12 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); @@ -1714,13 +1740,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, struct page *page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } + remove_hugetlb_page(h, page, acct_surplus); update_and_free_page(h, page); ret = 1; break; @@ -1758,7 +1778,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1789,9 +1808,7 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, page, false); h->max_huge_pages--; update_and_free_page(h, head); rc = 0; @@ -2558,10 +2575,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count, return; if (PageHighMem(page)) continue; - list_del(&page->lru); + remove_hugetlb_page(h, page, false); update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; } } } -- cgit v1.2.3-58-ga151 From 1121828a0c213caa55ddd5ee23ee78e99cbdd33e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:59 -0700 Subject: hugetlb: call update_and_free_page without hugetlb_lock With the introduction of remove_hugetlb_page(), there is no need for update_and_free_page to hold the hugetlb lock. Change all callers to drop the lock before calling. With additional code modifications, this will allow loops which decrease the huge page pool to drop the hugetlb_lock with each page to reduce long hold times. The ugly unlock/lock cycle in free_pool_huge_page will be removed in a subsequent patch which restructures free_pool_huge_page. Link: https://lkml.kernel.org/r/20210409205254.242291-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a263a1c200e..5425936a4590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1451,16 +1451,18 @@ static void __free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + spin_unlock(&hugetlb_lock); } - spin_unlock(&hugetlb_lock); } /* @@ -1741,7 +1743,13 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, list_entry(h->hugepage_freelists[node].next, struct page, lru); remove_hugetlb_page(h, page, acct_surplus); + /* + * unlock/lock around update_and_free_page is temporary + * and will be removed with subsequent patch. + */ + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); + spin_lock(&hugetlb_lock); ret = 1; break; } @@ -1810,8 +1818,9 @@ retry: } remove_hugetlb_page(h, page, false); h->max_huge_pages--; + spin_unlock(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: spin_unlock(&hugetlb_lock); @@ -2563,22 +2572,34 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + struct page *page, *next; + LIST_HEAD(page_list); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { - struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; remove_hugetlb_page(h, page, false); - update_and_free_page(h, page); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock(&hugetlb_lock); + list_for_each_entry_safe(page, next, &page_list, lru) { + update_and_free_page(h, page); + cond_resched(); + } + spin_lock(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, -- cgit v1.2.3-58-ga151 From 10c6ec49802b1779c01fc029cfd92ea20ae33c06 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:03 -0700 Subject: hugetlb: change free_pool_huge_page to remove_pool_huge_page free_pool_huge_page was called with hugetlb_lock held. It would remove a hugetlb page, and then free the corresponding pages to the lower level allocators such as buddy. free_pool_huge_page was called in a loop to remove hugetlb pages and these loops could hold the hugetlb_lock for a considerable time. Create new routine remove_pool_huge_page to replace free_pool_huge_page. remove_pool_huge_page will remove the hugetlb page, and it must be called with the hugetlb_lock held. It will return the removed page and it is the responsibility of the caller to free the page to the lower level allocators. The hugetlb_lock is dropped before freeing to these allocators which results in shorter lock hold times. Add new helper routine to call update_and_free_page for a list of pages. Note: Some changes to the routine return_unused_surplus_pages are in need of explanation. Commit e5bbc8a6c992 ("mm/hugetlb.c: fix reservation race when freeing surplus pages") modified this routine to address a race which could occur when dropping the hugetlb_lock in the loop that removes pool pages. Accounting changes introduced in that commit were subtle and took some thought to understand. This commit removes the cond_resched_lock() and the potential race. Therefore, remove the subtle code and restore the more straight forward accounting effectively reverting the commit. Link: https://lkml.kernel.org/r/20210409205254.242291-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 93 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5425936a4590..d7db93639787 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1211,7 +1211,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1391,6 +1391,16 @@ static void update_and_free_page(struct hstate *h, struct page *page) } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1721,16 +1731,18 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. - * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* @@ -1739,23 +1751,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); remove_hugetlb_page(h, page, acct_surplus); - /* - * unlock/lock around update_and_free_page is temporary - * and will be removed with subsequent patch. - */ - spin_unlock(&hugetlb_lock); - update_and_free_page(h, page); - spin_lock(&hugetlb_lock); - ret = 1; break; } } - return ret; + return page; } /* @@ -2075,17 +2078,16 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2102,24 +2104,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock(&hugetlb_lock); } @@ -2572,7 +2571,6 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; - struct page *page, *next; LIST_HEAD(page_list); if (hstate_is_gigantic(h)) @@ -2582,6 +2580,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { + struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) @@ -2595,10 +2594,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, out: spin_unlock(&hugetlb_lock); - list_for_each_entry_safe(page, next, &page_list, lru) { - update_and_free_page(h, page); - cond_resched(); - } + update_and_free_pages_bulk(h, &page_list); spin_lock(&hugetlb_lock); } #else @@ -2645,6 +2641,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2757,11 +2755,22 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; -- cgit v1.2.3-58-ga151 From db71ef79b59bb2e78dc4df83d0e4bf6beaa5c82d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:07 -0700 Subject: hugetlb: make free_huge_page irq safe Commit c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context") was added to address the issue of free_huge_page being called from irq context. That commit hands off free_huge_page processing to a workqueue if !in_task. However, this doesn't cover all the cases as pointed out by 0day bot lockdep report [1]. : Possible interrupt unsafe locking scenario: : : CPU0 CPU1 : ---- ---- : lock(hugetlb_lock); : local_irq_disable(); : lock(slock-AF_INET); : lock(hugetlb_lock); : : lock(slock-AF_INET); Shakeel has later explained that this is very likely TCP TX zerocopy from hugetlb pages scenario when the networking code drops a last reference to hugetlb page while having IRQ disabled. Hugetlb freeing path doesn't disable IRQ while holding hugetlb_lock so a lock dependency chain can lead to a deadlock. This commit addresses the issue by doing the following: - Make hugetlb_lock irq safe. This is mostly a simple process of changing spin_*lock calls to spin_*lock_irq* calls. - Make subpool lock irq safe in a similar manner. - Revert the !in_task check and workqueue handoff. [1] https://lore.kernel.org/linux-mm/000000000000f1c03b05bc43aadc@google.com/ Link: https://lkml.kernel.org/r/20210409205254.242291-8-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 169 ++++++++++++++++++++-------------------------------- mm/hugetlb_cgroup.c | 8 +-- 2 files changed, 67 insertions(+), 110 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7db93639787..47f56e2cc804 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -94,9 +94,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +136,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +160,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +187,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +201,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +227,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -1412,7 +1416,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1422,6 +1426,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1450,7 +1455,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1461,66 +1466,18 @@ static void __free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); - spin_unlock(&hugetlb_lock); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); - spin_unlock(&hugetlb_lock); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); - spin_unlock(&hugetlb_lock); - } -} - -/* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. - */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) -{ - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) -{ - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. - */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; + spin_unlock_irqrestore(&hugetlb_lock, flags); } - - __free_huge_page(page); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -1530,11 +1487,11 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1780,7 +1737,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1797,7 +1754,7 @@ retry: * when it is dissolved. */ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1821,12 +1778,12 @@ retry: } remove_hugetlb_page(h, page, false); h->max_huge_pages--; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1868,16 +1825,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page @@ -1887,7 +1844,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1896,7 +1853,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1946,17 +1903,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -2004,7 +1961,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -2021,7 +1978,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2061,12 +2018,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2116,9 +2073,9 @@ static void return_unused_surplus_pages(struct hstate *h, } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); } @@ -2352,7 +2309,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates @@ -2360,7 +2317,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2368,7 +2325,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2381,7 +2338,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2593,9 +2550,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count, } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2660,7 +2617,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2691,7 +2648,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; @@ -2721,14 +2678,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2767,9 +2724,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, list_add(&page->lru, &page_list); } /* free the pages after dropping lock */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -2777,7 +2734,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2933,9 +2890,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3522,9 +3479,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } out: return ret; @@ -3620,7 +3577,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) if (!delta) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -3659,7 +3616,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) return_unused_surplus_pages(h, (unsigned long) -delta); out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } @@ -5687,7 +5644,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { @@ -5697,16 +5654,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list) ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } void putback_active_hugepage(struct page *page) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); } @@ -5740,12 +5697,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) */ if (new_nid == old_nid) return; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 726b85f4f303..5383023d0cca 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) do { idx = 0; for_each_hstate(h) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(idx, h_cg, page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); idx++; } cond_resched(); @@ -784,7 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); @@ -794,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) set_hugetlb_cgroup(newhpage, h_cg); set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return; } -- cgit v1.2.3-58-ga151 From 9487ca60fd7fa2c259f0daba8e2e01e51a64da05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:10 -0700 Subject: hugetlb: add lockdep_assert_held() calls for hugetlb_lock After making hugetlb lock irq safe and separating some functionality done under the lock, add some lockdep_assert_held to help verify locking. Link: https://lkml.kernel.org/r/20210409205254.242291-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 47f56e2cc804..017cb260354c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1069,6 +1069,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1080,6 +1082,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) struct page *page; bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { if (nocma && is_migrate_cma_page(page)) continue; @@ -1351,6 +1354,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1701,6 +1705,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, int nr_nodes, node; struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1950,6 +1955,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -2043,6 +2049,7 @@ static void return_unused_surplus_pages(struct hstate *h, struct page *page; LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -2530,6 +2537,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, int i; LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; @@ -2571,6 +2579,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { -- cgit v1.2.3-58-ga151 From c8e28b47af45c6acfc7a9256848562d4d5ef63a2 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:14 -0700 Subject: mm,page_alloc: bail out earlier on -ENOMEM in alloc_contig_migrate_range Patch series "Make alloc_contig_range handle Hugetlb pages", v10. alloc_contig_range lacks the ability to handle HugeTLB pages. This can be problematic for some users, e.g: CMA and virtio-mem, where those users will fail the call if alloc_contig_range ever sees a HugeTLB page, even when those pages lay in ZONE_MOVABLE and are free. That problem can be easily solved by replacing the page in the free hugepage pool. In-use HugeTLB are no exception though, as those can be isolated and migrated as any other LRU or Movable page. This aims to improve alloc_contig_range->isolate_migratepages_block, so that HugeTLB pages can be recognized and handled. Since we also need to start reporting errors down the chain (e.g: -ENOMEM due to not be able to allocate a new hugetlb page), isolate_migratepages_{range,block} interfaces need to change to start reporting error codes instead of the pfn == 0 vs pfn != 0 scheme it is using right now. From now on, isolate_migratepages_block will not return the next pfn to be scanned anymore, but -EINTR, -ENOMEM or 0, so we the next pfn to be scanned will be recorded in cc->migrate_pfn field (as it is already done in isolate_migratepages_range()). Below is an insight from David (thanks), where the problem can clearly be seen: "Start a VM with 4G. Hotplug 1G via virtio-mem and online it to ZONE_MOVABLE. Allocate 512 huge pages. [root@localhost ~]# cat /proc/meminfo MemTotal: 5061512 kB MemFree: 3319396 kB MemAvailable: 3457144 kB ... HugePages_Total: 512 HugePages_Free: 512 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB The huge pages get partially allocate from ZONE_MOVABLE. Try unplugging 1G via virtio-mem (remember, all ZONE_MOVABLE). Inside the guest: [ 180.058992] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.060531] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.061972] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.063413] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.064838] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.065848] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.066794] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.067738] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.068669] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.069598] alloc_contig_range: [1bfc00, 1c0000) PFNs busy" And then with this patchset running: "Same experiment with ZONE_MOVABLE: a) Free huge pages: all memory can get unplugged again. b) Allocated/populated but idle huge pages: all memory can get unplugged again. c) Allocated/populated but all 512 huge pages are read/written in a loop: all memory can get unplugged again, but I get a single [ 121.192345] alloc_contig_range: [180000, 188000) PFNs busy Most probably because it happened to try migrating a huge page while it was busy. As virtio-mem retries on ZONE_MOVABLE a couple of times, it can deal with this temporary failure. Last but not least, I did something extreme: # cat /proc/meminfo MemTotal: 5061568 kB MemFree: 186560 kB MemAvailable: 354524 kB ... HugePages_Total: 2048 HugePages_Free: 2048 HugePages_Rsvd: 0 HugePages_Surp: 0 Triggering unplug would require to dissolve+alloc - which now fails when trying to allocate an additional ~512 huge pages (1G). As expected, I can properly see memory unplug not fully succeeding. + I get a fairly continuous stream of [ 226.611584] alloc_contig_range: [19f400, 19f800) PFNs busy ... But more importantly, the hugepage count remains stable, as configured by the admin (me): HugePages_Total: 2048 HugePages_Free: 2048 HugePages_Rsvd: 0 HugePages_Surp: 0" This patch (of 7): Currently, __alloc_contig_migrate_range can generate -EINTR, -ENOMEM or -EBUSY, and report them down the chain. The problem is that when migrate_pages() reports -ENOMEM, we keep going till we exhaust all the try-attempts (5 at the moment) instead of bailing out. migrate_pages() bails out right away on -ENOMEM because it is considered a fatal error. Do the same here instead of keep going and retrying. Note that this is not fixing a real issue, just a cosmetic change. Although we can save some cycles by backing off ealier Link: https://lkml.kernel.org/r/20210419075413.1064-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210419075413.1064-2-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b208b1843bf..d9acd86fa761 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8696,7 +8696,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, } tries = 0; } else if (++tries == 5) { - ret = ret < 0 ? ret : -EBUSY; + ret = -EBUSY; break; } @@ -8706,6 +8706,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + + /* + * On -ENOMEM, migrate_pages() bails out right away. It is pointless + * to retry again over this error, so do the same here. + */ + if (ret == -ENOMEM) + break; } if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); -- cgit v1.2.3-58-ga151 From c2ad7a1ffeafa32eb3b3b99835f210ad402a86ff Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:17 -0700 Subject: mm,compaction: let isolate_migratepages_{range,block} return error codes Currently, isolate_migratepages_{range,block} and their callers use a pfn == 0 vs pfn != 0 scheme to let the caller know whether there was any error during isolation. This does not work as soon as we need to start reporting different error codes and make sure we pass them down the chain, so they are properly interpreted by functions like e.g: alloc_contig_range. Let us rework isolate_migratepages_{range,block} so we can report error codes. Since isolate_migratepages_block will stop returning the next pfn to be scanned, we reuse the cc->migrate_pfn field to keep track of that. Link: https://lkml.kernel.org/r/20210419075413.1064-3-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 52 +++++++++++++++++++++++++--------------------------- mm/internal.h | 10 ++++++++-- mm/page_alloc.c | 7 +++---- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e04f4476e68e..c4d8007221b7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat) * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. - * Returns zero if there is a fatal signal pending, otherwise PFN of the - * first page that was not scanned (which may be both less, equal to or more - * than end_pfn). + * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, + * or 0. + * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), - * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field - * is neither read nor updated. + * and cc->nr_migratepages is updated accordingly. */ -static unsigned long +static int isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { @@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool skip_on_failure = false; unsigned long next_skip_pfn = 0; bool skip_updated = false; + int ret = 0; + + cc->migrate_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, while (unlikely(too_many_isolated(pgdat))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) - return 0; + return -EAGAIN; /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) - return 0; + return -EAGAIN; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return 0; + return -EINTR; } cond_resched(); @@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (fatal_signal_pending(current)) { cc->contended = true; + ret = -EINTR; - low_pfn = 0; goto fatal_pending; } @@ -1130,7 +1132,9 @@ fatal_pending: if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); - return low_pfn; + cc->migrate_pfn = low_pfn; + + return ret; } /** @@ -1139,15 +1143,14 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns zero if isolation fails fatally due to e.g. pending signal. - * Otherwise, function returns one-past-the-last PFN of isolated page - * (which may be greater than end_pfn if end fell in a middle of a THP page). + * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0. */ -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, block_start_pfn, block_end_pfn; + int ret = 0; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; @@ -1166,17 +1169,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, block_end_pfn, cc->zone)) continue; - pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, - ISOLATE_UNEVICTABLE); + ret = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); - if (!pfn) + if (ret) break; if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } - return pfn; + return ret; } #endif /* CONFIG_COMPACTION || CONFIG_CMA */ @@ -1847,7 +1850,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) */ for (; block_end_pfn <= cc->free_pfn; fast_find_block = false, - low_pfn = block_end_pfn, + cc->migrate_pfn = low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1889,10 +1892,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* Perform the isolation */ - low_pfn = isolate_migratepages_block(cc, low_pfn, - block_end_pfn, isolate_mode); - - if (!low_pfn) + if (isolate_migratepages_block(cc, low_pfn, block_end_pfn, + isolate_mode)) return ISOLATE_ABORT; /* @@ -1903,9 +1904,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) break; } - /* Record where migration scanner will be restarted. */ - cc->migrate_pfn = low_pfn; - return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } diff --git a/mm/internal.h b/mm/internal.h index ef5f336f59bd..feeaaf06705d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,13 @@ struct compact_control { unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ + /* + * Acts as an in/out parameter to page isolation for migration. + * isolate_migratepages uses it as a search base. + * isolate_migratepages_block will update the value to the next pfn + * after the last isolated one. + */ + unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; @@ -280,7 +286,7 @@ struct capture_control { unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int find_suitable_fallback(struct free_area *area, unsigned int order, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9acd86fa761..656f86a98554 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8689,11 +8689,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc, pfn, end); - if (!pfn) { - ret = -EINTR; + ret = isolate_migratepages_range(cc, pfn, end); + if (ret && ret != -EAGAIN) break; - } + pfn = cc->migrate_pfn; tries = 0; } else if (++tries == 5) { ret = -EBUSY; -- cgit v1.2.3-58-ga151 From 9f27b34f234da7a185b4f1a2aa2cea2c47c458bf Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:20 -0700 Subject: mm,hugetlb: drop clearing of flag from prep_new_huge_page Pages allocated via the page allocator or CMA get its private field cleared by means of post_alloc_hook(). Pages allocated during boot, that is directly from the memblock allocator, get cleared by paging_init()-> .. ->memmap_init_zone-> .. ->__init_single_page() before any memblock allocation. Based on this ground, let us remove the clearing of the flag from prep_new_huge_page() as it is not needed. This was a leftover from commit 6c0371490140 ("hugetlb: convert PageHugeFreed to HPageFreed flag"). Previously the explicit clearing was necessary because compound allocations do not get this initialization (see prep_compound_page). Link: https://lkml.kernel.org/r/20210419075413.1064-4-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 017cb260354c..c1539fb95f51 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1494,7 +1494,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_lock_irq(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); spin_unlock_irq(&hugetlb_lock); } -- cgit v1.2.3-58-ga151 From d3d99fcc4e28f1a613744608c289d4f18b60b12f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:23 -0700 Subject: mm,hugetlb: split prep_new_huge_page functionality Currently, prep_new_huge_page() performs two functions. It sets the right state for a new hugetlb, and increases the hstate's counters to account for the new page. Let us split its functionality into two separate functions, decoupling the handling of the counters from initializing a hugepage. The outcome is having __prep_new_huge_page(), which only initializes the page , and __prep_account_new_huge_page(), which adds the new page to the hstate's counters. This allows us to be able to set a hugetlb without having to worry about the counter/locking. It will prove useful in the next patch. prep_new_huge_page() still calls both functions. Link: https://lkml.kernel.org/r/20210419075413.1064-5-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c1539fb95f51..63760be2688e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1484,16 +1484,30 @@ void free_huge_page(struct page *page) } } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +/* + * Must be called with the hugetlb lock held + */ +static void __prep_account_new_huge_page(struct hstate *h, int nid) +{ + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; +} + +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); spin_lock_irq(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; + __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -- cgit v1.2.3-58-ga151 From 369fa227c21949b22fd7374506c4992a0d7bb580 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:26 -0700 Subject: mm: make alloc_contig_range handle free hugetlb pages alloc_contig_range will fail if it ever sees a HugeTLB page within the range we are trying to allocate, even when that page is free and can be easily reallocated. This has proved to be problematic for some users of alloc_contic_range, e.g: CMA and virtio-mem, where those would fail the call even when those pages lay in ZONE_MOVABLE and are free. We can do better by trying to replace such page. Free hugepages are tricky to handle so as to no userspace application notices disruption, we need to replace the current free hugepage with a new one. In order to do that, a new function called alloc_and_dissolve_huge_page is introduced. This function will first try to get a new fresh hugepage, and if it succeeds, it will replace the old one in the free hugepage pool. The free page replacement is done under hugetlb_lock, so no external users of hugetlb will notice the change. To allocate the new huge page, we use alloc_buddy_huge_page(), so we do not have to deal with any counters, and prep_new_huge_page() is not called. This is valulable because in case we need to free the new page, we only need to call __free_pages(). Once we know that the page to be replaced is a genuine 0-refcounted huge page, we remove the old page from the freelist by remove_hugetlb_page(). Then, we can call __prep_new_huge_page() and __prep_account_new_huge_page() for the new huge page to properly initialize it and increment the hstate->nr_huge_pages counter (previously decremented by remove_hugetlb_page()). Once done, the page is enqueued by enqueue_huge_page() and it is ready to be used. There is one tricky case when page's refcount is 0 because it is in the process of being released. A missing PageHugeFreed bit will tell us that freeing is in flight so we retry after dropping the hugetlb_lock. The race window should be small and the next retry should make a forward progress. E.g: CPU0 CPU1 free_huge_page() isolate_or_dissolve_huge_page PageHuge() == T alloc_and_dissolve_huge_page alloc_buddy_huge_page() spin_lock_irq(hugetlb_lock) // PageHuge() && !PageHugeFreed && // !PageCount() spin_unlock_irq(hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) update_and_free_page PageHuge() == F __free_pages() 2) enqueue_huge_page SetPageHugeFreed() spin_unlock_irq(&hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) PageHuge() == F (freed by case#1 from CPU0) 2) PageHuge() == T PageHugeFreed() == T - proceed with replacing the page In the case above we retry as the window race is quite small and we have high chances to succeed next time. With regard to the allocation, we restrict it to the node the page belongs to with __GFP_THISNODE, meaning we do not fallback on other node's zones. Note that gigantic hugetlb pages are fenced off since there is a cyclic dependency between them and alloc_contig_range. Link: https://lkml.kernel.org/r/20210419075413.1064-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++ mm/compaction.c | 33 ++++++++++++-- mm/hugetlb.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 628639422c5d..ec6a10b8860a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,6 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; +int isolate_or_dissolve_huge_page(struct page *page); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline int isolate_or_dissolve_huge_page(struct page *page) +{ + return -ENOMEM; +} + static inline struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) diff --git a/mm/compaction.c b/mm/compaction.c index c4d8007221b7..b77e1382307f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat) * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, - * or 0. + * -ENOMEM in case we could not allocate a page, or 0. * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), @@ -906,6 +906,29 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } + if (PageHuge(page) && cc->alloc_contig) { + ret = isolate_or_dissolve_huge_page(page); + + /* + * Fail isolation in case isolate_or_dissolve_huge_page() + * reports an error. In case of -ENOMEM, abort right away. + */ + if (ret < 0) { + /* Do not report -EBUSY down the chain */ + if (ret == -EBUSY) + ret = 0; + low_pfn += (1UL << compound_order(page)) - 1; + goto isolate_fail; + } + + /* + * Ok, the hugepage was dissolved. Now these pages are + * Buddy and cannot be re-allocated because they are + * isolated. Fall-through as the check below handles + * Buddy pages. + */ + } + /* * Skip if free. We read page order here without zone lock * which is generally unsafe, but the race window is small and @@ -1065,7 +1088,7 @@ isolate_fail_put: put_page(page); isolate_fail: - if (!skip_on_failure) + if (!skip_on_failure && ret != -ENOMEM) continue; /* @@ -1091,6 +1114,9 @@ isolate_fail: */ next_skip_pfn += 1UL << cc->order; } + + if (ret == -ENOMEM) + break; } /* @@ -1143,7 +1169,8 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0. + * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM + * in case we could not allocate a page, or 0. */ int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 63760be2688e..92f3cd08946f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2267,6 +2267,122 @@ static void restore_reserve_on_error(struct hstate *h, } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, fail for now. + */ + ret = -EBUSY; + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page) +{ + struct hstate *h; + struct page *head; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. + */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + return alloc_and_dissolve_huge_page(h, head); +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { -- cgit v1.2.3-58-ga151 From ae37c7ff79f1f030e28ec76c46ee032f8fd07607 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:29 -0700 Subject: mm: make alloc_contig_range handle in-use hugetlb pages alloc_contig_range() will fail if it finds a HugeTLB page within the range, without a chance to handle them. Since HugeTLB pages can be migrated as any LRU or Movable page, it does not make sense to bail out without trying. Enable the interface to recognize in-use HugeTLB pages so we can migrate them, and have much better chances to succeed the call. Link: https://lkml.kernel.org/r/20210419075413.1064-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- mm/compaction.c | 12 +++++++++++- mm/hugetlb.c | 22 +++++++++++++++++----- mm/vmscan.c | 5 +++-- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec6a10b8860a..d0f310ae3f82 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,7 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; -int isolate_or_dissolve_huge_page(struct page *page); +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -871,7 +871,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -static inline int isolate_or_dissolve_huge_page(struct page *page) +static inline int isolate_or_dissolve_huge_page(struct page *page, + struct list_head *list) { return -ENOMEM; } diff --git a/mm/compaction.c b/mm/compaction.c index b77e1382307f..335862f1661c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -907,7 +907,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page) && cc->alloc_contig) { - ret = isolate_or_dissolve_huge_page(page); + ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); /* * Fail isolation in case isolate_or_dissolve_huge_page() @@ -921,6 +921,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } + if (PageHuge(page)) { + /* + * Hugepage was successfully isolated and placed + * on the cc->migratepages list. + */ + low_pfn += compound_nr(page) - 1; + goto isolate_success_no_list; + } + /* * Ok, the hugepage was dissolved. Now these pages are * Buddy and cannot be re-allocated because they are @@ -1062,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); +isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92f3cd08946f..b5977d9709ad 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2271,9 +2271,11 @@ static void restore_reserve_on_error(struct hstate *h, * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one * @h: struct hstate old page belongs to * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page) +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); @@ -2300,9 +2302,13 @@ retry: goto free_new; } else if (page_count(old_page)) { /* - * Someone has grabbed the page, fail for now. + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. */ - ret = -EBUSY; + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!HPageFreed(old_page)) { /* @@ -2352,10 +2358,11 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page) +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; struct page *head; + int ret = -EBUSY; /* * The page might have been dissolved from under our feet, so make sure @@ -2380,7 +2387,12 @@ int isolate_or_dissolve_huge_page(struct page *page) if (hstate_is_gigantic(h)) return -ENOMEM; - return alloc_and_dissolve_huge_page(h, head); + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; } struct page *alloc_huge_page(struct vm_area_struct *vma, diff --git a/mm/vmscan.c b/mm/vmscan.c index 562e87cbd7a1..42aaef30633e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1507,8 +1507,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_lru(page) && !PageDirty(page) && - !__PageMovable(page) && !PageUnevictable(page)) { + if (!PageHuge(page) && page_is_file_lru(page) && + !PageDirty(page) && !__PageMovable(page) && + !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } -- cgit v1.2.3-58-ga151 From eb14d4eefdc4f0051a63973124f431798e16a8b2 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:33 -0700 Subject: mm,page_alloc: drop unnecessary checks from pfn_range_valid_contig pfn_range_valid_contig() bails out when it finds an in-use page or a hugetlb page, among other things. We can drop the in-use page check since __alloc_contig_pages can migrate away those pages, and the hugetlb page check can go too since isolate_migratepages_range is now capable of dealing with hugetlb pages. Either way, those checks are racy so let the end function handle it when the time comes. Link: https://lkml.kernel.org/r/20210419075413.1064-8-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Mike Kravetz Acked-by: Michal Hocko Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 656f86a98554..80fa6e0f9ed9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8898,12 +8898,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; } return true; } -- cgit v1.2.3-58-ga151 From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:36 -0700 Subject: userfaultfd: add minor fault registration mode Patch series "userfaultfd: add minor fault handling", v9. Overview ======== This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS. When enabled (via the UFFDIO_API ioctl), this feature means that any hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also* get events for "minor" faults. By "minor" fault, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s) (shared memory). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE. The idea is, userspace resolves the fault by either a) doing nothing if the contents are already correct, or b) updating the underlying contents using the second, non-UFFD mapping (via memcpy/memset or similar, or something fancier like RDMA, or etc...). In either case, userspace issues UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Use Case ======== Consider the use case of VM live migration (e.g. under QEMU/KVM): 1. While a VM is still running, we copy the contents of its memory to a target machine. The pages are populated on the target by writing to the non-UFFD mapping, using the setup described above. The VM is still running (and therefore its memory is likely changing), so this may be repeated several times, until we decide the target is "up to date enough". 2. We pause the VM on the source, and start executing on the target machine. During this gap, the VM's user(s) will *see* a pause, so it is desirable to minimize this window. 3. Between the last time any page was copied from the source to the target, and when the VM was paused, the contents of that page may have changed - and therefore the copy we have on the target machine is out of date. Although we can keep track of which pages are out of date, for VMs with large amounts of memory, it is "slow" to transfer this information to the target machine. We want to resume execution before such a transfer would complete. 4. So, the guest begins executing on the target machine. The first time it touches its memory (via the UFFD-registered mapping), userspace wants to intercept this fault. Userspace checks whether or not the page is up to date, and if not, copies the updated page from the source machine, via the non-UFFD mapping. Finally, whether a copy was performed or not, userspace issues a UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". We don't have to do all of the final updates on-demand. The userfaultfd manager can, in the background, also copy over updated pages once it receives the map of which pages are up-to-date or not. Interaction with Existing APIs ============================== Because this is a feature, a registered VMA could potentially receive both missing and minor faults. I spent some time thinking through how the existing API interacts with the new feature: UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not allocate a new page. If UFFDIO_CONTINUE is used on a non-minor fault: - For non-shared memory or shmem, -EINVAL is returned. - For hugetlb, -EFAULT is returned. UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults. Without modifications, the existing codepath assumes a new page needs to be allocated. This is okay, since userspace must have a second non-UFFD-registered mapping anyway, thus there isn't much reason to want to use these in any case (just memcpy or memset or similar). - If UFFDIO_COPY is used on a minor fault, -EEXIST is returned. - If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case). - UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns -ENOENT in that case (regardless of the kind of fault). Future Work =========== This series only supports hugetlbfs. I have a second series in flight to support shmem as well, extending the functionality. This series is more mature than the shmem support at this point, and the functionality works fully on hugetlbfs, so this series can be merged first and then shmem support will follow. This patch (of 6): This feature allows userspace to intercept "minor" faults. By "minor" faults, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. This commit adds the new registration mode, and sets the relevant flag on the VMAs being registered. In the hugetlb fault path, if we find that we have huge_pte_none(), but find_lock_page() does indeed find an existing page, then we have a "minor" fault, and if the VMA has the userfaultfd registration flag, we call into userfaultfd to handle it. This is implemented as a new registration mode, instead of an API feature. This is because the alternative implementation has significant drawbacks [1]. However, doing it this was requires we allocate a VM_* flag for the new registration mode. On 32-bit systems, there are no unused bits, so this feature is only supported on architectures with CONFIG_ARCH_USES_HIGH_VMA_FLAGS. When attempting to register a VMA in MINOR mode on 32-bit architectures, we return -EINVAL. [1] https://lore.kernel.org/patchwork/patch/1380226/ [peterx@redhat.com: fix minor fault page leak] Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Chinwen Chang Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Xu Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Rostedt Cc: Steven Price Cc: Vlastimil Babka Cc: Adam Ruprecht Cc: Axel Rasmussen Cc: Cannon Matthews Cc: "Dr . David Alan Gilbert" Cc: David Rientjes Cc: Mina Almasry Cc: Oliver Upton Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 3 ++ fs/userfaultfd.c | 78 +++++++++++++++++++++++---------------- include/linux/mm.h | 7 ++++ include/linux/userfaultfd_k.h | 15 +++++++- include/trace/events/mmflags.h | 7 ++++ include/uapi/linux/userfaultfd.h | 15 +++++++- init/Kconfig | 5 +++ mm/hugetlb.c | 80 ++++++++++++++++++++++++++-------------- 10 files changed, 150 insertions(+), 62 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f2a80091337..04c69f606537 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -213,6 +213,7 @@ config ARM64 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD help ARM 64-bit (AArch64) Linux support. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dac15f646f79..1c350e8782ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD + select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_WITHIN_STACK_FRAMES diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e862cab69583..fc9784544b24 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + [ilog2(VM_UFFD_MINOR)] = "ui", +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ }; size_t i; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e5ce3b4e6c3d..ba35cafa8b0d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -197,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; msg.arg.pagefault.address = address; + /* + * These flags indicate why the userfault occurred: + * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. + * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. + * - Neither of these flags being set indicates a MISSING fault. + * + * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write + * fault. Otherwise, it was a read fault. + */ if (flags & FAULT_FLAG_WRITE) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE - * was not set in a UFFD_EVENT_PAGEFAULT, it means it - * was a read fault, otherwise if set it means it's - * a write fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was - * not set in a UFFD_EVENT_PAGEFAULT, it means it was - * a missing fault, otherwise if set it means it's a - * write protect fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (reason & VM_UFFD_MINOR) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; @@ -401,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm); - VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); - VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + /* Any unrecognized flag is a bug. */ + VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + /* 0 or > 1 flags set is a bug; we expect exactly 1. */ + VM_BUG_ON(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -612,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } mmap_write_unlock(mm); @@ -644,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; return 0; } @@ -726,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } } @@ -867,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1262,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { /* FIXME: add WP support to hugetlbfs and shmem */ - return vma_is_anonymous(vma) || - ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && - !(vm_flags & VM_UFFD_WP)); + if (vm_flags & VM_UFFD_WP) { + if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) + return false; + } + + if (vm_flags & VM_UFFD_MINOR) { + /* FIXME: Add minor fault interception for shmem. */ + if (!is_vm_hugetlb_page(vma)) + return false; + } + + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1290,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!uffdio_register.mode) goto out; - if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| - UFFDIO_REGISTER_MODE_WP)) + if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + goto out; +#endif + vm_flags |= VM_UFFD_MINOR; + } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1341,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; @@ -1421,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & - ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1544,7 +1557,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* * Check not compatible vmas, not strictly required @@ -1595,7 +1608,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1863,6 +1876,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto err_out; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; +#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) diff --git a/include/linux/mm.h b/include/linux/mm.h index 011f43605807..1dbb53c44243 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c63ccdae3eab..0390e5ac63b3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -17,6 +17,9 @@ #include #include +/* The set of all possible UFFD-related VM flags. */ +#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want @@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MINOR; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { @@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, static inline bool userfaultfd_armed(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); + return vma->vm_flags & __VM_UFFD_FLAGS; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); @@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..629c7a0eaff2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define IF_HAVE_VM_SOFTDIRTY(flag,name) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name}, +#else +# define IF_HAVE_UFFD_MINOR(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ {VM_UFFD_MISSING, "uffd_missing" }, \ +IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 5f2d88212f7c..f24dd4fcbad9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,15 +19,19 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ - UFFD_FEATURE_THREAD_ID) + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -127,6 +131,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -171,6 +176,10 @@ struct uffdio_api { * * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -181,6 +190,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) __u64 features; __u64 ioctls; @@ -195,6 +205,7 @@ struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* diff --git a/init/Kconfig b/init/Kconfig index 9acb7762e971..1413413fcb9f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP help Arch has userfaultfd write protection support +config HAVE_ARCH_USERFAULTFD_MINOR + bool + help + Arch has userfaultfd minor fault support + config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b5977d9709ad..84530876b2ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4469,6 +4469,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long reason) +{ + vm_fault_t ret; + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, reason); + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + return ret; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -4507,35 +4545,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, retry: page = find_lock_page(mapping, idx); if (!page) { - /* - * Check for page in userfault range - */ + /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .flags = flags, - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; - - /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. - */ - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MISSING); goto out; } @@ -4591,6 +4605,16 @@ retry: VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MINOR); + goto out; + } } /* -- cgit v1.2.3-58-ga151 From 0d9cadabd193c6008d256533f544de8206fd3a80 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:40 -0700 Subject: userfaultfd: disable huge PMD sharing for MINOR registered VMAs As the comment says: for the MINOR fault use case, although the page might be present and populated in the other (non-UFFD-registered) half of the mapping, it may be out of date, and we explicitly want userspace to get a minor fault so it can check and potentially update the page's contents. Huge PMD sharing would prevent these faults from occurring for suitably aligned areas, so disable it upon UFFD registration. Link: https://lkml.kernel.org/r/20210301222728.176417-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0390e5ac63b3..e060d5f77cc5 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,12 +56,19 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, } /* - * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp - * protect information is per pgtable entry. + * Never enable huge pmd sharing on some uffd registered vmas: + * + * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. + * + * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for + * VMAs which share huge pmds. (If you have two mappings to the same + * underlying pages, and fault in the non-UFFD-registered one with a write, + * with huge pmd sharing this would *also* setup the second UFFD-registered + * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { - return vma->vm_flags & VM_UFFD_WP; + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) -- cgit v1.2.3-58-ga151 From 714c189108244f1df579689061db1d785d92e7e2 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:45 -0700 Subject: userfaultfd: hugetlbfs: only compile UFFD helpers if config enabled For background, mm/userfaultfd.c provides a general mcopy_atomic implementation. But some types of memory (i.e., hugetlb and shmem) need a slightly different implementation, so they provide their own helpers for this. In other words, userfaultfd is the only caller of these functions. This patch achieves two things: 1. Don't spend time compiling code which will end up never being referenced anyway (a small build time optimization). 2. In patches later in this series, we extend the signature of these helpers with UFFD-specific state (a mode enumeration). Once this happens, we *have to* either not compile the helpers, or unconditionally define the UFFD-only state (which seems messier to me). This includes the declarations in the headers, as otherwise they'd yield warnings about implicitly defining the type of those arguments. Link: https://lkml.kernel.org/r/20210301222728.176417-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d0f310ae3f82..a1dbe4568707 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -134,11 +134,13 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef CONFIG_USERFAULTFD int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +#endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -310,6 +312,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, BUG(); } +#ifdef CONFIG_USERFAULTFD static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, @@ -320,6 +323,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, BUG(); return 0; } +#endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 84530876b2ae..b105a455124d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4855,6 +4855,7 @@ out_mutex: return ret; } +#ifdef CONFIG_USERFAULTFD /* * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with * modifications for huge pages. @@ -4985,6 +4986,7 @@ out_release_nounlock: put_page(page); goto out; } +#endif /* CONFIG_USERFAULTFD */ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, int refs, struct page **pages, -- cgit v1.2.3-58-ga151 From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:49 -0700 Subject: userfaultfd: add UFFDIO_CONTINUE ioctl This ioctl is how userspace ought to resolve "minor" userfaults. The idea is, userspace is notified that a minor fault has occurred. It might change the contents of the page using its second non-UFFD mapping, or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in the minor fault case, we already have some pre-existing underlying page. Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping. We'd just use memcpy() or similar instead. It turns out hugetlb_mcopy_atomic_pte() already does very close to what we want, if an existing page is provided via `struct page **pagep`. We already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so just extend that design: add an enum for the three modes of operation, and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE case. (Basically, look up the existing page, and avoid adding the existing page to the page cache or calling set_page_huge_active() on it.) Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 67 ++++++++++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 3 ++ include/linux/userfaultfd_k.h | 18 +++++++++++ include/uapi/linux/userfaultfd.h | 21 +++++++++++-- mm/hugetlb.c | 40 +++++++++++++++--------- mm/userfaultfd.c | 37 +++++++++++++--------- 6 files changed, 156 insertions(+), 30 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ba35cafa8b0d..14f92285d04f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1487,6 +1487,10 @@ out_unlock: if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + /* CONTINUE ioctl is only supported for MINOR ranges. */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to @@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return ret; } +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_continue uffdio_continue; + struct uffdio_continue __user *user_uffdio_continue; + struct userfaultfd_wake_range range; + + user_uffdio_continue = (struct uffdio_continue __user *)arg; + + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, + /* don't copy the output fields */ + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, &uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out; + + ret = -EINVAL; + /* double check for wraparound just in case. */ + if (uffdio_continue.range.start + uffdio_continue.range.len <= + uffdio_continue.range.start) { + goto out; + } + if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { + range.start = uffdio_continue.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_WRITEPROTECT: ret = userfaultfd_writeprotect(ctx, arg); break; + case UFFDIO_CONTINUE: + ret = userfaultfd_continue(ctx, arg); + break; } return ret; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a1dbe4568707..b92f25ccef58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, @@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { BUG(); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e060d5f77cc5..794d1538b8ba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* + * The mode of operation for __mcopy_atomic and its helpers. + * + * This is almost an implementation detail (mcopy_atomic below doesn't take this + * as a parameter), but it's exposed here because memory-kind-specific + * implementations (e.g. hugetlbfs) need to know the mode of operation. + */ +enum mcopy_atomic_mode { + /* A normal copy_from_user into the destination range. */ + MCOPY_ATOMIC_NORMAL, + /* Don't copy; map the destination range to the zero page. */ + MCOPY_ATOMIC_ZEROPAGE, + /* Just install pte(s) with the existing page(s) in the page cache. */ + MCOPY_ATOMIC_CONTINUE, +}; + extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode); @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, bool *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, bool *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index f24dd4fcbad9..bafbeb1a2624 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,10 +40,12 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_CONTINUE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -59,6 +61,7 @@ #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -77,6 +80,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -268,6 +273,18 @@ struct uffdio_writeprotect { __u64 mode; }; +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + /* * Flags for the userfaultfd(2) system call itself. */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b105a455124d..533e5a26e437 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include "internal.h" @@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct address_space *mapping; pgoff_t idx; unsigned long size; @@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spinlock_t *ptl; int ret; struct page *page; + int writable; - if (!*pagep) { + mapping = dst_vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + } else if (!*pagep) { ret = -ENOMEM; page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) @@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, */ __SetPageUptodate(page); - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - - /* - * If shared, add to page cache - */ - if (vm_shared) { + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) @@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); - if (dst_vma->vm_flags & VM_WRITE) + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ + if (is_continue && !vm_shared) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + if (writable) _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); @@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - SetHPageMigratable(page); - if (vm_shared) + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); - if (vm_shared) + if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: put_page(page); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 063cbb17e8d8..e14b3820c6a8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + enum mcopy_atomic_mode mode) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (zeropage) { + if (mode == MCOPY_ATOMIC_ZEROPAGE) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -273,8 +273,6 @@ retry: } while (src_addr < src_start + len) { - pte_t dst_pteval; - BUG_ON(dst_addr >= dst_start + len); /* @@ -297,16 +295,16 @@ retry: goto out_unlock; } - err = -EEXIST; - dst_pteval = huge_ptep_get(dst_pte); - if (!huge_pte_none(dst_pteval)) { + if (mode != MCOPY_ATOMIC_CONTINUE && + !huge_pte_none(huge_ptep_get(dst_pte))) { + err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, - dst_addr, src_addr, &page); + dst_addr, src_addr, mode, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage); + enum mcopy_atomic_mode mode); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage, + enum mcopy_atomic_mode mcopy_mode, bool *mmap_changing, __u64 mode) { @@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; + bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -527,10 +526,12 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, zeropage); + src_start, len, mcopy_mode); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; + if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page @@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing, mode); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, + mmap_changing, 0); +} + +ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool *mmap_changing) +{ + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, + mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, -- cgit v1.2.3-58-ga151 From b8da5cd4e5f1ce1274140e200a9116b7fe61dd87 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:53 -0700 Subject: userfaultfd: update documentation to describe minor fault handling Reword / reorganize things a little bit into "lists", so new features / modes / ioctls can sort of just be appended. Describe how UFFDIO_REGISTER_MODE_MINOR and UFFDIO_CONTINUE can be used to intercept and resolve minor faults. Make it clear that COPY and ZEROPAGE are used for MISSING faults, whereas CONTINUE is used for MINOR faults. Link: https://lkml.kernel.org/r/20210301222728.176417-6-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/userfaultfd.rst | 107 +++++++++++++++++---------- 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 65eefa66c0ba..3aa38e8b8361 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -63,36 +63,36 @@ the generic ioctl available. The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl defines what memory types are supported by the ``userfaultfd`` and what -events, except page fault notifications, may be generated. - -If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs -virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in -``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be -set if the kernel supports registering ``userfaultfd`` ranges on shared -memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, -``MAP_SHARED``, ``memfd_create``, etc). - -The userland application that wants to use ``userfaultfd`` with hugetlbfs -or shared memory need to set the corresponding flag in -``uffdio_api.features`` to enable those features. - -If the userland desires to receive notifications for events other than -page faults, it has to verify that ``uffdio_api.features`` has appropriate -``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more -detail below in `Non-cooperative userfaultfd`_ section. - -Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should -be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to -register a memory range in the ``userfaultfd`` by setting the +events, except page fault notifications, may be generated: + +- The ``UFFD_FEATURE_EVENT_*`` flags indicate that various other events + other than page faults are supported. These events are described in more + detail below in the `Non-cooperative userfaultfd`_ section. + +- ``UFFD_FEATURE_MISSING_HUGETLBFS`` and ``UFFD_FEATURE_MISSING_SHMEM`` + indicate that the kernel supports ``UFFDIO_REGISTER_MODE_MISSING`` + registrations for hugetlbfs and shared memory (covering all shmem APIs, + i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, ``MAP_SHARED``, ``memfd_create``, + etc) virtual memory areas, respectively. + +- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports + ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory + areas. + +The userland application should set the feature flags it intends to use +when invoking the ``UFFDIO_API`` ioctl, to request that those features be +enabled if supported. + +Once the ``userfaultfd`` API has been enabled the ``UFFDIO_REGISTER`` +ioctl should be invoked (if present in the returned ``uffdio_api.ioctls`` +bitmask) to register a memory range in the ``userfaultfd`` by setting the uffdio_register structure accordingly. The ``uffdio_register.mode`` bitmask will specify to the kernel which kind of faults to track for -the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing -pages). The ``UFFDIO_REGISTER`` ioctl will return the +the range. The ``UFFDIO_REGISTER`` ioctl will return the ``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve userfaults on the range registered. Not all ioctls will necessarily be -supported for all memory types depending on the underlying virtual -memory backend (anonymous memory vs tmpfs vs real filebacked -mappings). +supported for all memory types (e.g. anonymous memory vs. shmem vs. +hugetlbfs), or all types of intercepted faults. Userland can use the ``uffdio_register.ioctls`` to manage the virtual address space in the background (to add or potentially also remove @@ -100,21 +100,46 @@ memory from the ``userfaultfd`` registered range). This means a userfault could be triggering just before userland maps in the background the user-faulted page. -The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That -atomically copies a page into the userfault registered range and wakes -up the blocked userfaults -(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set). -Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in -guaranteeing that nothing can see an half copied page since it'll -keep userfaulting until the copy has finished. +Resolving Userfaults +-------------------- + +There are three basic ways to resolve userfaults: + +- ``UFFDIO_COPY`` atomically copies some existing page contents from + userspace. + +- ``UFFDIO_ZEROPAGE`` atomically zeros the new page. + +- ``UFFDIO_CONTINUE`` maps an existing, previously-populated page. + +These operations are atomic in the sense that they guarantee nothing can +see a half-populated page, since readers will keep userfaulting until the +operation has finished. + +By default, these wake up userfaults blocked on the range in question. +They support a ``UFFDIO_*_MODE_DONTWAKE`` ``mode`` flag, which indicates +that waking will be done separately at some later time. + +Which ioctl to choose depends on the kind of page fault, and what we'd +like to do to resolve it: + +- For ``UFFDIO_REGISTER_MODE_MISSING`` faults, the fault needs to be + resolved by either providing a new page (``UFFDIO_COPY``), or mapping + the zero page (``UFFDIO_ZEROPAGE``). By default, the kernel would map + the zero page for a missing fault. With userfaultfd, userspace can + decide what content to provide before the faulting thread continues. + +- For ``UFFDIO_REGISTER_MODE_MINOR`` faults, there is an existing page (in + the page cache). Userspace has the option of modifying the page's + contents before resolving the fault. Once the contents are correct + (modified or not), userspace asks the kernel to map the page and let the + faulting thread continue with ``UFFDIO_CONTINUE``. Notes: -- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then - you must provide some kind of page in your thread after reading from - the uffd. You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``. - The normal behavior of the OS automatically providing a zero page on - an anonymous mmaping is not in place. +- You can tell which kind of fault occurred by examining + ``pagefault.flags`` within the ``uffd_msg``, checking for the + ``UFFD_PAGEFAULT_FLAG_*`` flags. - None of the page-delivering ioctls default to the range that you registered with. You must fill in all fields for the appropriate @@ -122,9 +147,9 @@ Notes: - You get the address of the access that triggered the missing page event out of a struct uffd_msg that you read in the thread from the - uffd. You can supply as many pages as you want with ``UFFDIO_COPY`` or - ``UFFDIO_ZEROPAGE``. Keep in mind that unless you used DONTWAKE then - the first of any of those IOCTLs wakes up the faulting thread. + uffd. You can supply as many pages as you want with these IOCTLs. + Keep in mind that unless you used DONTWAKE then the first of any of + those IOCTLs wakes up the faulting thread. - Be sure to test for all errors including (``pollfd[0].revents & POLLERR``). This can happen, e.g. when ranges -- cgit v1.2.3-58-ga151 From f0fa94330919be8ec5620382b50f1c72844c9224 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:57 -0700 Subject: userfaultfd/selftests: add test exercising minor fault handling Fix a dormant bug in userfaultfd_events_test(), where we did `return faulting_process(0)` instead of `exit(faulting_process(0))`. This caused the forked process to keep running, trying to execute any further test cases after the events test in parallel with the "real" process. Add a simple test case which exercises minor faults. In short, it does the following: 1. "Sets up" an area (area_dst) and a second shared mapping to the same underlying pages (area_dst_alias). 2. Register one of these areas with userfaultfd, in minor fault mode. 3. Start a second thread to handle any minor faults. 4. Populate the underlying pages with the non-UFFD-registered side of the mapping. Basically, memset() each page with some arbitrary contents. 5. Then, using the UFFD-registered mapping, read all of the page contents, asserting that the contents match expectations (we expect the minor fault handling thread can modify the page contents before resolving the fault). The minor fault handling thread, upon receiving an event, flips all the bits (~) in that page, just to prove that it can modify it in some arbitrary way. Then it issues a UFFDIO_CONTINUE ioctl, to setup the mapping and resolve the fault. The reading thread should wake up and see this modification. Currently the minor fault test is only enabled in hugetlb_shared mode, as this is the only configuration the kernel feature supports. Link: https://lkml.kernel.org/r/20210301222728.176417-7-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 164 +++++++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 92b8ec423201..f5ab5e0312e7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -81,6 +81,8 @@ static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; /* Whether to test uffd write-protection */ static bool test_uffdio_wp = false; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; static bool map_shared; static int huge_fd; @@ -96,6 +98,7 @@ struct uffd_stats { int cpu; unsigned long missing_faults; unsigned long wp_faults; + unsigned long minor_faults; }; /* pthread_mutex_t starts at page offset 0 */ @@ -153,17 +156,19 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, uffd_stats[i].cpu = i; uffd_stats[i].missing_faults = 0; uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; } } static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) { int i; - unsigned long long miss_total = 0, wp_total = 0; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; for (i = 0; i < n_cpus; i++) { miss_total += stats[i].missing_faults; wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; } printf("userfaults: %llu missing (", miss_total); @@ -172,6 +177,9 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) printf("\b), %llu wp (", wp_total); for (i = 0; i < n_cpus; i++) printf("%lu+", stats[i].wp_faults); + printf("\b), %llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); printf("\b)\n"); } @@ -328,7 +336,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = { }; static struct uffd_test_ops hugetlb_uffd_test_ops = { - .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE), .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, @@ -362,6 +370,22 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) } } +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) { + fprintf(stderr, + "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n", + (uint64_t)start); + exit(1); + } +} + static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -569,8 +593,32 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, } if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ wp_range(uffd, msg->arg.pagefault.address, page_size, false); stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; } else { /* Missing page faults */ if (bounces & BOUNCE_VERIFY && @@ -779,7 +827,7 @@ static int stress(struct uffd_stats *uffd_stats) return 0; } -static int userfaultfd_open(int features) +static int userfaultfd_open_ext(uint64_t *features) { struct uffdio_api uffdio_api; @@ -792,7 +840,7 @@ static int userfaultfd_open(int features) uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; - uffdio_api.features = features; + uffdio_api.features = *features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " "run with either root or ptrace capability.\n"); @@ -804,9 +852,15 @@ static int userfaultfd_open(int features) return 1; } + *features = uffdio_api.features; return 0; } +static int userfaultfd_open(uint64_t features) +{ + return userfaultfd_open_ext(&features); +} + sigjmp_buf jbuf, *sigbuf; static void sighndl(int sig, siginfo_t *siginfo, void *ptr) @@ -1112,7 +1166,7 @@ static int userfaultfd_events_test(void) } if (!pid) - return faulting_process(0); + exit(faulting_process(0)); waitpid(pid, &err, 0); if (err) { @@ -1215,6 +1269,102 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +static int userfaultfd_minor_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + unsigned long p; + pthread_t uffd_mon; + uint8_t expected_byte; + void *expected_page; + char c; + struct uffd_stats stats = { 0 }; + uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + if (uffd_test_ops->release_pages(area_dst)) + return 1; + + if (userfaultfd_open_ext(&features)) + return 1; + /* If kernel reports the feature isn't supported, skip the test. */ + if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { + printf("skipping test due to lack of feature support\n"); + fflush(stdout); + return 0; + } + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } + + expected_ioctls = uffd_test_ops->expected_ioctls; + expected_ioctls |= 1 << _UFFDIO_CONTINUE; + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + fprintf(stderr, "unexpected missing ioctl(s)\n"); + exit(1); + } + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { + perror("uffd_poll_thread create"); + exit(1); + } + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + + if (posix_memalign(&expected_page, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + + for (p = 0; p < nr_pages; ++p) { + expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, area_dst_alias + (p * page_size), + page_size)) { + fprintf(stderr, + "unexpected page contents after minor fault\n"); + exit(1); + } + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { + perror("pipe write"); + exit(1); + } + if (pthread_join(uffd_mon, NULL)) + return 1; + + close(uffd); + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + static int userfaultfd_stress(void) { void *area; @@ -1413,7 +1563,7 @@ static int userfaultfd_stress(void) close(uffd); return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test(); + || userfaultfd_events_test() || userfaultfd_minor_test(); } /* @@ -1454,6 +1604,8 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; } else if (!strcmp(type, "shmem")) { map_shared = true; test_type = TEST_SHMEM; -- cgit v1.2.3-58-ga151 From b6676de8d7b48724d4cd3a3742c62fa525baa904 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:01 -0700 Subject: mm/vmscan: move RECLAIM* bits to uapi header It is currently not obvious that the RECLAIM_* bits are part of the uapi since they are defined in vmscan.c. Move them to a uapi header to make it obvious. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172557.08074910@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Alex Shi Cc: Daniel Wagner Cc: "Tobin C. Harding" Cc: Christoph Lameter Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 7 +++++++ mm/vmscan.c | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8948467b3992..4832fd0b5642 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,5 +64,12 @@ enum { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 42aaef30633e..671143dbf809 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4086,14 +4086,6 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -/* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. - */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ - /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of -- cgit v1.2.3-58-ga151 From 202e35db5e719ee8af6028183403f475e243f82d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:04 -0700 Subject: mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks RECLAIM_ZONE was assumed to be unused because it was never explicitly used in the kernel. However, there were a number of places where it was checked implicitly by checking 'node_reclaim_mode' for a zero value. These zero checks are not great because it is not obvious what a zero mode *means* in the code. Replace them with a helper which makes it more obvious: node_reclaim_enabled(). This helper also provides a handy place to explicitly check the RECLAIM_ZONE bit itself. Check it explicitly there to make it more obvious where the bit can affect behavior. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172559.BF589C44@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Alex Shi Cc: "Tobin C. Harding" Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Cc: Daniel Wagner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/khugepaged.c | 2 +- mm/page_alloc.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4cc6ec3bf0ab..42191da1bdc9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct notifier_block; @@ -378,6 +379,12 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + extern void check_move_unevictable_pages(struct pagevec *pvec); extern int kswapd_run(int nid); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a03569eda183..ea74da3232ab 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid) * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!node_reclaim_mode) + if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80fa6e0f9ed9..19cdd8a829dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3968,7 +3968,7 @@ retry: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (node_reclaim_mode == 0 || + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; -- cgit v1.2.3-58-ga151 From 8efb4b596df05f004e847d6bfadad3492b766ab3 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:08 -0700 Subject: mm: vmscan: use nid from shrink_control for tracepoint Patch series "Make shrinker's nr_deferred memcg aware", v10. Recently huge amount one-off slab drop was seen on some vfs metadata heavy workloads, it turned out there were huge amount accumulated nr_deferred objects seen by the shrinker. On our production machine, I saw absurd number of nr_deferred shown as the below tracing result: <...>-48776 [032] .... 27970562.458916: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 0 objects to shrink 2531805877005 gfp_flags GFP_HIGHUSER_MOVABLE pgs_scanned 32 lru_pgs 9300 cache items 1667 delta 11 total_scan 833 There are 2.5 trillion deferred objects on one node, assuming all of them are dentry (192 bytes per object), so the total size of deferred on one node is ~480TB. It is definitely ridiculous. I managed to reproduce this problem with kernel build workload plus negative dentry generator. First step, run the below kernel build test script: NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l` cd /root/Buildarea/linux-stable for i in `seq 1500`; do cgcreate -g memory:kern_build echo 4G > /sys/fs/cgroup/memory/kern_build/memory.limit_in_bytes echo 3 > /proc/sys/vm/drop_caches cgexec -g memory:kern_build make clean > /dev/null 2>&1 cgexec -g memory:kern_build make -j$NR_CPUS > /dev/null 2>&1 cgdelete -g memory:kern_build done Then run the below negative dentry generator script: NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l` mkdir /sys/fs/cgroup/memory/test echo $$ > /sys/fs/cgroup/memory/test/tasks for i in `seq $NR_CPUS`; do while true; do FILE=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 64` cat $FILE 2>/dev/null done & done Then kswapd will shrink half of dentry cache in just one loop as the below tracing result showed: kswapd0-475 [028] .... 305968.252561: mm_shrink_slab_start: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 objects to shrink 4994376020 gfp_flags GFP_KERNEL cache items 93689873 delta 45746 total_scan 46844936 priority 12 kswapd0-475 [021] .... 306013.099399: mm_shrink_slab_end: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 unused scan count 4994376020 new scan count 4947576838 total_scan 8 last shrinker return val 46844928 There were huge number of deferred objects before the shrinker was called, the behavior does match the code but it might be not desirable from the user's stand of point. The excessive amount of nr_deferred might be accumulated due to various reasons, for example: * GFP_NOFS allocation * Significant times of small amount scan (< scan_batch, 1024 for vfs metadata) However the LRUs of slabs are per memcg (memcg-aware shrinkers) but the deferred objects is per shrinker, this may have some bad effects: * Poor isolation among memcgs. Some memcgs which happen to have frequent limit reclaim may get nr_deferred accumulated to a huge number, then other innocent memcgs may take the fall. In our case the main workload was hit. * Unbounded deferred objects. There is no cap for deferred objects, it can outgrow ridiculously as the tracing result showed. * Easy to get out of control. Although shrinkers take into account deferred objects, but it can go out of control easily. One misconfigured memcg could incur absurd amount of deferred objects in a period of time. * Sort of reclaim problems, i.e. over reclaim, long reclaim latency, etc. There may be hundred GB slab caches for vfe metadata heavy workload, shrink half of them may take minutes. We observed latency spike due to the prolonged reclaim. These issues also have been discussed in https://lore.kernel.org/linux-mm/20200916185823.5347-1-shy828301@gmail.com/. The patchset is the outcome of that discussion. So this patchset makes nr_deferred per-memcg to tackle the problem. It does: * Have memcg_shrinker_deferred per memcg per node, just like what shrinker_map does. Instead it is an atomic_long_t array, each element represent one shrinker even though the shrinker is not memcg aware, this simplifies the implementation. For memcg aware shrinkers, the deferred objects are just accumulated to its own memcg. The shrinkers just see nr_deferred from its own memcg. Non memcg aware shrinkers still use global nr_deferred from struct shrinker. * Once the memcg is offlined, its nr_deferred will be reparented to its parent along with LRUs. * The root memcg has memcg_shrinker_deferred array too. It simplifies the handling of reparenting to root memcg. * Cap nr_deferred to 2x of the length of lru. The idea is borrowed from Dave Chinner's series (https://lore.kernel.org/linux-xfs/20191031234618.15403-1-david@fromorbit.com/) The downside is each memcg has to allocate extra memory to store the nr_deferred array. On our production environment, there are typically around 40 shrinkers, so each memcg needs ~320 bytes. 10K memcgs would need ~3.2MB memory. It seems fine. We have been running the patched kernel on some hosts of our fleet (test and production) for months, it works very well. The monitor data shows the working set is sustained as expected. This patch (of 13): The tracepoint's nid should show what node the shrink happens on, the start tracepoint uses nid from shrinkctl, but the nid might be set to 0 before end tracepoint if the shrinker is not NUMA aware, so the tracing log may show the shrink happens on one node but end up on the other node. It seems confusing. And the following patch will remove using nid directly in do_shrink_slab(), this patch also helps cleanup the code. Link: https://lkml.kernel.org/r/20210311190845.9708-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20210311190845.9708-2-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 671143dbf809..d231af7cae06 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -536,7 +536,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } -- cgit v1.2.3-58-ga151 From 2bfd36374edd9ed7f2ebf66cacebedf7273901cb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:11 -0700 Subject: mm: vmscan: consolidate shrinker_maps handling code The shrinker map management is not purely memcg specific, it is at the intersection between memory cgroup and shrinkers. It's allocation and assignment of a structure, and the only memcg bit is the map is being stored in a memcg structure. So move the shrinker_maps handling code into vmscan.c for tighter integration with shrinker code, and remove the "memcg_" prefix. There is no functional change. Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 ++-- mm/huge_memory.c | 4 +- mm/list_lru.c | 6 +-- mm/memcontrol.c | 130 ++------------------------------------------ mm/vmscan.c | 132 +++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 142 insertions(+), 141 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5904716f29ba..7dbd2c9bad32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1610,10 +1610,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_maps(struct mem_cgroup *memcg); +void free_shrinker_maps(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1623,8 +1622,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b3788683f7d3..98456017744d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2830,8 +2830,8 @@ void deferred_split_huge_page(struct page *page) ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - memcg_set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); diff --git a/mm/list_lru.c b/mm/list_lru.c index 6f067b6b935f..cd58790d0fb3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - memcg_set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, if (src->nr_items) { dst->nr_items += src->nr_items; - memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c100265dc393..09fd17ba6de2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif -static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); - -static void memcg_free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) -{ - struct memcg_shrinker_map *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - - lockdep_assert_held(&memcg_shrinker_map_mutex); - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_map, true); - /* Not yet online memcg */ - if (!old) - return 0; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); - - rcu_assign_pointer(pn->shrinker_map, new); - call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); - } - - return 0; -} - -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; - int nid; - - if (mem_cgroup_is_root(memcg)) - return; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); - } -} - -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - struct memcg_shrinker_map *map; - int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - size = memcg_shrinker_map_size; - for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - memcg_free_shrinker_maps(memcg); - ret = -ENOMEM; - break; - } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); - } - mutex_unlock(&memcg_shrinker_map_mutex); - - return ret; -} - -int memcg_expand_shrinker_maps(int new_id) -{ - int size, old_size, ret = 0; - struct mem_cgroup *memcg; - - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; - if (size <= old_size) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - if (!root_mem_cgroup) - goto unlock; - - for_each_mem_cgroup(memcg) { - if (mem_cgroup_is_root(memcg)) - continue; - ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto unlock; - } - } -unlock: - if (!ret) - memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); - return ret; -} - -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; - - rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); - rcu_read_unlock(); - } -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for memcg_expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_maps() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (memcg_alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_maps(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5310,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - memcg_free_shrinker_maps(memcg); + free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d231af7cae06..bead5ae1e7e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,6 +185,132 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG + +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = rcu_dereference_protected(pn->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(pn->shrinker_map, new); + call_rcu(&old->rcu, free_shrinker_map_rcu); + } + + return 0; +} + +void free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + map = rcu_dereference_protected(pn->shrinker_map, true); + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +int alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); + if (!map) { + free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +static int expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + if (mem_cgroup_is_root(memcg)) + continue; + ret = expand_one_shrinker_map(memcg, size, old_size); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto unlock; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -212,7 +338,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock; if (id >= shrinker_nr_max) { - if (memcg_expand_shrinker_maps(id)) { + if (expand_shrinker_maps(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -590,7 +716,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in - * memcg_set_shrinker_bit(): + * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() @@ -602,7 +728,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; else - memcg_set_shrinker_bit(memcg, nid, i); + set_shrinker_bit(memcg, nid, i); } freed += ret; -- cgit v1.2.3-58-ga151 From d27cf2aa0d26a221982d04757cc32db97833ec29 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:14 -0700 Subject: mm: vmscan: use shrinker_rwsem to protect shrinker_maps allocation Since memcg_shrinker_map_size just can be changed under holding shrinker_rwsem exclusively, the read side can be protected by holding read lock, so it sounds superfluous to have a dedicated mutex. Kirill Tkhai suggested use write lock since: * We want the assignment to shrinker_maps is visible for shrink_slab_memcg(). * The rcu_dereference_protected() dereferrencing in shrink_slab_memcg(), but in case of we use READ lock in alloc_shrinker_maps(), the dereferrencing is not actually protected. * READ lock makes alloc_shrinker_info() racy against memory allocation fail. alloc_shrinker_info()->free_shrinker_info() may free memory right after shrink_slab_memcg() dereferenced it. You may say shrink_slab_memcg()->mem_cgroup_online() protects us from it? Yes, sure, but this is not the thing we want to remember in the future, since this spreads modularity. And a test with heavy paging workload didn't show write lock makes things worse. Link: https://lkml.kernel.org/r/20210311190845.9708-4-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bead5ae1e7e2..d3fba55a0028 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -187,7 +187,6 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -201,8 +200,6 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, struct mem_cgroup_per_node *pn; int nid; - lockdep_assert_held(&memcg_shrinker_map_mutex); - for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = rcu_dereference_protected(pn->shrinker_map, true); @@ -250,7 +247,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) if (mem_cgroup_is_root(memcg)) return 0; - mutex_lock(&memcg_shrinker_map_mutex); + down_write(&shrinker_rwsem); size = memcg_shrinker_map_size; for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); @@ -261,7 +258,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); } - mutex_unlock(&memcg_shrinker_map_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -276,9 +273,10 @@ static int expand_shrinker_maps(int new_id) if (size <= old_size) return 0; - mutex_lock(&memcg_shrinker_map_mutex); if (!root_mem_cgroup) - goto unlock; + goto out; + + lockdep_assert_held(&shrinker_rwsem); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { @@ -287,13 +285,13 @@ static int expand_shrinker_maps(int new_id) ret = expand_one_shrinker_map(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); - goto unlock; + goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -unlock: +out: if (!ret) memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); + return ret; } -- cgit v1.2.3-58-ga151 From a2fb12619f202dcec83f22accc09d48347fd9138 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:17 -0700 Subject: mm: vmscan: remove memcg_shrinker_map_size Both memcg_shrinker_map_size and shrinker_nr_max is maintained, but actually the map size can be calculated via shrinker_nr_max, so it seems unnecessary to keep both. Remove memcg_shrinker_map_size since shrinker_nr_max is also used by iterating the bit map. Link: https://lkml.kernel.org/r/20210311190845.9708-5-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d3fba55a0028..dff5112dfff7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,8 +185,12 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG +static int shrinker_nr_max; -static int memcg_shrinker_map_size; +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -248,7 +252,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) return 0; down_write(&shrinker_rwsem); - size = memcg_shrinker_map_size; + size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); if (!map) { @@ -266,12 +270,13 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) static int expand_shrinker_maps(int new_id) { int size, old_size, ret = 0; + int new_nr_max = new_id + 1; struct mem_cgroup *memcg; - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; + size = shrinker_map_size(new_nr_max); + old_size = shrinker_map_size(shrinker_nr_max); if (size <= old_size) - return 0; + goto out; if (!root_mem_cgroup) goto out; @@ -290,7 +295,7 @@ static int expand_shrinker_maps(int new_id) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) - memcg_shrinker_map_size = size; + shrinker_nr_max = new_nr_max; return ret; } @@ -323,7 +328,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) #define SHRINKER_REGISTERING ((struct shrinker *)~0UL) static DEFINE_IDR(shrinker_idr); -static int shrinker_nr_max; static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -340,8 +344,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); goto unlock; } - - shrinker_nr_max = id + 1; } shrinker->id = id; ret = 0; -- cgit v1.2.3-58-ga151 From 72673e861dd032ccaff533c0d9bb705d508017f7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:20 -0700 Subject: mm: vmscan: use kvfree_rcu instead of call_rcu Using kvfree_rcu() to free the old shrinker_maps instead of call_rcu(). We don't have to define a dedicated callback for call_rcu() anymore. Link: https://lkml.kernel.org/r/20210311190845.9708-6-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dff5112dfff7..aa99a835cf89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,11 +192,6 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } -static void free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - static int expand_one_shrinker_map(struct mem_cgroup *memcg, int size, int old_size) { @@ -220,7 +215,7 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset((void *)new->map + old_size, 0, size - old_size); rcu_assign_pointer(pn->shrinker_map, new); - call_rcu(&old->rcu, free_shrinker_map_rcu); + kvfree_rcu(old, rcu); } return 0; -- cgit v1.2.3-58-ga151 From e4262c4f51d6373447c9d89093f49ff6b1e607be Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:23 -0700 Subject: mm: memcontrol: rename shrinker_map to shrinker_info The following patch is going to add nr_deferred into shrinker_map, the change will make shrinker_map not only include map anymore, so rename it to "memcg_shrinker_info". And this should make the patch adding nr_deferred cleaner and readable and make review easier. Also remove the "memcg_" prefix. Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +++---- mm/memcontrol.c | 6 ++--- mm/vmscan.c | 58 +++++++++++++++++++++++----------------------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7dbd2c9bad32..6cd800fe9a67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,7 +117,7 @@ struct batched_lruvec_stat { * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; unsigned long map[]; }; @@ -145,7 +145,7 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter; - struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -1610,8 +1610,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -int alloc_shrinker_maps(struct mem_cgroup *memcg); -void free_shrinker_maps(struct mem_cgroup *memcg); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09fd17ba6de2..36f31d611dea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5118,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_info(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5186,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - free_shrinker_maps(memcg); + free_shrinker_info(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index aa99a835cf89..518084ce8757 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,16 +192,16 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } -static int expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int size, int old_size) { - struct memcg_shrinker_map *new, *old; + struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_map, true); + old = rcu_dereference_protected(pn->shrinker_info, true); /* Not yet online memcg */ if (!old) return 0; @@ -214,17 +214,17 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset(new->map, (int)0xff, old_size); memset((void *)new->map + old_size, 0, size - old_size); - rcu_assign_pointer(pn->shrinker_map, new); + rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } -void free_shrinker_maps(struct mem_cgroup *memcg) +void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid; if (mem_cgroup_is_root(memcg)) @@ -232,15 +232,15 @@ void free_shrinker_maps(struct mem_cgroup *memcg) for_each_node(nid) { pn = memcg->nodeinfo[nid]; - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); } } -int alloc_shrinker_maps(struct mem_cgroup *memcg) +int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid, size, ret = 0; if (mem_cgroup_is_root(memcg)) @@ -249,20 +249,20 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) down_write(&shrinker_rwsem); size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - free_shrinker_maps(memcg); + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); ret = -ENOMEM; break; } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; } -static int expand_shrinker_maps(int new_id) +static int expand_shrinker_info(int new_id) { int size, old_size, ret = 0; int new_nr_max = new_id + 1; @@ -282,7 +282,7 @@ static int expand_shrinker_maps(int new_id) do { if (mem_cgroup_is_root(memcg)) continue; - ret = expand_one_shrinker_map(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; @@ -298,13 +298,13 @@ out: void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); + set_bit(shrinker_id, info->map); rcu_read_unlock(); } } @@ -335,7 +335,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock; if (id >= shrinker_nr_max) { - if (expand_shrinker_maps(id)) { + if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -665,7 +665,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; unsigned long ret, freed = 0; int i; @@ -675,12 +675,12 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, - true); - if (unlikely(!map)) + info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + true); + if (unlikely(!info)) goto unlock; - for_each_set_bit(i, map->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -691,7 +691,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { if (!shrinker) - clear_bit(i, map->map); + clear_bit(i, info->map); continue; } @@ -702,7 +702,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { - clear_bit(i, map->map); + clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in -- cgit v1.2.3-58-ga151 From 468ab8437a97a953895856c3709e48b3067da13c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:26 -0700 Subject: mm: vmscan: add shrinker_info_protected() helper The shrinker_info is dereferenced in a couple of places via rcu_dereference_protected with different calling conventions, for example, using mem_cgroup_nodeinfo helper or dereferencing memcg->nodeinfo[nid]->shrinker_info. And the later patch will add more dereference places. So extract the dereference into a helper to make the code more readable. No functional change. [akpm@linux-foundation.org: retain rcu_dereference_protected() in free_shrinker_info(), per Hugh] Link: https://lkml.kernel.org/r/20210311190845.9708-8-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 518084ce8757..400f4a657b27 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,6 +192,13 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + static int expand_one_shrinker_info(struct mem_cgroup *memcg, int size, int old_size) { @@ -201,7 +208,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, for_each_node(nid) { pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_info, true); + old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; @@ -675,8 +682,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - true); + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; -- cgit v1.2.3-58-ga151 From 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:29 -0700 Subject: mm: vmscan: use a new flag to indicate shrinker is registered Currently registered shrinker is indicated by non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred would always be NULL. This would prevent the shrinkers from unregistering correctly. Remove SHRINKER_REGISTERING since we could check if shrinker is registered successfully by the new flag. Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 ++++--- mm/vmscan.c | 40 +++++++++++++++------------------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/vmscan.c b/mm/vmscan.c index 400f4a657b27..d1601163d895 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -316,19 +316,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } } -/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) - static DEFINE_IDR(shrinker_idr); static int prealloc_memcg_shrinker(struct shrinker *shrinker) @@ -337,7 +324,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -360,9 +347,9 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -490,8 +477,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + } kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -501,10 +491,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); } @@ -524,13 +511,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -695,7 +685,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) clear_bit(i, info->map); continue; -- cgit v1.2.3-58-ga151 From 3c6f17e6c5d048c8029578c475dd037dd5db58af Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:33 -0700 Subject: mm: vmscan: add per memcg shrinker nr_deferred Currently the number of deferred objects are per shrinker, but some slabs, for example, vfs inode/dentry cache are per memcg, this would result in poor isolation among memcgs. The deferred objects typically are generated by __GFP_NOFS allocations, one memcg with excessive __GFP_NOFS allocations may blow up deferred objects, then other innocent memcgs may suffer from over shrink, excessive reclaim latency, etc. For example, two workloads run in memcgA and memcgB respectively, workload in B is vfs heavy workload. Workload in A generates excessive deferred objects, then B's vfs cache might be hit heavily (drop half of caches) by B's limit reclaim or global reclaim. We observed this hit in our production environment which was running vfs heavy workload shown as the below tracing log: <...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721 cache items 246404277 delta 31345 total_scan 123202138 <...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602 last shrinker return val 123186855 The vfs cache and page cache ratio was 10:1 on this machine, and half of caches were dropped. This also resulted in significant amount of page caches were dropped due to inodes eviction. Make nr_deferred per memcg for memcg aware shrinkers would solve the unfairness and bring better isolation. The following patch will add nr_deferred to parent memcg when memcg offline. To preserve nr_deferred when reparenting memcgs to root, root memcg needs shrinker_info allocated too. When memcg is not enabled (!CONFIG_MEMCG or memcg disabled), the shrinker's nr_deferred would be used. And non memcg aware shrinkers use shrinker's nr_deferred all the time. Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++--- mm/vmscan.c | 60 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cd800fe9a67..32bd62047238 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -114,12 +114,13 @@ struct batched_lruvec_stat { }; /* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; }; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d1601163d895..db668c4c78f4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -187,11 +187,17 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ static inline int shrinker_map_size(int nr_items) { return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { @@ -200,11 +206,13 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, } static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int size, int old_size) + int map_size, int defer_size, + int old_map_size, int old_defer_size) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; + int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -217,9 +225,16 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (!new) return -ENOMEM; - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -234,9 +249,6 @@ void free_shrinker_info(struct mem_cgroup *memcg) struct shrinker_info *info; int nid; - if (mem_cgroup_is_root(memcg)) - return; - for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); @@ -249,12 +261,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; + int map_size, defer_size = 0; down_write(&shrinker_rwsem); - size = shrinker_map_size(shrinker_nr_max); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); if (!info) { @@ -262,6 +274,8 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) ret = -ENOMEM; break; } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); @@ -269,15 +283,21 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) return ret; } +static inline bool need_expand(int nr_max) +{ + return round_up(nr_max, BITS_PER_LONG) > + round_up(shrinker_nr_max, BITS_PER_LONG); +} + static int expand_shrinker_info(int new_id) { - int size, old_size, ret = 0; + int ret = 0; int new_nr_max = new_id + 1; + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; struct mem_cgroup *memcg; - size = shrinker_map_size(new_nr_max); - old_size = shrinker_map_size(shrinker_nr_max); - if (size <= old_size) + if (!need_expand(new_nr_max)) goto out; if (!root_mem_cgroup) @@ -285,11 +305,15 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_rwsem); + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - if (mem_cgroup_is_root(memcg)) - continue; - ret = expand_one_shrinker_info(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; -- cgit v1.2.3-58-ga151 From 86750830468506dc27fa99c644534a7189be7975 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:36 -0700 Subject: mm: vmscan: use per memcg nr_deferred of shrinker Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred will be used in the following cases: 1. Non memcg aware shrinkers 2. !CONFIG_MEMCG 3. memcg is disabled by boot parameter Link: https://lkml.kernel.org/r/20210311190845.9708-11-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index db668c4c78f4..19b134d985c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,6 +376,24 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; @@ -414,6 +432,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -425,6 +455,39 @@ static bool writeback_throttling_sane(struct scan_control *sc) } #endif +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -561,14 +624,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long freeable; long nr; long new_nr; - int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -578,7 +637,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + nr = xchg_nr_deferred(shrinker, shrinkctl); total_scan = nr; if (shrinker->seeks) { @@ -669,14 +728,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, next_deferred = 0; /* * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. + * manner that handles concurrent updates. */ - if (next_deferred > 0) - new_nr = atomic_long_add_return(next_deferred, - &shrinker->nr_deferred[nid]); - else - new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; -- cgit v1.2.3-58-ga151 From 476b30a0949aec865dcc64d4c14f621b1a8afd12 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:39 -0700 Subject: mm: vmscan: don't need allocate shrinker->nr_deferred for memcg aware shrinkers Now nr_deferred is available on per memcg level for memcg aware shrinkers, so don't need allocate shrinker->nr_deferred for such shrinkers anymore. The prealloc_memcg_shrinker() would return -ENOSYS if !CONFIG_MEMCG or memcg is disabled by kernel command line, then shrinker's SHRINKER_MEMCG_AWARE flag would be cleared. This makes the implementation of this patch simpler. Link: https://lkml.kernel.org/r/20210311190845.9708-12-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Reviewed-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 19b134d985c5..9617c5ff3e98 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,6 +346,9 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM; + if (mem_cgroup_disabled()) + return -ENOSYS; + down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); @@ -425,7 +428,7 @@ static bool writeback_throttling_sane(struct scan_control *sc) #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { - return 0; + return -ENOSYS; } static void unregister_memcg_shrinker(struct shrinker *shrinker) @@ -537,8 +540,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, */ int prealloc_shrinker(struct shrinker *shrinker) { - unsigned int size = sizeof(*shrinker->nr_deferred); + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -546,28 +559,16 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - if (prealloc_memcg_shrinker(shrinker)) - goto free_deferred; - } - return 0; - -free_deferred: - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) - return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + return; } kfree(shrinker->nr_deferred); -- cgit v1.2.3-58-ga151 From a178015cde69981cdcd8f109c5abc98703fead62 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:42 -0700 Subject: mm: memcontrol: reparent nr_deferred when memcg offline Now shrinker's nr_deferred is per memcg for memcg aware shrinkers, add to parent's corresponding nr_deferred when memcg offline. Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + mm/memcontrol.c | 1 + mm/vmscan.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 32bd62047238..c193be760709 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1614,6 +1614,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); +void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36f31d611dea..3004afb6d090 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5154,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); drain_all_stock(memcg); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9617c5ff3e98..8c2d2003acbe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -397,6 +397,30 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < shrinker_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} + static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; -- cgit v1.2.3-58-ga151 From 18bb473e5031213ebfa9a622c0b0f8cdcb8a5371 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:45 -0700 Subject: mm: vmscan: shrink deferred objects proportional to priority The number of deferred objects might get windup to an absurd number, and it results in clamp of slab objects. It is undesirable for sustaining workingset. So shrink deferred objects proportional to priority and cap nr_deferred to twice of cache items. The idea is borrowed from Dave Chinner's patch: https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/ Tested with kernel build and vfs metadata heavy workload in our production environment, no regression is spotted so far. Link: https://lkml.kernel.org/r/20210311190845.9708-14-shy828301@gmail.com Signed-off-by: Yang Shi Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 46 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8c2d2003acbe..44c49acf10c4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -664,7 +664,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ nr = xchg_nr_deferred(shrinker, shrinkctl); - total_scan = nr; if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; @@ -678,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } + total_scan = nr >> priority; total_scan += delta; - if (total_scan < 0) { - pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", - shrinker->scan_objects, total_scan); - total_scan = freeable; - next_deferred = nr; - } else - next_deferred = total_scan; - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * freeable. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < freeable / 4) - total_scan = min(total_scan, freeable / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > freeable * 2) - total_scan = freeable * 2; + total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); @@ -747,10 +718,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, cond_resched(); } - if (next_deferred >= scanned) - next_deferred -= scanned; - else - next_deferred = 0; + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. -- cgit v1.2.3-58-ga151 From ef4984384172e93cc95e0e8cd102536d67e8a787 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 4 May 2021 18:36:48 -0700 Subject: mm/compaction: remove unused variable sysctl_compact_memory The sysctl_compact_memory is mostly unused in mm/compaction.c It just acts as a place holder for sysctl to store .data. But the .data itself is not needed here. So we can get ride of this variable completely and make .data as NULL. This will also eliminate the extern declaration from header file. No functionality is broken or changed this way. Link: https://lkml.kernel.org/r/1614852224-14671-1-git-send-email-pintu@codeaurora.org Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 - kernel/sysctl.c | 2 +- mm/compaction.c | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ed4070ed41ef..4221888bdcd6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order) } #ifdef CONFIG_COMPACTION -extern int sysctl_compact_memory; extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f91d327273c1..14edf84cc571 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_COMPACTION { .procname = "compact_memory", - .data = &sysctl_compact_memory, + .data = NULL, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/mm/compaction.c b/mm/compaction.c index 335862f1661c..027eb794e747 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2692,9 +2692,6 @@ static void compact_nodes(void) compact_node(nid); } -/* The written value is actually unused, all memory is compacted */ -int sysctl_compact_memory; - /* * Tunable for proactive compaction. It determines how * aggressively the kernel should compact memory in the -- cgit v1.2.3-58-ga151 From 06dac2f467fe9269a433aa5056dd2ee1d20475e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 4 May 2021 18:36:51 -0700 Subject: mm: compaction: update the COMPACT[STALL|FAIL] events properly By definition, COMPACT[STALL|FAIL] events needs to be counted when there is 'At least in one zone compaction wasn't deferred or skipped from the direct compaction'. And when compaction is skipped or deferred, COMPACT_SKIPPED will be returned but it will still go and update these compaction events which is wrong in the sense that COMPACT[STALL|FAIL] is counted without even trying the compaction. Correct this by skipping the counting of these events when COMPACT_SKIPPED is returned for compaction. This indirectly also avoid the unnecessary try into the get_page_from_freelist() when compaction is not even tried. There is a corner case where compaction is skipped but still count COMPACTSTALL event, which is that IRQ came and freed the page and the same is captured in capture_control. Link: https://lkml.kernel.org/r/1613151184-21213-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++++ mm/page_alloc.c | 2 ++ 2 files changed, 10 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 027eb794e747..1be7928af62b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2529,6 +2529,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, */ WRITE_ONCE(current->capture_control, NULL); *capture = READ_ONCE(capc.page); + /* + * Technically, it is also possible that compaction is skipped but + * the page is still captured out of luck(IRQ came and freed the page). + * Returning COMPACT_SUCCESS in such cases helps in properly accounting + * the COMPACT[STALL|FAIL] when compaction is skipped. + */ + if (*capture) + ret = COMPACT_SUCCESS; return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19cdd8a829dc..64d4aae2a78a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4204,6 +4204,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + if (*compact_result == COMPACT_SKIPPED) + return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall -- cgit v1.2.3-58-ga151 From d479960e44f27e0e52ba31b21740b703c538027c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:54 -0700 Subject: mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Chris Goldsworthy Acked-by: Michal Hocko Cc: John Dias Cc: Suren Baghdasaryan Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Oliver Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ include/linux/swap.h | 14 +++++++++++ mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 4 ++++ mm/migrate.c | 11 +++++---- mm/page_alloc.c | 2 ++ mm/swap.c | 64 ++++++++++++++++++++++++++++++++++++++++++------- 7 files changed, 86 insertions(+), 14 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3a389633b68f..9e4a2dc8622c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -46,6 +46,7 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern void migrate_prep(void); +extern void migrate_finish(void); extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); @@ -67,6 +68,7 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } +static inline int migrate_finish(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } static inline void migrate_page_states(struct page *newpage, struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42191da1bdc9..f69e0f67651d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); + +extern atomic_t lru_disable_count; + +static inline bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count); +} + +static inline void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} + +extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0cdbbfbc5757..729fba144c71 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1611,6 +1611,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * in a way that pages from isolated pageblock are left on pcplists. */ zone_pcp_disable(zone); + lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, @@ -1642,7 +1643,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } cond_resched(); - lru_add_drain_all(); ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { @@ -1687,6 +1687,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + lru_cache_enable(); zone_pcp_enable(zone); /* removal success */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cd0295567a04..3b95e169e97d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, break; } mmap_read_unlock(mm); + + migrate_finish(); if (err < 0) return err; return busy; @@ -1371,6 +1373,8 @@ up_out: mmap_write_unlock(mm); mpol_out: mpol_put(new); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_finish(); return err; } diff --git a/mm/migrate.c b/mm/migrate.c index 47df0df8f21a..5b09567dc293 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -66,11 +66,13 @@ void migrate_prep(void) { /* * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. */ - lru_add_drain_all(); + lru_cache_disable(); +} + +void migrate_finish(void) +{ + lru_cache_enable(); } /* Do the necessary work of migrate_prep but not if it involves other CPUs */ @@ -1838,6 +1840,7 @@ out_flush: if (err >= 0) err = err1; out: + migrate_finish(); return err; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 64d4aae2a78a..2cefb634e0d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8715,6 +8715,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (ret == -ENOMEM) break; } + + migrate_finish(); if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); diff --git a/mm/swap.c b/mm/swap.c index 31b844d4ed94..c94f55e7b649 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -235,6 +235,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) } } +/* return true if pagevec needs to drain */ +static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) +{ + bool ret = false; + + if (!pagevec_add(pvec, page) || PageCompound(page) || + lru_cache_disabled()) + ret = true; + + return ret; +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the @@ -252,7 +264,7 @@ void rotate_reclaimable_page(struct page *page) get_page(page); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -343,7 +355,7 @@ static void activate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -458,7 +470,7 @@ void lru_cache_add(struct page *page) get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } @@ -654,7 +666,7 @@ void deactivate_file_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } @@ -676,7 +688,7 @@ void deactivate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } @@ -698,7 +710,7 @@ void mark_page_lazyfree(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } @@ -735,7 +747,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ -void lru_add_drain_all(void) +inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number @@ -780,7 +792,7 @@ void lru_add_drain_all(void) * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ - if (unlikely(this_gen != lru_drain_gen)) + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* @@ -810,7 +822,8 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (force_all_cpus || + pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || @@ -828,6 +841,11 @@ void lru_add_drain_all(void) done: mutex_unlock(&lock); } + +void lru_add_drain_all(void) +{ + __lru_add_drain_all(false); +} #else void lru_add_drain_all(void) { @@ -835,6 +853,34 @@ void lru_add_drain_all(void) } #endif /* CONFIG_SMP */ +atomic_t lru_disable_count = ATOMIC_INIT(0); + +/* + * lru_cache_disable() needs to be called before we start compiling + * a list of pages to be migrated using isolate_lru_page(). + * It drains pages on LRU cache and then disable on all cpus until + * lru_cache_enable is called. + * + * Must be paired with a call to lru_cache_enable(). + */ +void lru_cache_disable(void) +{ + atomic_inc(&lru_disable_count); +#ifdef CONFIG_SMP + /* + * lru_add_drain_all in the force mode will schedule draining on + * all online CPUs so any calls of lru_cache_disabled wrapped by + * local_lock or preemption disabled would be ordered by that. + * The atomic operation doesn't need to have stronger ordering + * requirements because that is enforeced by the scheduling + * guarantees. + */ + __lru_add_drain_all(true); +#else + lru_add_drain(); +#endif +} + /** * release_pages - batched put_page() * @pages: array of pages to release -- cgit v1.2.3-58-ga151 From 361a2a229fa31ab7f2b236b5946e434964d00762 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:57 -0700 Subject: mm: replace migrate_[prep|finish] with lru_cache_[disable|enable] Currently, migrate_[prep|finish] is merely a wrapper of lru_cache_[disable|enable]. There is not much to gain from having additional abstraction. Use lru_cache_[disable|enable] instead of migrate_[prep|finish], which would be more descriptive. note: migrate_prep_local in compaction.c changed into lru_add_drain to avoid CPU schedule cost with involving many other CPUs to keep old behavior. Link: https://lkml.kernel.org/r/20210319175127.886124-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Chris Goldsworthy Cc: John Dias Cc: Matthew Wilcox Cc: Oliver Sang Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 ------- mm/compaction.c | 3 ++- mm/mempolicy.c | 8 ++++---- mm/migrate.c | 28 ++-------------------------- mm/page_alloc.c | 4 ++-- 5 files changed, 10 insertions(+), 40 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9e4a2dc8622c..6155d97ec76c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,9 +45,6 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern void migrate_prep(void); -extern void migrate_finish(void); -extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -67,10 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline int migrate_prep(void) { return -ENOSYS; } -static inline int migrate_finish(void) { return -ENOSYS; } -static inline int migrate_prep_local(void) { return -ENOSYS; } - static inline void migrate_page_states(struct page *newpage, struct page *page) { } diff --git a/mm/compaction.c b/mm/compaction.c index 1be7928af62b..598dffbd5c8e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2354,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); - migrate_prep_local(); + /* lru_add_drain_all could be expensive with involving other CPUs */ + lru_add_drain(); while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3b95e169e97d..c0343c742bed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err = 0; nodemask_t tmp; - migrate_prep(); + lru_cache_disable(); mmap_read_lock(mm); @@ -1209,7 +1209,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } mmap_read_unlock(mm); - migrate_finish(); + lru_cache_enable(); if (err < 0) return err; return busy; @@ -1325,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - migrate_prep(); + lru_cache_disable(); } { NODEMASK_SCRATCH(scratch); @@ -1374,7 +1374,7 @@ up_out: mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_finish(); + lru_cache_enable(); return err; } diff --git a/mm/migrate.c b/mm/migrate.c index 5b09567dc293..b5fbeb4bf49a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -57,30 +57,6 @@ #include "internal.h" -/* - * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is - * undesirable, use migrate_prep_local() - */ -void migrate_prep(void) -{ - /* - * Clear the LRU lists so pages can be isolated. - */ - lru_cache_disable(); -} - -void migrate_finish(void) -{ - lru_cache_enable(); -} - -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ -void migrate_prep_local(void) -{ - lru_add_drain(); -} - int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -1771,7 +1747,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, int start, i; int err = 0, err1; - migrate_prep(); + lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; @@ -1840,7 +1816,7 @@ out_flush: if (err >= 0) err = err1; out: - migrate_finish(); + lru_cache_enable(); return err; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2cefb634e0d6..80754fc1f1ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8681,7 +8681,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - migrate_prep(); + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8716,7 +8716,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - migrate_finish(); + lru_cache_enable(); if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); -- cgit v1.2.3-58-ga151 From 8cc621d2f45ddd3dc664024a647ee7adf48d79a5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:00 -0700 Subject: mm: fs: invalidate BH LRU during page migration Pages containing buffer_heads that are in one of the per-CPU buffer_head LRU caches will be pinned and thus cannot be migrated. This can prevent CMA allocations from succeeding, which are often used on platforms with co-processors (such as a DSP) that can only use physically contiguous memory. It can also prevent memory hot-unplugging from succeeding, which involves migrating at least MIN_MEMORY_BLOCK_SIZE bytes of memory, which ranges from 8 MiB to 1 GiB based on the architecture in use. Correspondingly, invalidate the BH LRU caches before a migration starts and stop any buffer_head from being cached in the LRU caches, until migration has finished. Link: https://lkml.kernel.org/r/20210319175127.886124-3-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Chris Goldsworthy Reported-by: Laura Abbott Tested-by: Oliver Sang Cc: David Hildenbrand Cc: John Dias Cc: Matthew Wilcox Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 36 ++++++++++++++++++++++++++++++------ include/linux/buffer_head.h | 4 ++++ mm/swap.c | 5 ++++- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..e9872d0dcbf1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1264,6 +1264,15 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + /* + * the refcount of buffer_head in bh_lru prevents dropping the + * attached page(i.e., try_to_free_buffers) so it could cause + * failing page migration. + * Skip putting upcoming bh into bh_lru until migration is done. + */ + if (lru_cache_disabled()) + return; + bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); @@ -1404,6 +1413,15 @@ __bread_gfp(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(__bread_gfp); +static void __invalidate_bh_lrus(struct bh_lru *b) +{ + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq @@ -1412,16 +1430,12 @@ EXPORT_SYMBOL(__bread_gfp); static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } + __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } -static bool has_bh_in_lru(int cpu, void *dummy) +bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; @@ -1440,6 +1454,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); +void invalidate_bh_lrus_cpu(int cpu) +{ + struct bh_lru *b; + + bh_lru_lock(); + b = per_cpu_ptr(&bh_lrus, cpu); + __invalidate_bh_lrus(b); + bh_lru_unlock(); +} + void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6b47f94378c5..e7e99da31349 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(int cpu); +bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ diff --git a/mm/swap.c b/mm/swap.c index c94f55e7b649..a75a8265302b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" @@ -641,6 +642,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + invalidate_bh_lrus_cpu(cpu); } /** @@ -828,7 +830,8 @@ inline void __lru_add_drain_all(bool force_all_cpus) pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu)) { + need_activate_page_drain(cpu) || + has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); -- cgit v1.2.3-58-ga151 From 606a6f71a25accfc960a5063c23717ff07aa43a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:04 -0700 Subject: mm/migrate.c: make putback_movable_page() static Patch series "Cleanup and fixup for mm/migrate.c", v3. This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and rc != MIGRATEPAGE_SUCCESS check. Also use helper function to remove some duplicated codes. What's more, this fixes potential deadlock in NUMA balancing shared exec THP case and so on. More details can be found in the respective changelogs. This patch (of 5): The putback_movable_page() is just called by putback_movable_pages() and we know the page is locked and both PageMovable() and PageIsolated() is checked right before calling putback_movable_page(). So we make it static and remove all the 3 VM_BUG_ON_PAGE(). Link: https://lkml.kernel.org/r/20210325131524.48181-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325131524.48181-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Jerome Glisse Cc: Rafael Aquini Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 - mm/migrate.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6155d97ec76c..175ef15ae9e8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -43,7 +43,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void putback_movable_page(struct page *page); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); diff --git a/mm/migrate.c b/mm/migrate.c index b5fbeb4bf49a..1e1f8324cefe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -118,15 +118,10 @@ out: return -EBUSY; } -/* It should be called on page which is PG_movable */ -void putback_movable_page(struct page *page) +static void putback_movable_page(struct page *page) { struct address_space *mapping; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - mapping = page_mapping(page); mapping->a_ops->putback_page(page); __ClearPageIsolated(page); -- cgit v1.2.3-58-ga151 From a04840c6841bb266c38f51adc87325308ab8d575 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:07 -0700 Subject: mm/migrate.c: remove unnecessary rc != MIGRATEPAGE_SUCCESS check in 'else' case It's guaranteed that in the 'else' case of the rc == MIGRATEPAGE_SUCCESS check, rc does not equal to MIGRATEPAGE_SUCCESS. Remove this unnecessary check. Link: https://lkml.kernel.org/r/20210325131524.48181-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1e1f8324cefe..896e9cd49ecc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1348,7 +1348,7 @@ out_unlock: out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); - else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + else if (rc != -EAGAIN) list_move_tail(&hpage->lru, ret); /* -- cgit v1.2.3-58-ga151 From 34f5e9b9d1990d286199084efa752530ee3d8297 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:10 -0700 Subject: mm/migrate.c: fix potential indeterminate pte entry in migrate_vma_insert_page() If the zone device page does not belong to un-addressable device memory, the variable entry will be uninitialized and lead to indeterminate pte entry ultimately. Fix this unexpected case and warn about it. Link: https://lkml.kernel.org/r/20210325131524.48181-4-linmiaohe@huawei.com Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 896e9cd49ecc..c40075d48ef9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2947,6 +2947,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } } else { entry = mk_pte(page, vma->vm_page_prot); -- cgit v1.2.3-58-ga151 From 843e1be108b9130e5ec5a78a14f77dc237c83e1e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:13 -0700 Subject: mm/migrate.c: use helper migrate_vma_collect_skip() in migrate_vma_collect_hole() It's more recommended to use helper function migrate_vma_collect_skip() to skip the unexpected case and it also helps remove some duplicated codes. Move migrate_vma_collect_skip() above migrate_vma_collect_hole() to avoid compiler warning. Link: https://lkml.kernel.org/r/20210325131524.48181-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c40075d48ef9..6876696ab302 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2290,44 +2290,38 @@ out: #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_hole(unsigned long start, +static int migrate_vma_collect_skip(unsigned long start, unsigned long end, - __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; - /* Only allow populating anonymous memory. */ - if (!vma_is_anonymous(walk->vma)) { - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = 0; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - } - return 0; - } - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; + migrate->src[migrate->npages++] = 0; } return 0; } -static int migrate_vma_collect_skip(unsigned long start, +static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; + migrate->npages++; + migrate->cpages++; } return 0; -- cgit v1.2.3-58-ga151 From 7ee820ee72388279a37077f418e32643a298243a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:16 -0700 Subject: Revert "mm: migrate: skip shared exec THP for NUMA balancing" This reverts commit c77c5cbafe549eb330e8909861a3e16cbda2c848. Since commit c77c5cbafe54 ("mm: migrate: skip shared exec THP for NUMA balancing"), the NUMA balancing would skip shared exec transhuge page. But this enhancement is not suitable for transhuge page. Because it's required that page_mapcount() must be 1 due to no migration pte dance is done here. On the other hand, the shared exec transhuge page will leave the migrate_misplaced_page() with pte entry untouched and page locked. Thus pagefault for NUMA will be triggered again and deadlock occurs when we start waiting for the page lock held by ourselves. Yang Shi said: "Thanks for catching this. By relooking the code I think the other important reason for removing this is migrate_misplaced_transhuge_page() actually can't see shared exec file THP at all since page_lock_anon_vma_read() is called before and if page is not anonymous page it will just restore the PMD without migrating anything. The pages for private mapped file vma may be anonymous pages due to COW but they can't be THP so it won't trigger THP numa fault at all. I think this is why no bug was reported. I overlooked this in the first place." Link: https://lkml.kernel.org/r/20210325131524.48181-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jerome Glisse Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6876696ab302..30c65c2be30b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2084,17 +2084,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -static inline bool is_shared_exec_page(struct vm_area_struct *vma, - struct page *page) -{ - if (page_mapcount(page) != 1 && - (page_is_file_lru(page) || vma_is_shmem(vma)) && - (vma->vm_flags & VM_EXEC)) - return true; - - return false; -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -2112,7 +2101,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (is_shared_exec_page(vma, page)) + if (page_mapcount(page) != 1 && page_is_file_lru(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -2167,9 +2157,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; - if (is_shared_exec_page(vma, page)) - goto out; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2281,7 +2268,6 @@ out_fail: out_unlock: unlock_page(page); -out: put_page(page); return 0; } -- cgit v1.2.3-58-ga151 From bbb269206f3c914d4f23e023de4ec020abea6d1b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:19 -0700 Subject: mm: vmstat: add cma statistics Since CMA is used more widely, it's worth to have CMA allocation statistics into vmstat. With it, we could know how agressively system uses cma allocation and how often it fails. Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Cc: John Dias Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ mm/cma.c | 12 +++++++++--- mm/vmstat.c | 4 ++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..21d7c7f72f1c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, +#endif +#ifdef CONFIG_CMA + CMA_ALLOC_SUCCESS, + CMA_ALLOC_FAIL, #endif UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ diff --git a/mm/cma.c b/mm/cma.c index acd6991f77a0..b44a71eb3174 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -435,13 +435,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) - return NULL; + goto out; pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, count, align); if (!count) - return NULL; + goto out; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -449,7 +449,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - return NULL; + goto out; for (;;) { spin_lock_irq(&cma->lock); @@ -506,6 +506,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } pr_debug("%s(): returned %p\n", __func__, page); +out: + if (page) + count_vm_event(CMA_ALLOC_SUCCESS); + else + count_vm_event(CMA_ALLOC_FAIL); + return page; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 74b2c374b86c..49a8456ec079 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1312,6 +1312,10 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", +#endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", #endif "unevictable_pgs_culled", "unevictable_pgs_scanned", -- cgit v1.2.3-58-ga151 From 63f83b31f4f36d933e13bd8b9a25d6d9a0cf89dd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 4 May 2021 18:37:22 -0700 Subject: mm: cma: use pr_err_ratelimited for CMA warning If we did not reserve extra CMA memory, the log buffer can be easily filled up by CMA failure warning when the devices calling dmam_alloc_coherent() to alloc DMA memory. Thus we can use pr_err_ratelimited() instead to reduce the duplicate CMA warning. Link: https://lkml.kernel.org/r/ce2251ef49e1727a9a40531d1996660b05462bd2.1615279825.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: David Hildenbrand Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index b44a71eb3174..a862ff8c23ce 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -500,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, cma->name, count, ret); + pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } -- cgit v1.2.3-58-ga151 From 7bc1aec5e28765ad18742824b3b972471807a632 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 4 May 2021 18:37:25 -0700 Subject: mm: cma: add trace events for CMA alloc perf testing Add cma and migrate trace events to enable CMA allocation performance to be measured via ftrace. [georgi.djakov@linaro.org: add the CMA instance name to the cma_alloc_start trace event] Link: https://lkml.kernel.org/r/20210326155414.25006-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210324160740.15901-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 42 +++++++++++++++++++++++++++++++++++++++++- include/trace/events/migrate.h | 22 ++++++++++++++++++++++ mm/cma.c | 4 ++++ mm/migrate.c | 2 ++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a8829270..be1525a10457 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,7 +8,7 @@ #include #include -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), @@ -61,6 +61,46 @@ TRACE_EVENT(cma_release, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned int count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%u align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d434398d64d..f2c990603888 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -81,6 +81,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/cma.c b/mm/cma.c index a862ff8c23ce..d1a4c06b5039 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -443,6 +443,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, if (!count) goto out; + trace_cma_alloc_start(cma->name, count, align); + mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -483,6 +485,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } diff --git a/mm/migrate.c b/mm/migrate.c index 30c65c2be30b..6b37d00890ca 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1418,6 +1418,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int rc, nr_subpages; LIST_HEAD(ret_pages); + trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; -- cgit v1.2.3-58-ga151 From 43ca106fa8ec7d684776fbe561214d3b2b7cb9cb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:28 -0700 Subject: mm: cma: support sysfs Since CMA is getting used more widely, it's more important to keep monitoring CMA statistics for system health since it's directly related to user experience. This patch introduces sysfs statistics for CMA, in order to provide some basic monitoring of the CMA allocator. * the number of CMA page successful allocations * the number of CMA page allocation failures These two values allow the user to calcuate the allocation failure rate for each CMA area. e.g.) /sys/kernel/mm/cma/WIFI/alloc_pages_[success|fail] /sys/kernel/mm/cma/SENSOR/alloc_pages_[success|fail] /sys/kernel/mm/cma/BLUETOOTH/alloc_pages_[success|fail] The cma_stat was intentionally allocated by dynamic allocation to harmonize with kobject lifetime management. https://lore.kernel.org/linux-mm/YCOAmXqt6dZkCQYs@kroah.com/ Link: https://lkml.kernel.org/r/20210324230759.2213957-1-minchan@kernel.org Link: https://lore.kernel.org/linux-mm/20210316100433.17665-1-colin.king@canonical.com/ Signed-off-by: Minchan Kim Signed-off-by: Colin Ian King Tested-by: Dmitry Osipenko Reviewed-by: Dmitry Osipenko Reviewed-by: Greg Kroah-Hartman Reviewed-by: John Hubbard Tested-by: Anders Roxell Cc: Suren Baghdasaryan Cc: John Dias Cc: Matthew Wilcox (Oracle) Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-mm-cma | 25 ++++++ mm/Kconfig | 7 ++ mm/Makefile | 1 + mm/cma.c | 8 +- mm/cma.h | 23 ++++++ mm/cma_sysfs.c | 112 ++++++++++++++++++++++++++ 6 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma create mode 100644 mm/cma_sysfs.c diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma new file mode 100644 index 000000000000..02b2bb60c296 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/cma/ +Date: Feb 2021 +Contact: Minchan Kim +Description: + /sys/kernel/mm/cma/ contains a subdirectory for each CMA + heap name (also sometimes called CMA areas). + + Each CMA heap subdirectory (that is, each + /sys/kernel/mm/cma/ directory) contains the + following items: + + alloc_pages_success + alloc_pages_fail + +What: /sys/kernel/mm/cma//alloc_pages_success +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API succeeded to allocate + +What: /sys/kernel/mm/cma//alloc_pages_fail +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API failed to allocate diff --git a/mm/Kconfig b/mm/Kconfig index f4ceeaedb966..5f7d7b3d8cc2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -518,6 +518,13 @@ config CMA_DEBUGFS help Turns on the DebugFS interface for CMA. +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA diff --git a/mm/Makefile b/mm/Makefile index c0135e385984..809033d77710 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -109,6 +109,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/cma.c b/mm/cma.c index d1a4c06b5039..2380f2571eb5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -511,10 +511,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): returned %p\n", __func__, page); out: - if (page) + if (page) { count_vm_event(CMA_ALLOC_SUCCESS); - else + cma_sysfs_account_success_pages(cma, count); + } else { count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } return page; } diff --git a/mm/cma.h b/mm/cma.h index c46c5050aaa9..2c775877eae2 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -3,6 +3,12 @@ #define __MM_CMA_H__ #include +#include + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; struct cma { unsigned long base_pfn; @@ -16,6 +22,14 @@ struct cma { struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; @@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) return cma->count >> cma->order_per_bit; } +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; +#endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 000000000000..eb2f39caff59 --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim + */ + +#include +#include +#include + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); -- cgit v1.2.3-58-ga151 From 3aab8ae7aace3388da319a233edf48f0f5d26a44 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:31 -0700 Subject: mm: cma: add the CMA instance name to cma trace events There were missing places to add cma instance name. To identify each CMA instance, let's add the name for every cma trace. This patch also changes the existing cma_trace_alloc to cma_trace_finish since we have cma_alloc_start[1]. [1] https://lore.kernel.org/linux-mm/20210324160740.15901-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210330220237.748899-1-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Liam Mark Cc: Georgi Djakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 28 +++++++++++++++++----------- mm/cma.c | 7 ++++--- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index be1525a10457..5cf385ae7c08 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -10,12 +10,13 @@ DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) @@ -23,13 +24,15 @@ DECLARE_EVENT_CLASS(cma_alloc_class, ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,24 +41,27 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count) @@ -85,20 +91,20 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc, +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); #endif /* _TRACE_CMA_H */ diff --git a/mm/cma.c b/mm/cma.c index 2380f2571eb5..cdad8c4de921 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -486,12 +486,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); - trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } - trace_cma_alloc(pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align); /* * CMA can allocate multiple page blocks, which results in different @@ -551,7 +552,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); - trace_cma_release(pfn, pages, count); + trace_cma_release(cma->name, pfn, pages, count); return true; } -- cgit v1.2.3-58-ga151 From 78fa51503fdbe463c96eef4c3cf69ca54032647a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:34 -0700 Subject: mm: use proper type for cma_[alloc|release] size_t in cma_alloc is confusing since it makes people think it's byte count, not pages. Change it to unsigned long[1]. The unsigned int in cma_release is also not right so change it. Since we have unsigned long in cma_release, free_contig_range should also respect it. [1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/ Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 4 ++-- include/linux/gfp.h | 2 +- include/trace/events/cma.h | 22 +++++++++++----------- mm/cma.c | 17 +++++++++-------- mm/page_alloc.c | 6 +++--- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c8a762..53fd8c3cdbd0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 26f4d907254a..8a5f6c3d7dba 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5cf385ae7c08..c3d354702cb0 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -11,7 +11,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), @@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count), + unsigned long count), TP_ARGS(name, pfn, page, count), @@ -50,7 +50,7 @@ TRACE_EVENT(cma_release, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, @@ -69,13 +69,13 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start, __entry->align = align; ), - TP_printk("name=%s count=%u align=%u", + TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) @@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start, DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); @@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); diff --git a/mm/cma.c b/mm/cma.c index cdad8c4de921..995e15480937 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -79,7 +79,7 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned int count) + unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; @@ -423,21 +423,21 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - bool no_warn) +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - size_t i; + unsigned long i; struct page *page = NULL; int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, count, align); if (!count) @@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); } @@ -534,14 +534,15 @@ out: * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; - pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80754fc1f1ff..d12299c08b95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8973,9 +8973,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8983,7 +8983,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); -- cgit v1.2.3-58-ga151 From a08e1e11c90f3e6020963b3ad097680768bc8567 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:37 -0700 Subject: ksm: remove redundant VM_BUG_ON_PAGE() on stable_tree_search() Patch series "Cleanup and fixup for ksm". This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and dedicated macro KSM_FLAG_MASK. Also this fixes potential missing rmap_item for stable_node which would result in failed rmap_walk_ksm(). More details can be found in the respective changelogs. This patch (of 4): The same VM_BUG_ON_PAGE() check is already done in the callee. Remove these extra caller one to simplify code slightly. Link: https://lkml.kernel.org/r/20210330140228.45635-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210330140228.45635-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9694ee2c71de..7b6bc1036596 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1771,7 +1771,6 @@ chain_append: * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { - VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, @@ -1785,7 +1784,6 @@ chain_append: * of the current nid for this page * content. */ - VM_BUG_ON(!is_stable_node_chain(stable_node)); VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); -- cgit v1.2.3-58-ga151 From 3e96b6a2e9ad929a3230a22f4d64a74671a0720b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:39 -0700 Subject: ksm: use GET_KSM_PAGE_NOLOCK to get ksm page in remove_rmap_item_from_tree() It's unnecessary to lock the page when get ksm page if we're going to remove the rmap item as page migration is irrelevant in this case. Use GET_KSM_PAGE_NOLOCK instead to save some page lock cycles. Link: https://lkml.kernel.org/r/20210330140228.45635-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7b6bc1036596..31dcbd4d3ac9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -778,12 +778,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); + page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); if (!page) goto out; hlist_del(&rmap_item->hlist); - unlock_page(page); put_page(page); if (!hlist_empty(&stable_node->hlist)) -- cgit v1.2.3-58-ga151 From cd7fae26024690c772ec66719735c58a12034088 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:42 -0700 Subject: ksm: remove dedicated macro KSM_FLAG_MASK The macro KSM_FLAG_MASK is used in rmap_walk_ksm() only. So we can replace ~KSM_FLAG_MASK with PAGE_MASK to remove this dedicated macro and make code more consistent because PAGE_MASK is used elsewhere in this file. Link: https://lkml.kernel.org/r/20210330140228.45635-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 31dcbd4d3ac9..4b25f642f4ec 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -215,8 +215,6 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ -#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) - /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -2631,7 +2629,7 @@ again: vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & ~KSM_FLAG_MASK; + addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; -- cgit v1.2.3-58-ga151 From c89a384e2551c692a9fe60d093fd7080f50afc51 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:45 -0700 Subject: ksm: fix potential missing rmap_item for stable_node When removing rmap_item from stable tree, STABLE_FLAG of rmap_item is cleared with head reserved. So the following scenario might happen: For ksm page with rmap_item1: cmp_and_merge_page stable_node->head = &migrate_nodes; remove_rmap_item_from_tree, but head still equal to stable_node; try_to_merge_with_ksm_page failed; return; For the same ksm page with rmap_item2, stable node migration succeed this time. The stable_node->head does not equal to migrate_nodes now. For ksm page with rmap_item1 again: cmp_and_merge_page stable_node->head != &migrate_nodes && rmap_item->head == stable_node return; We would miss the rmap_item for stable_node and might result in failed rmap_walk_ksm(). Fix this by set rmap_item->head to NULL when rmap_item is removed from stable tree. Link: https://lkml.kernel.org/r/20210330140228.45635-5-linmiaohe@huawei.com Fixes: 4146d2d673e8 ("ksm: make !merge_across_nodes migration safe") Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/ksm.c b/mm/ksm.c index 4b25f642f4ec..e2ba4ebc2500 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -791,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { -- cgit v1.2.3-58-ga151 From 420be4edefe503f8dbd6ab914b11a57a0d339660 Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Tue, 4 May 2021 18:37:48 -0700 Subject: mm/ksm: remove unused parameter from remove_trailing_rmap_items() Since commit 6514d511dbe5 ("ksm: singly-linked rmap_list") was merged, remove_trailing_rmap_items() doesn't use the 'mm_slot' parameter. So remove it, and update caller accordingly. Link: https://lkml.kernel.org/r/20210330121320.1693474-1-cy.fan@huawei.com Signed-off-by: Chengyang Fan Reviewed-by: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index e2ba4ebc2500..b321a67ebaa9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -815,8 +815,7 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct rmap_item **rmap_list) { while (*rmap_list) { struct rmap_item *rmap_item = *rmap_list; @@ -987,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); @@ -2333,7 +2332,7 @@ next_mm: * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_list); + remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, -- cgit v1.2.3-58-ga151 From 76d8cc3c8f45cc597726616f11db4180f7e21ce0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:51 -0700 Subject: mm: restore node stat checking in /proc/sys/vm/stat_refresh In v4.7 commit 52b6f46bc163 ("mm: /proc/sys/vm/stat_refresh to force vmstat update") introduced vmstat_refresh(), with its vmstat underflow checking; then in v4.8 commit 75ef71840539 ("mm, vmstat: add infrastructure for per-node vmstats") split NR_VM_NODE_STAT_ITEMS out of NR_VM_ZONE_STAT_ITEMS without updating vmstat_refresh(): so it has been missing out much of the vmstat underflow checking ever since. Reinstate it. Thanks to Roman Gushchin for tangentially pointing this out. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251502240.13363@eggly.anvils Signed-off-by: Hugh Dickins Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 49a8456ec079..14d0ba4fd46b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1875,6 +1875,14 @@ int vmstat_refresh(struct ctl_table *table, int write, } } #endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_node_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, node_stat_name(i), val); + err = -EINVAL; + } + } if (err) return err; if (write) -- cgit v1.2.3-58-ga151 From 6d99a4c029c01cd7d075f7f9fa3b8b620e49a9f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:54 -0700 Subject: mm: no more EINVAL from /proc/sys/vm/stat_refresh EINVAL was good for drawing the refresher's attention to a warning in dmesg, but became very tiresome when running test suites scripted with "set -e": an underflow from a bug in one feature would cause unrelated tests much later to fail, just because their /proc/sys/vm/stat_refresh touch failed with that error. Stop doing that. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251510410.13363@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 14d0ba4fd46b..1fb91d5a2b60 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1862,7 +1862,6 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, zone_stat_name(i), val); - err = -EINVAL; } } #ifdef CONFIG_NUMA @@ -1871,7 +1870,6 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, numa_stat_name(i), val); - err = -EINVAL; } } #endif @@ -1880,11 +1878,8 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, node_stat_name(i), val); - err = -EINVAL; } } - if (err) - return err; if (write) *ppos += *lenp; else -- cgit v1.2.3-58-ga151 From 75083aae114c2738af28eef2fb0c2515e818885a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:57 -0700 Subject: mm: /proc/sys/vm/stat_refresh skip checking known negative stats vmstat_refresh() can occasionally catch nr_zone_write_pending and nr_writeback when they are transiently negative. The reason is partly that the interrupt which decrements them in test_clear_page_writeback() can come in before __test_set_page_writeback() got to increment them; but transient negatives are still seen even when that is prevented, and I am not yet certain why (but see Roman's note below). Those stats are not buggy, they have never been seen to drift away from 0 permanently: so just avoid the annoyance of showing a warning on them. Similarly avoid showing a warning on nr_free_cma: CMA users have seen that one reported negative from /proc/sys/vm/stat_refresh too, but it does drift away permanently: I believe that's because its incrementation and decrementation are decided by page migratetype, but the migratetype of a pageblock is not guaranteed to be constant. Roman Gushchin points out: "For performance reasons, vmstat counters are incremented and decremented using per-cpu batches. vmstat_refresh() flushes the per-cpu batches on all CPUs, to get values as accurate as possible; but this method is not atomic, so the resulting value is not always precise. As a consequence, for those counters whose actual value is close to 0, a small negative value may occasionally be reported. If the value is small and the state is transient, it is not an indication of an error" Link: https://lore.kernel.org/linux-mm/20200714173747.3315771-1-guro@fb.com/ Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2103012158540.7549@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Roman Gushchin Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1fb91d5a2b60..5437a7861089 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1858,6 +1858,14 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; + } val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", @@ -1874,6 +1882,13 @@ int vmstat_refresh(struct ctl_table *table, int write, } #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_WRITEBACK: + continue; + } val = atomic_long_read(&vm_node_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", -- cgit v1.2.3-58-ga151 From c675790972916d3722809fcc52c5c4f8421b2e5d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:38:00 -0700 Subject: mm: /proc/sys/vm/stat_refresh stop checking monotonic numa stats All of the VM NUMA stats are event counts, incremented never decremented: it is not very useful for vmstat_refresh() to check them throughout their first aeon, then warn on them throughout their next. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251514110.13363@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 5437a7861089..06cd78dc914d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1872,15 +1872,6 @@ int vmstat_refresh(struct ctl_table *table, int write, __func__, zone_stat_name(i), val); } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_numa_stat[i]); - if (val < 0) { - pr_warn("%s: %s %ld\n", - __func__, numa_stat_name(i), val); - } - } -#endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { /* * Skip checking stats known to go negative occasionally. -- cgit v1.2.3-58-ga151 From 575299ea18a8c0575d4c2ef6ad3fa4d41d529d1c Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 4 May 2021 18:38:03 -0700 Subject: x86/mm: track linear mapping split events To help with debugging the sluggishness caused by TLB miss/reload, we introduce monotonic hugepage [direct mapped] split event counts since system state: SYSTEM_RUNNING to be displayed as part of /proc/vmstat in x86 servers The lifetime split event information will be displayed at the bottom of /proc/vmstat .... swap_ra 0 swap_ra_hit 0 direct_map_level2_splits 94 direct_map_level3_splits 4 nr_unstable 0 .... One of the many lasting sources of direct hugepage splits is kernel tracing (kprobes, tracepoints). Note that the kernel's code segment [512 MB] points to the same physical addresses that have been already mapped in the kernel's direct mapping range. Source : Documentation/x86/x86_64/mm.rst When we enable kernel tracing, the kernel has to modify attributes/permissions of the text segment hugepages that are direct mapped causing them to split. Kernel's direct mapped hugepages do not coalesce back after split and remain in place for the remainder of the lifetime. An instance of direct page splits when we turn on dynamic kernel tracing .... cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 784 direct_map_level3_splits 12 bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }' cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 789 direct_map_level3_splits 12 .... Link: https://lkml.kernel.org/r/20210218235744.1040634-1-saravanand@fb.com Signed-off-by: Saravanan D Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat/set_memory.c | 8 ++++++++ include/linux/vm_event_item.h | 4 ++++ mm/vmstat.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 427980617557..156cd235659f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,12 @@ static void split_page_count(int level) return; direct_pages_count[level]--; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); + } direct_pages_count[level - 1] += PTRS_PER_PTE; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 21d7c7f72f1c..ae0dd1948c2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -124,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 06cd78dc914d..5ba118521ded 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1369,6 +1369,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_X86 + "direct_map_level2_splits", + "direct_map_level3_splits", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ -- cgit v1.2.3-58-ga151 From fce000b1bc08c64c0cff4bb705b3970bd6fc1e34 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 4 May 2021 18:38:06 -0700 Subject: mm/mmap.c: don't unlock VMAs in remap_file_pages() Since this call uses MAP_FIXED, do_mmap() will munlock the necessary range. There is also an error in the loop test expression which will evaluate as false and the loop body has never execute. Link: https://lkml.kernel.org/r/20210223235010.2296915-1-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Acked-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 347ef9b83bb5..c1b848fa7da6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3029,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) { - struct vm_area_struct *tmp; + if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - /* drop PG_Mlocked flag for over-mapped range */ - for (tmp = vma; tmp->vm_start >= start + size; - tmp = tmp->vm_next) { - /* - * Split pmd and munlock page on the border - * of the range. - */ - vma_adjust_trans_huge(tmp, start, start + size, 0); - - munlock_vma_pages_range(tmp, - max(tmp->vm_start, start), - min(tmp->vm_end, start + size)); - } - } - file = get_file(vma->vm_file); ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); -- cgit v1.2.3-58-ga151 From c2280be81de404e99f66c7249496b0355406ed94 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:09 -0700 Subject: mm: generalize ARCH_HAS_CACHE_LINE_SIZE Patch series "mm: some config cleanups", v2. This series contains config cleanup patches which reduces code duplication across platforms and also improves maintainability. There is no functional change intended with this series. This patch (of 6): ARCH_HAS_CACHE_LINE_SIZE config has duplicate definitions on platforms that subscribe it. Instead, just make it a generic option which can be selected on applicable platforms. This change reduces code duplication and makes it cleaner. Link: https://lkml.kernel.org/r/1617259448-22529-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1617259448-22529-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Vineet Gupta [arc] Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 4 +--- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bc8d6aecfbbd..fab05f7189c0 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL @@ -48,9 +49,6 @@ config ARC select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select SET_FS -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 04c69f606537..86dfaac79e49 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT @@ -1074,9 +1075,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1c350e8782ed..78c475d8c409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -316,9 +317,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/mm/Kconfig b/mm/Kconfig index 5f7d7b3d8cc2..8d6fb99aeded 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -772,6 +772,9 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +config ARCH_HAS_CACHE_LINE_SIZE + bool + config ARCH_HAS_PTE_DEVMAP bool -- cgit v1.2.3-58-ga151 From 855f9a8e87fe3912a1c00eb63f36880d1ad32e40 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:13 -0700 Subject: mm: generalize SYS_SUPPORTS_HUGETLBFS (rename as ARCH_SUPPORTS_HUGETLBFS) SYS_SUPPORTS_HUGETLBFS config has duplicate definitions on platforms that subscribe it. Instead, just make it a generic option which can be selected on applicable platforms. Also rename it as ARCH_SUPPORTS_HUGETLBFS instead. This reduces code duplication and makes it cleaner. Link: https://lkml.kernel.org/r/1617259448-22529-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Palmer Dabbelt [riscv] Acked-by: Michael Ellerman [powerpc] Cc: Russell King Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Albert Ou Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 5 +---- arch/arm64/Kconfig | 4 +--- arch/mips/Kconfig | 6 +----- arch/parisc/Kconfig | 5 +---- arch/powerpc/Kconfig | 3 --- arch/powerpc/platforms/Kconfig.cputype | 6 +++--- arch/riscv/Kconfig | 5 +---- arch/sh/Kconfig | 5 +---- fs/Kconfig | 5 ++++- 9 files changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 085c830d344b..13fa7ecf195c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -31,6 +31,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -1511,10 +1512,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config SYS_SUPPORTS_HUGETLBFS - def_bool y - depends on ARM_LPAE - config HAVE_ARCH_TRANSPARENT_HUGEPAGE def_bool y depends on ARM_LPAE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 86dfaac79e49..0fd5d373d7cf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN @@ -1072,9 +1073,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config SYS_SUPPORTS_HUGETLBFS - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 49a3c9cd1cb2..ed51970c08e7 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -19,6 +19,7 @@ config MIPS select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_SUPPORTS_HUGETLBFS if CPU_SUPPORTS_HUGEPAGES select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN @@ -1287,11 +1288,6 @@ config SYS_SUPPORTS_BIG_ENDIAN config SYS_SUPPORTS_LITTLE_ENDIAN bool -config SYS_SUPPORTS_HUGETLBFS - bool - depends on CPU_SUPPORTS_HUGEPAGES - default y - config MIPS_HUGE_TLB_SUPPORT def_bool HUGETLB_PAGE || TRANSPARENT_HUGEPAGE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index afc3b8d03572..bde9907bc5b2 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,6 +12,7 @@ config PARISC select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_NO_SG_CHAIN + select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE select DMA_OPS select RTC_CLASS @@ -138,10 +139,6 @@ config PGTABLE_LEVELS default 3 if 64BIT && PARISC_PAGE_SIZE_4KB default 2 -config SYS_SUPPORTS_HUGETLBFS - def_bool y if PA20 - - menu "Processor type and features" choice diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 965278498dd7..3a2b90e21e69 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -697,9 +697,6 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y depends on PPC_BOOK3S_64 -config SYS_SUPPORTS_HUGETLBFS - bool - config ILLEGAL_POINTER_VALUE hex # This is roughly half way between the top of user space and the bottom diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3ce907523b1e..cec1017813f8 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -40,8 +40,8 @@ config PPC_85xx config PPC_8xx bool "Freescale 8xx" + select ARCH_SUPPORTS_HUGETLBFS select FSL_SOC - select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK @@ -95,9 +95,9 @@ config PPC_BOOK3S_64 bool "Server processors" select PPC_FPU select PPC_HAVE_PMU_SUPPORT - select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE + select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK select PPC_MM_SLICES @@ -278,9 +278,9 @@ config FSL_BOOKE # this is for common code between PPC32 & PPC64 FSL BOOKE config PPC_FSL_BOOK3E bool + select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 select FSL_EMB_PERFMON select PPC_SMP_MUXED_IPI - select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 select PPC_DOORBELL default y if FSL_BOOKE diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4515a10c5d22..2a0fc12b8d45 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -30,6 +30,7 @@ config RISCV select ARCH_HAS_STRICT_KERNEL_RWX if MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT @@ -165,10 +166,6 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_SUPPORTS_UPROBES def_bool y -config SYS_SUPPORTS_HUGETLBFS - depends on MMU - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e798e55915c2..a54b0c5de37b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -101,9 +101,6 @@ config SYS_SUPPORTS_APM_EMULATION bool select ARCH_SUSPEND_POSSIBLE -config SYS_SUPPORTS_HUGETLBFS - bool - config SYS_SUPPORTS_SMP bool @@ -175,12 +172,12 @@ config CPU_SH3 config CPU_SH4 bool + select ARCH_SUPPORTS_HUGETLBFS if MMU select CPU_HAS_INTEVT select CPU_HAS_SR_RB select CPU_HAS_FPU if !CPU_SH4AL_DSP select SH_INTC select SYS_SUPPORTS_SH_TMU - select SYS_SUPPORTS_HUGETLBFS if MMU config CPU_SH4A bool diff --git a/fs/Kconfig b/fs/Kconfig index 97e7b77c9309..89a750d292ba 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -223,10 +223,13 @@ config TMPFS_INODE64 If unsure, say N. +config ARCH_SUPPORTS_HUGETLBFS + def_bool n + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - SYS_SUPPORTS_HUGETLBFS || BROKEN + ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read -- cgit v1.2.3-58-ga151 From 91024b3ce247213ee43103dffd629623537a569e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:17 -0700 Subject: mm: generalize ARCH_ENABLE_MEMORY_[HOTPLUG|HOTREMOVE] ARCH_ENABLE_MEMORY_[HOTPLUG|HOTREMOVE] configs have duplicate definitions on platforms that subscribe them. Instead, just make them generic options which can be selected on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Heiko Carstens [s390] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 8 ++------ arch/ia64/Kconfig | 8 ++------ arch/powerpc/Kconfig | 8 ++------ arch/s390/Kconfig | 8 ++------ arch/sh/Kconfig | 2 ++ arch/sh/mm/Kconfig | 8 -------- arch/x86/Kconfig | 10 ++-------- mm/Kconfig | 6 ++++++ 8 files changed, 18 insertions(+), 40 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0fd5d373d7cf..c0746b2d3fff 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,8 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -311,12 +313,6 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config SMP def_bool y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index fd2cfc65fdc9..279252e3e0f7 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -13,6 +13,8 @@ config IA64 select ARCH_MIGHT_HAVE_PC_SERIO select ACPI select ACPI_NUMA if NUMA + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_SUPPORTS_ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -246,12 +248,6 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config SCHED_SMT bool "SMT scheduler support" depends on SMP diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3a2b90e21e69..d4333049b813 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -118,6 +118,8 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -512,12 +514,6 @@ config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config PPC64_SUPPORTS_MEMORY_FAILURE bool "Add support for memory hwpoison" depends on PPC_BOOK3S_64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c1ff874e6c2e..f8b356550daa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -60,6 +60,8 @@ config S390 imply IMA_SECURE_AND_OR_TRUSTED_BOOT select ARCH_32BIT_USTAT_F_TINODE select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -626,12 +628,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y if SPARSEMEM - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a54b0c5de37b..68129537e350 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -2,6 +2,8 @@ config SUPERH def_bool y select ARCH_32BIT_OFF_T + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU + select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_BINFMT_FLAT if !MMU diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 77aa2f802d8d..d551a9cac41e 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -136,14 +136,6 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_SELECT_MEMORY_MODEL def_bool y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - depends on SPARSEMEM && MMU - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - depends on SPARSEMEM && MMU - config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78c475d8c409..306f76fea837 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,8 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -2427,14 +2429,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - depends on X86_64 || (X86_32 && HIGHMEM) - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - depends on MEMORY_HOTPLUG - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/mm/Kconfig b/mm/Kconfig index 8d6fb99aeded..fe4897c3c81b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -148,6 +148,9 @@ config MEMORY_ISOLATION config HAVE_BOOTMEM_INFO_NODE def_bool n +config ARCH_ENABLE_MEMORY_HOTPLUG + bool + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -176,6 +179,9 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE Say N here if you want the default policy to keep all hot-plugged memory blocks in 'offline' state. +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) -- cgit v1.2.3-58-ga151 From 1e866974a15be8921fb01f8c4efa93a5157ef690 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:21 -0700 Subject: mm: drop redundant ARCH_ENABLE_[HUGEPAGE|THP]_MIGRATION ARCH_ENABLE_[HUGEPAGE|THP]_MIGRATION configs have duplicate definitions on platforms that subscribe them. Drop these reduntant definitions and instead just select them appropriately. [akpm@linux-foundation.org: s/x86_64/X86_64/, per Oscar] Link: https://lkml.kernel.org/r/1617259448-22529-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 10 ++-------- arch/powerpc/platforms/Kconfig.cputype | 5 +---- arch/x86/Kconfig | 10 ++-------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c0746b2d3fff..4297fbca0faa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,8 +11,10 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -1916,14 +1918,6 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on HUGETLB_PAGE && MIGRATION - -config ARCH_ENABLE_THP_MIGRATION - def_bool y - depends on TRANSPARENT_HUGEPAGE - menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index cec1017813f8..4465b71b2bff 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -96,6 +96,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING @@ -420,10 +421,6 @@ config PPC_PKEY depends on PPC_BOOK3S_64 depends on PPC_MEM_KEYS || PPC_KUAP || PPC_KUEP -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION - config PPC_MMU_NOHASH def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 306f76fea837..ab581af756cd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,8 +60,10 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG + select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -2437,14 +2439,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y depends on X86_64 || X86_PAE -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on X86_64 && HUGETLB_PAGE && MIGRATION - -config ARCH_ENABLE_THP_MIGRATION - def_bool y - depends on X86_64 && TRANSPARENT_HUGEPAGE - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER -- cgit v1.2.3-58-ga151 From 66f24fa766e3a5a194a85af98ff454d8d94b59cf Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:25 -0700 Subject: mm: drop redundant ARCH_ENABLE_SPLIT_PMD_PTLOCK ARCH_ENABLE_SPLIT_PMD_PTLOCKS has duplicate definitions on platforms that subscribe it. Drop these redundant definitions and instead just select it on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-6-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Heiko Carstens [s390] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 4 +--- arch/powerpc/platforms/Kconfig.cputype | 5 +---- arch/s390/Kconfig | 4 +--- arch/x86/Kconfig | 5 +---- 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4297fbca0faa..e12fad2e2a15 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -14,6 +14,7 @@ config ARM64 select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -1074,9 +1075,6 @@ config HW_PERF_EVENTS config ARCH_HAS_FILTER_PGPROT def_bool y -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y if PGTABLE_LEVELS > 2 - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 4465b71b2bff..be0e29f18dd4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -97,6 +97,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_PMD_SPLIT_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING @@ -356,10 +357,6 @@ config SPE If in doubt, say Y here. -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - depends on PPC_BOOK3S_64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f8b356550daa..d72989591223 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -62,6 +62,7 @@ config S390 select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -628,9 +629,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - config MAX_PHYSMEM_BITS int "Maximum size of supported physical memory in bits (42-53)" range 42 53 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab581af756cd..0bbc157e4bc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,6 +63,7 @@ config X86 select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE @@ -2435,10 +2436,6 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - depends on X86_64 || X86_PAE - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER -- cgit v1.2.3-58-ga151 From e8003bf66a7a66d8ae3db2c40b2dca180bf942bb Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:29 -0700 Subject: mm: drop redundant HAVE_ARCH_TRANSPARENT_HUGEPAGE HAVE_ARCH_TRANSPARENT_HUGEPAGE has duplicate definitions on platforms that subscribe it. Drop these reduntant definitions and instead just select it on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-7-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Arnd Bergmann Acked-by: Vineet Gupta [arc] Cc: Russell King Cc: Albert Ou Cc: Alexander Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Rich Felker Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 5 +---- arch/arm/Kconfig | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index fab05f7189c0..2d98501c0897 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -29,6 +29,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4 select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_KMEMLEAK select HAVE_FUTEX_CMPXCHG if FUTEX @@ -84,10 +85,6 @@ config STACKTRACE_SUPPORT def_bool y select STACKTRACE -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - depends on ARC_MMU_V4 - menu "ARC Architecture Configuration" menu "ARC Platform/SoC/Board" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 13fa7ecf195c..24804f11302d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -78,6 +78,7 @@ config ARM select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE select HAVE_ARM_SMCCC if CPU_V7 select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CONTEXT_TRACKING @@ -1512,10 +1513,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - depends on ARM_LPAE - config ARCH_WANT_GENERAL_HUGETLB def_bool y -- cgit v1.2.3-58-ga151 From 2521781c1ebc6d26b7fbe9b7e9614fd2f38affb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 4 May 2021 18:38:32 -0700 Subject: mm/util.c: reduce mem_dump_obj() object size Simplify the code by using a temporary and reduce the object size by using a single call to pr_cont(). Reverse a test and unindent a block too. $ size mm/util.o* (defconfig x86-64) text data bss dec hex filename 7419 372 40 7831 1e97 mm/util.o.new 7477 372 40 7889 1ed1 mm/util.o.old Link: https://lkml.kernel.org/r/a6e105886338f68afd35f7a13d73bcf06b0cc732.camel@perches.com Signed-off-by: Joe Perches Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/util.c b/mm/util.c index 083c5c417cfc..972e7a0cda5e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -987,22 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) */ void mem_dump_obj(void *object) { + const char *type; + if (kmem_valid_obj(object)) { kmem_dump_obj(object); return; } + if (vmalloc_dump_obj(object)) return; - if (!virt_addr_valid(object)) { - if (object == NULL) - pr_cont(" NULL pointer.\n"); - else if (object == ZERO_SIZE_PTR) - pr_cont(" zero-size pointer.\n"); - else - pr_cont(" non-paged memory.\n"); - return; - } - pr_cont(" non-slab/vmalloc memory.\n"); + + if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif -- cgit v1.2.3-58-ga151 From 31454980b8b55b066ba0d6b8267313fcb94ea816 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Tue, 4 May 2021 18:38:35 -0700 Subject: mm/util.c: fix typo s/condtion/condition/ Link: https://lkml.kernel.org/r/20210317033439.3429411-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 972e7a0cda5e..a8bf17f18a81 100644 --- a/mm/util.c +++ b/mm/util.c @@ -765,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply - * with the strict "NEVER", and to avoid possible race condtion (even + * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch -- cgit v1.2.3-58-ga151 From c991ffef7bce85a5d4ebc503c06dfd6dd8e5dc52 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:38 -0700 Subject: mm/gup: don't pin migrated cma pages in movable zone Patch series "prohibit pinning pages in ZONE_MOVABLE", v11. When page is pinned it cannot be moved and its physical address stays the same until pages is unpinned. This is useful functionality to allows userland to implementation DMA access. For example, it is used by vfio in vfio_pin_pages(). However, this functionality breaks memory hotplug/hotremove assumptions that pages in ZONE_MOVABLE can always be migrated. This patch series fixes this issue by forcing new allocations during page pinning to omit ZONE_MOVABLE, and also to migrate any existing pages from ZONE_MOVABLE during pinning. It uses the same scheme logic that is currently used by CMA, and extends the functionality for all allocations. For more information read the discussion [1] about this problem. [1] https://lore.kernel.org/lkml/CA+CK2bBffHBxjmb9jmSKacm0fJMinyt3Nhk8Nx6iudcQSj80_w@mail.gmail.com This patch (of 14): In order not to fragment CMA the pinned pages are migrated. However, they are migrated to ZONE_MOVABLE, which also should not have pinned pages. Remove __GFP_MOVABLE, so pages can be migrated to zones where pinning is allowed. Link: https://lkml.kernel.org/r/20210215161349.246722-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20210215161349.246722-2-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: Dan Williams Cc: Sasha Levin Cc: Tyler Hicks Cc: Joonsoo Kim Cc: Mike Kravetz Cc: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Peter Zijlstra Cc: Mel Gorman Cc: Matthew Wilcox Cc: David Rientjes Cc: John Hubbard Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 71e546e279fc..b63e2c0df3c7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1616,7 +1616,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, long ret = nr_pages; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN, + .gfp_mask = GFP_USER | __GFP_NOWARN, }; check_again: -- cgit v1.2.3-58-ga151 From 83c02c23d0747a7bdcd71f99a538aacec94b146c Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:42 -0700 Subject: mm/gup: check every subpage of a compound page during isolation When pages are isolated in check_and_migrate_movable_pages() we skip compound number of pages at a time. However, as Jason noted, it is not necessary correct that pages[i] corresponds to the pages that we skipped. This is because it is possible that the addresses in this range had split_huge_pmd()/split_huge_pud(), and these functions do not update the compound page metadata. The problem can be reproduced if something like this occurs: 1. User faulted huge pages. 2. split_huge_pmd() was called for some reason 3. User has unmapped some sub-pages in the range 4. User tries to longterm pin the addresses. The resulting pages[i] might end-up having pages which are not compound size page aligned. Link: https://lkml.kernel.org/r/20210215161349.246722-3-pasha.tatashin@soleen.com Fixes: aa712399c1e8 ("mm/gup: speed up check_and_migrate_cma_pages() on huge page") Signed-off-by: Pavel Tatashin Reported-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b63e2c0df3c7..16015718c0df 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1609,26 +1609,23 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned int gup_flags) { unsigned long i; - unsigned long step; bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); long ret = nr_pages; + struct page *prev_head, *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; check_again: - for (i = 0; i < nr_pages;) { - - struct page *head = compound_head(pages[i]); - - /* - * gup may start from a tail page. Advance step by the left - * part. - */ - step = compound_nr(head) - (pages[i] - head); + prev_head = NULL; + for (i = 0; i < nr_pages; i++) { + head = compound_head(pages[i]); + if (head == prev_head) + continue; + prev_head = head; /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out @@ -1652,8 +1649,6 @@ check_again: } } } - - i += step; } if (!list_empty(&cma_page_list)) { -- cgit v1.2.3-58-ga151 From f0f4463837da17a89d965dcbe4e411629dbcf308 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:46 -0700 Subject: mm/gup: return an error on migration failure When migration failure occurs, we still pin pages, which means that we may pin CMA movable pages which should never be the case. Instead return an error without pinning pages when migration failure happens. No need to retry migrating, because migrate_pages() already retries 10 times. Link: https://lkml.kernel.org/r/20210215161349.246722-4-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 16015718c0df..da08c582c216 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1610,7 +1610,6 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, { unsigned long i; bool drain_allow = true; - bool migrate_allow = true; LIST_HEAD(cma_page_list); long ret = nr_pages; struct page *prev_head, *head; @@ -1661,17 +1660,15 @@ check_again: for (i = 0; i < nr_pages; i++) put_page(pages[i]); - if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { - /* - * some of the pages failed migration. Do get_user_pages - * without migration. - */ - migrate_allow = false; - + ret = migrate_pages(&cma_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_CONTIG_RANGE); + if (ret) { if (!list_empty(&cma_page_list)) putback_movable_pages(&cma_page_list); + return ret > 0 ? -ENOMEM : ret; } + /* * We did migrate all the pages, Try to get the page references * again migrating any new CMA pages which we failed to isolate @@ -1681,7 +1678,7 @@ check_again: pages, vmas, NULL, gup_flags); - if ((ret > 0) && migrate_allow) { + if (ret > 0) { nr_pages = ret; drain_allow = true; goto check_again; -- cgit v1.2.3-58-ga151 From 6e7f34ebb8d25d71ce7f4580ba3cbfc10b895580 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:49 -0700 Subject: mm/gup: check for isolation errors It is still possible that we pin movable CMA pages if there are isolation errors and cma_page_list stays empty when we check again. Check for isolation errors, and return success only when there are no isolation errors, and cma_page_list is empty after checking. Because isolation errors are transient, we retry indefinitely. Link: https://lkml.kernel.org/r/20210215161349.246722-5-pasha.tatashin@soleen.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 60 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index da08c582c216..3a00f6a8ffd6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1608,8 +1608,8 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long i; - bool drain_allow = true; + unsigned long i, isolation_error_count; + bool drain_allow; LIST_HEAD(cma_page_list); long ret = nr_pages; struct page *prev_head, *head; @@ -1620,6 +1620,8 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, check_again: prev_head = NULL; + isolation_error_count = 0; + drain_allow = true; for (i = 0; i < nr_pages; i++) { head = compound_head(pages[i]); if (head == prev_head) @@ -1631,25 +1633,35 @@ check_again: * of the CMA zone if possible. */ if (is_migrate_cma_page(head)) { - if (PageHuge(head)) - isolate_huge_page(head, &cma_page_list); - else { + if (PageHuge(head)) { + if (!isolate_huge_page(head, &cma_page_list)) + isolation_error_count++; + } else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; } - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, &cma_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; } + list_add_tail(&head->lru, &cma_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_lru(head), + thp_nr_pages(head)); } } } + /* + * If list is empty, and no isolation errors, means that all pages are + * in the correct zone. + */ + if (list_empty(&cma_page_list) && !isolation_error_count) + return ret; + if (!list_empty(&cma_page_list)) { /* * drop the above get_user_pages reference. @@ -1669,23 +1681,19 @@ check_again: return ret > 0 ? -ENOMEM : ret; } - /* - * We did migrate all the pages, Try to get the page references - * again migrating any new CMA pages which we failed to isolate - * earlier. - */ - ret = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, - gup_flags); - - if (ret > 0) { - nr_pages = ret; - drain_allow = true; - goto check_again; - } + /* We unpinned pages before migration, pin them again */ + ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + if (ret <= 0) + return ret; + nr_pages = ret; } - return ret; + /* + * check again because pages were unpinned, and we also might have + * had isolation errors and need more pages to migrate. + */ + goto check_again; } #else static long check_and_migrate_cma_pages(struct mm_struct *mm, -- cgit v1.2.3-58-ga151 From 1a08ae36cf8b5f26d0c64ebfe46f8eb07ea0b678 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:53 -0700 Subject: mm cma: rename PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN PF_MEMALLOC_NOCMA is used ot guarantee that the allocator will not return pages that might belong to CMA region. This is currently used for long term gup to make sure that such pins are not going to be done on any CMA pages. When PF_MEMALLOC_NOCMA has been introduced we haven't realized that it is focusing on CMA pages too much and that there is larger class of pages that need the same treatment. MOVABLE zone cannot contain any long term pins as well so it makes sense to reuse and redefine this flag for that usecase as well. Rename the flag to PF_MEMALLOC_PIN which defines an allocation context which can only get pages suitable for long-term pins. Also rename: memalloc_nocma_save()/memalloc_nocma_restore to memalloc_pin_save()/memalloc_pin_restore() and make the new functions common. [rppt@linux.ibm.com: fix renaming of PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN] Link: https://lkml.kernel.org/r/20210331163816.11517-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210215161349.246722-6-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Acked-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/sched/mm.h | 21 +++++---------------- mm/gup.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 4 ++-- 5 files changed, 12 insertions(+), 23 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c25c8e67030..d2c881384517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1583,7 +1583,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 90b2a0bce11c..ae654819e8aa 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -271,29 +271,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) +static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + unsigned int flags = current->flags & PF_MEMALLOC_PIN; - current->flags |= PF_MEMALLOC_NOCMA; + current->flags |= PF_MEMALLOC_PIN; return flags; } -static inline void memalloc_nocma_restore(unsigned int flags) +static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; + current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } -#else -static inline unsigned int memalloc_nocma_save(void) -{ - return 0; -} - -static inline void memalloc_nocma_restore(unsigned int flags) -{ -} -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); diff --git a/mm/gup.c b/mm/gup.c index 3a00f6a8ffd6..a1eff7ad31da 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1722,7 +1722,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, long rc; if (gup_flags & FOLL_LONGTERM) - flags = memalloc_nocma_save(); + flags = memalloc_pin_save(); rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); @@ -1731,7 +1731,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (rc > 0) rc = check_and_migrate_cma_pages(mm, start, rc, pages, vmas, gup_flags); - memalloc_nocma_restore(flags); + memalloc_pin_restore(flags); } return rc; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 533e5a26e437..60dc197a4417 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1079,11 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && is_migrate_cma_page(page)) continue; if (PageHWPoison(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d12299c08b95..c2fc6a64bef9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3865,8 +3865,8 @@ static inline unsigned int current_alloc_flags(gfp_t gfp_mask, #ifdef CONFIG_CMA unsigned int pflags = current->flags; - if (!(pflags & PF_MEMALLOC_NOCMA) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (!(pflags & PF_MEMALLOC_PIN) && + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -- cgit v1.2.3-58-ga151 From da6df1b0fcfa97b2e3394df8622128bb810e1093 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:57 -0700 Subject: mm: apply per-task gfp constraints in fast path Function current_gfp_context() is called after fast path. However, soon we will add more constraints which will also limit zones based on context. Move this call into fast path, and apply the correct constraints for all allocations. Also update .reclaim_idx based on value returned by current_gfp_context() because it soon will modify the allowed zones. Note: With this patch we will do one extra current->flags load during fast path, but we already load current->flags in fast-path: __alloc_pages() prepare_alloc_pages() current_alloc_flags(gfp_mask, *alloc_flags); Later, when we add the zone constrain logic to current_gfp_context() we will be able to remove current->flags load from current_alloc_flags, and therefore return fast-path to the current performance level. Link: https://lkml.kernel.org/r/20210215161349.246722-7-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2fc6a64bef9..3c55eaafede1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5180,6 +5180,13 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, } gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. + */ + gfp = current_gfp_context(gfp); alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) @@ -5196,13 +5203,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, if (likely(page)) goto out; - /* - * Apply scoped allocation constraints. This is mainly about GFP_NOFS - * resp. GFP_NOIO which has to be inherited for all allocation requests - * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. - */ - alloc_gfp = current_gfp_context(gfp); + alloc_gfp = gfp; ac.spread_dirty_pages = false; /* -- cgit v1.2.3-58-ga151 From 8e3560d963d22ba41857f48e4114ce80373144ea Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:00 -0700 Subject: mm: honor PF_MEMALLOC_PIN for all movable pages PF_MEMALLOC_PIN is only honored for CMA pages, extend this flag to work for any allocations from ZONE_MOVABLE by removing __GFP_MOVABLE from gfp_mask when this flag is passed in the current context. Add is_pinnable_page() to return true if page is in a pinnable page. A pinnable page is not in ZONE_MOVABLE and not of MIGRATE_CMA type. Link: https://lkml.kernel.org/r/20210215161349.246722-8-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ include/linux/sched/mm.h | 6 +++++- mm/hugetlb.c | 2 +- mm/page_alloc.c | 20 +++++++++----------- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dbb53c44243..d0e628f511e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1141,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} + #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); @@ -1550,6 +1555,19 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_pinnable_page(struct page *page) +{ + return !is_zone_movable_page(page) && !is_migrate_cma_page(page); +} +#else +static inline bool is_pinnable_page(struct page *page) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ae654819e8aa..e24b1fe348e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 60dc197a4417..629aa4c2259c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1083,7 +1083,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c55eaafede1..81db38926266 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3859,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; } -static inline unsigned int current_alloc_flags(gfp_t gfp_mask, - unsigned int alloc_flags) +/* Must be called after current_gfp_context() which can change gfp_mask */ +static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, + unsigned int alloc_flags) { #ifdef CONFIG_CMA - unsigned int pflags = current->flags; - - if (!(pflags & PF_MEMALLOC_PIN) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; - #endif return alloc_flags; } @@ -4526,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); return alloc_flags; } @@ -4828,7 +4825,7 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4997,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -5184,7 +5181,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * Apply scoped allocation constraints. This is mainly about GFP_NOFS * resp. GFP_NOIO which has to be inherited for all allocation requests * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. */ gfp = current_gfp_context(gfp); alloc_gfp = gfp; -- cgit v1.2.3-58-ga151 From 9afaf30f7a1aab2022961715a66f644275b8daec Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:04 -0700 Subject: mm/gup: do not migrate zero page On some platforms ZERO_PAGE(0) might end-up in a movable zone. Do not migrate zero page in gup during longterm pinning as migration of zero page is not allowed. For example, in x86 QEMU with 16G of memory and kernelcore=5G parameter, I see the following: Boot#1: zero_pfn 0x48a8d zero_pfn zone: ZONE_DMA32 Boot#2: zero_pfn 0x20168d zero_pfn zone: ZONE_MOVABLE On x86, empty_zero_page is declared in .bss and depending on the loader may end up in different physical locations during boots. Also, move is_zero_pfn() my_zero_pfn() functions under CONFIG_MMU, because zero_pfn that they are using is declared in memory.c which is compiled with CONFIG_MMU. Link: https://lkml.kernel.org/r/20210215161349.246722-9-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 4 ++++ include/linux/pgtable.h | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d0e628f511e4..76e27ebb28a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1559,7 +1559,8 @@ static inline unsigned long page_to_section(const struct page *page) #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !is_zone_movable_page(page) && !is_migrate_cma_page(page); + return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || + is_zero_pfn(page_to_pfn(page)); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3b2205741048..92b44149d5b9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -427,6 +427,10 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create + * situations where ZERO_PAGE(0) which is allocated differently + * on different platforms may end up in a movable zone. ZERO_PAGE(0) + * cannot be migrated. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..2194a9cd885c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif +#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr) return zero_pfn; } #endif +#else +static inline int is_zero_pfn(unsigned long pfn) +{ + return 0; +} + +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_MMU -- cgit v1.2.3-58-ga151 From d1e153fea2a8940273174fc17733c44323d35cd5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:08 -0700 Subject: mm/gup: migrate pinned pages out of movable zone We should not pin pages in ZONE_MOVABLE. Currently, we do not pin only movable CMA pages. Generalize the function that migrates CMA pages to migrate all movable pages. Use is_pinnable_page() to check which pages need to be migrated Link: https://lkml.kernel.org/r/20210215161349.246722-10-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 + include/linux/mmzone.h | 9 ++++-- include/trace/events/migrate.h | 3 +- mm/gup.c | 67 +++++++++++++++++++++--------------------- 4 files changed, 44 insertions(+), 36 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 175ef15ae9e8..4bb4e519e3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -27,6 +27,7 @@ enum migrate_reason { MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, MR_CONTIG_RANGE, + MR_LONGTERM_PIN, MR_TYPES }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92b44149d5b9..e8922a67d1a4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -407,8 +407,13 @@ enum zone_type { * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might - * essentially turn such pages unmovable. Memory offlining might - * retry a long time. + * essentially turn such pages unmovable. Therefore, we do not allow + * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and + * faulted, they come from the right zone right away. However, it is + * still possible that address space already has pages in + * ZONE_MOVABLE at the time when pages are pinned (i.e. user has + * touches that memory before pinning). In such case we migrate them + * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index f2c990603888..9fb2a3bbcdfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,8 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CONTIG_RANGE, "contig_range") + EM( MR_CONTIG_RANGE, "contig_range") \ + EMe(MR_LONGTERM_PIN, "longterm_pin") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/gup.c b/mm/gup.c index a1eff7ad31da..4bc57420f535 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, int orig_refs = refs; /* - * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast - * path, so fail and let the caller fall back to the slow path. + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. */ - if (unlikely(flags & FOLL_LONGTERM) && - is_migrate_cma_page(page)) + if (unlikely((flags & FOLL_LONGTERM) && + !is_pinnable_page(page))) return NULL; /* @@ -1600,17 +1601,17 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#ifdef CONFIG_CMA -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +#ifdef CONFIG_MIGRATION +static long check_and_migrate_movable_pages(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { unsigned long i, isolation_error_count; bool drain_allow; - LIST_HEAD(cma_page_list); + LIST_HEAD(movable_page_list); long ret = nr_pages; struct page *prev_head, *head; struct migration_target_control mtc = { @@ -1628,13 +1629,12 @@ check_again: continue; prev_head = head; /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. + * If we get a movable page, since we are going to be pinning + * these entries, try to move them out if possible. */ - if (is_migrate_cma_page(head)) { + if (!is_pinnable_page(head)) { if (PageHuge(head)) { - if (!isolate_huge_page(head, &cma_page_list)) + if (!isolate_huge_page(head, &movable_page_list)) isolation_error_count++; } else { if (!PageLRU(head) && drain_allow) { @@ -1646,7 +1646,7 @@ check_again: isolation_error_count++; continue; } - list_add_tail(&head->lru, &cma_page_list); + list_add_tail(&head->lru, &movable_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_lru(head), @@ -1659,10 +1659,10 @@ check_again: * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ - if (list_empty(&cma_page_list) && !isolation_error_count) + if (list_empty(&movable_page_list) && !isolation_error_count) return ret; - if (!list_empty(&cma_page_list)) { + if (!list_empty(&movable_page_list)) { /* * drop the above get_user_pages reference. */ @@ -1672,12 +1672,12 @@ check_again: for (i = 0; i < nr_pages; i++) put_page(pages[i]); - ret = migrate_pages(&cma_page_list, alloc_migration_target, + ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_CONTIG_RANGE); + MR_LONGTERM_PIN); if (ret) { - if (!list_empty(&cma_page_list)) - putback_movable_pages(&cma_page_list); + if (!list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); return ret > 0 ? -ENOMEM : ret; } @@ -1696,16 +1696,16 @@ check_again: goto check_again; } #else -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +static long check_and_migrate_movable_pages(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } -#endif /* CONFIG_CMA */ +#endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -1729,8 +1729,9 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (gup_flags & FOLL_LONGTERM) { if (rc > 0) - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas, gup_flags); + rc = check_and_migrate_movable_pages(mm, start, rc, + pages, vmas, + gup_flags); memalloc_pin_restore(flags); } return rc; -- cgit v1.2.3-58-ga151 From fa965fd54827a6b6967602051736da9c163b79b7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:12 -0700 Subject: memory-hotplug.rst: add a note about ZONE_MOVABLE and page pinning Document the special handling of page pinning when ZONE_MOVABLE present. Link: https://lkml.kernel.org/r/20210215161349.246722-11-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Suggested-by: David Hildenbrand Acked-by: Michal Hocko Cc: Dan Williams Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/memory-hotplug.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 5307f90738aa..05d51d2d8beb 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -357,6 +357,15 @@ creates ZONE_MOVABLE as following. Unfortunately, there is no information to show which memory block belongs to ZONE_MOVABLE. This is TBD. +.. note:: + Techniques that rely on long-term pinnings of memory (especially, RDMA and + vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory + hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that + memory can still get hot removed - be aware that pinning can fail even if + there is plenty of free memory in ZONE_MOVABLE. In addition, using + ZONE_MOVABLE might make page pinning more expensive, because pages have to be + migrated off that zone first. + .. _memory_hotplug_how_to_offline_memory: How to offline memory -- cgit v1.2.3-58-ga151 From 24dc20c75f937b8f5c432e38275e70a1611766e9 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:15 -0700 Subject: mm/gup: change index type to long as it counts pages In __get_user_pages_locked() i counts number of pages which should be long, as long is used in all other places to contain number of pages, and 32-bit becomes increasingly small for handling page count proportional values. Link: https://lkml.kernel.org/r/20210215161349.246722-12-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 4bc57420f535..91629b5b0636 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1528,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; unsigned long vm_flags; - int i; + long i; /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. -- cgit v1.2.3-58-ga151 From f68749ec342b5f2c18b3af3435714d9f653736c3 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:19 -0700 Subject: mm/gup: longterm pin migration cleanup When pages are longterm pinned, we must migrated them out of movable zone. The function that migrates them has a hidden loop with goto. The loop is to retry on isolation failures, and after successful migration. Make this code better by moving this loop to the caller. Link: https://lkml.kernel.org/r/20210215161349.246722-13-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 93 ++++++++++++++++++++++++++-------------------------------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 91629b5b0636..aa09535cf4d4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1602,27 +1602,28 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION -static long check_and_migrate_movable_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, +/* + * Check whether all pages are pinnable, if so return number of pages. If some + * pages are not pinnable, migrate them, and unpin all pages. Return zero if + * pages were migrated, or if some pages were not successfully isolated. + * Return negative error if migration fails. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long i, isolation_error_count; - bool drain_allow; + unsigned long i; + unsigned long isolation_error_count = 0; + bool drain_allow = true; LIST_HEAD(movable_page_list); - long ret = nr_pages; - struct page *prev_head, *head; + long ret = 0; + struct page *prev_head = NULL; + struct page *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; -check_again: - prev_head = NULL; - isolation_error_count = 0; - drain_allow = true; for (i = 0; i < nr_pages; i++) { head = compound_head(pages[i]); if (head == prev_head) @@ -1660,47 +1661,27 @@ check_again: * in the correct zone. */ if (list_empty(&movable_page_list) && !isolation_error_count) - return ret; + return nr_pages; + if (gup_flags & FOLL_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + } if (!list_empty(&movable_page_list)) { - /* - * drop the above get_user_pages reference. - */ - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, nr_pages); - else - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN); - if (ret) { - if (!list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); - return ret > 0 ? -ENOMEM : ret; - } - - /* We unpinned pages before migration, pin them again */ - ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); - if (ret <= 0) - return ret; - nr_pages = ret; + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); } - /* - * check again because pages were unpinned, and we also might have - * had isolation errors and need more pages to migrate. - */ - goto check_again; + return ret > 0 ? -ENOMEM : ret; } #else -static long check_and_migrate_movable_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, +static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, unsigned int gup_flags) { return nr_pages; @@ -1718,22 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long flags = 0; + unsigned int flags; long rc; - if (gup_flags & FOLL_LONGTERM) - flags = memalloc_pin_save(); - - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, - gup_flags); + if (!(gup_flags & FOLL_LONGTERM)) + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + flags = memalloc_pin_save(); + do { + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + if (rc <= 0) + break; + rc = check_and_migrate_movable_pages(rc, pages, gup_flags); + } while (!rc); + memalloc_pin_restore(flags); - if (gup_flags & FOLL_LONGTERM) { - if (rc > 0) - rc = check_and_migrate_movable_pages(mm, start, rc, - pages, vmas, - gup_flags); - memalloc_pin_restore(flags); - } return rc; } -- cgit v1.2.3-58-ga151 From 79dbf135e2481eaa77b172d88c343bf85e021545 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:23 -0700 Subject: selftests/vm: gup_test: fix test flag In gup_test both gup_flags and test_flags use the same flags field. This is broken. Farther, in the actual gup_test.c all the passed gup_flags are erased and unconditionally replaced with FOLL_WRITE. Which means that test_flags are ignored, and code like this always performs pin dump test: 155 if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) 156 nr = pin_user_pages(addr, nr, gup->flags, 157 pages + i, NULL); 158 else 159 nr = get_user_pages(addr, nr, gup->flags, 160 pages + i, NULL); 161 break; Add a new test_flags field, to allow raw gup_flags to work. Add a new subcommand for DUMP_USER_PAGES_TEST to specify that pin test should be performed. Remove unconditional overwriting of gup_flags via FOLL_WRITE. But, preserve the previous behaviour where FOLL_WRITE was the default flag, and add a new option "-W" to unset FOLL_WRITE. Rename flags with gup_flags. With the fix, dump works like this: root@virtme:/# gup_test -c ---- page #0, starting from user virt addr: 0x7f8acb9e4000 page:00000000d3d2ee27 refcount:2 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x100bcf anon flags: 0x300000000080016(referenced|uptodate|lru|swapbacked) raw: 0300000000080016 ffffd0e204021608 ffffd0e208df2e88 ffff8ea04243ec61 raw: 0000000000000000 0000000000000000 0000000200000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done root@virtme:/# gup_test -c -p ---- page #0, starting from user virt addr: 0x7fd19701b000 page:00000000baed3c7d refcount:1025 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x108008 anon flags: 0x300000000080014(uptodate|lru|swapbacked) raw: 0300000000080014 ffffd0e204200188 ffffd0e205e09088 ffff8ea04243ee71 raw: 0000000000000000 0000000000000000 0000040100000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done Refcount shows the difference between pin vs no-pin case. Also change type of nr from int to long, as it counts number of pages. Link: https://lkml.kernel.org/r/20210215161349.246722-14-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 23 ++++++++++------------- mm/gup_test.h | 3 ++- tools/testing/selftests/vm/gup_test.c | 15 +++++++++++---- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/mm/gup_test.c b/mm/gup_test.c index e3cf78e5873e..a6ed1c877679 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -94,7 +94,7 @@ static int __gup_test_ioctl(unsigned int cmd, { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; - int nr; + long nr; struct page **pages; int ret = 0; bool needs_mmap_lock = @@ -126,37 +126,34 @@ static int __gup_test_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - /* Filter out most gup flags: only allow a tiny subset here: */ - gup->flags &= FOLL_WRITE; - switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags, + nr = get_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->flags, pages + i, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_FAST_BENCHMARK: - nr = pin_user_pages_fast(addr, nr, gup->flags, + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->flags, pages + i, + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, + gup->gup_flags | FOLL_LONGTERM, pages + i, NULL); break; case DUMP_USER_PAGES_TEST: - if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) - nr = pin_user_pages(addr, nr, gup->flags, + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); else - nr = get_user_pages(addr, nr, gup->flags, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; default: @@ -187,7 +184,7 @@ static int __gup_test_ioctl(unsigned int cmd, start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages, gup->flags); + put_back_pages(cmd, pages, nr_pages, gup->test_flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); diff --git a/mm/gup_test.h b/mm/gup_test.h index 90a6713d50eb..887ac1d5f5bc 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -21,7 +21,8 @@ struct gup_test { __u64 addr; __u64 size; __u32 nr_pages_per_call; - __u32 flags; + __u32 gup_flags; + __u32 test_flags; /* * Each non-zero entry is the number of the page (1-based: first page is * page 1, so that zero entries mean "do nothing") from the .addr base. diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6c6336dd3b7f..943cc2608dc2 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -37,13 +37,13 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; int flags = MAP_PRIVATE; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -65,9 +65,13 @@ int main(int argc, char **argv) */ gup.which_pages[0] = 1; break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; case 'F': /* strtol, so you can pass flags in hex form */ - gup.flags = strtol(optarg, 0, 0); + gup.gup_flags = strtol(optarg, 0, 0); break; case 'm': size = atoi(optarg) * MB; @@ -93,6 +97,9 @@ int main(int argc, char **argv) case 'w': write = 1; break; + case 'W': + write = 0; + break; case 'f': file = optarg; break; @@ -140,7 +147,7 @@ int main(int argc, char **argv) gup.nr_pages_per_call = nr_pages; if (write) - gup.flags |= FOLL_WRITE; + gup.gup_flags |= FOLL_WRITE; fd = open("/sys/kernel/debug/gup_test", O_RDWR); if (fd == -1) { -- cgit v1.2.3-58-ga151 From e44605a8b1aa13d892addc59ec3d416cb186c77b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:27 -0700 Subject: selftests/vm: gup_test: test faulting in kernel, and verify pinnable pages When pages are pinned they can be faulted in userland and migrated, and they can be faulted right in kernel without migration. In either case, the pinned pages must end-up being pinnable (not movable). Add a new test to gup_test, to help verify that the gup/pup (get_user_pages() / pin_user_pages()) behavior with respect to pinnable and movable pages is reasonable and correct. Specifically, provide a way to: 1) Verify that only "pinnable" pages are pinned. This is checked automatically for you. 2) Verify that gup/pup performance is reasonable. This requires comparing benchmarks between doing gup/pup on pages that have been pre-faulted in from user space, vs. doing gup/pup on pages that are not faulted in until gup/pup time (via FOLL_TOUCH). This decision is controlled with the new -z command line option. Link: https://lkml.kernel.org/r/20210215161349.246722-15-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 6 ++++++ tools/testing/selftests/vm/gup_test.c | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/mm/gup_test.c b/mm/gup_test.c index a6ed1c877679..d974dec19e1c 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; } } break; diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 943cc2608dc2..1e662d59c502 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -13,6 +13,7 @@ /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ static char *cmd_to_str(unsigned long cmd) { @@ -39,11 +40,11 @@ int main(int argc, char **argv) unsigned long size = 128 * MB; int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; - int flags = MAP_PRIVATE; + int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -110,6 +111,10 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; default: return -1; } @@ -167,8 +172,18 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + } /* Only report timing information on the *_BENCHMARK commands: */ if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || -- cgit v1.2.3-58-ga151 From 8ca559132a2d9b56732d35e2b947af96acb9b80b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 4 May 2021 18:39:30 -0700 Subject: mm/memory_hotplug: remove broken locking of zone PCP structures during hot remove zone_pcp_reset allegedly protects against a race with drain_pages using local_irq_save but this is bogus. local_irq_save only operates on the local CPU. If memory hotplug is running on CPU A and drain_pages is running on CPU B, disabling IRQs on CPU A does not affect CPU B and offers no protection. This patch deletes IRQ disable/enable on the grounds that IRQs protect nothing and assumes the existing hotplug paths guarantees the PCP cannot be used after zone_pcp_enable(). That should be the case already because all the pages have been freed and there is no page to put on the PCP lists. Link: https://lkml.kernel.org/r/20210412090346.GQ3697@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: "Michael S. Tsirkin" Cc: Vlastimil Babka Cc: Alexander Duyck Cc: Minchan Kim Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81db38926266..b012805a11ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9020,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { - unsigned long flags; int cpu; struct per_cpu_pageset *pset; - /* avoid races with drain_pages() */ - local_irq_save(flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -9034,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v1.2.3-58-ga151 From 8736cc2d002f14e90d2b33bc5bef1740f6275ba4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:33 -0700 Subject: drivers/base/memory: introduce memory_block_{online,offline} Patch series "Allocate memmap from hotadded memory (per device)", v10. The primary goal of this patchset is to reduce memory overhead of the hot-added memory (at least for SPARSEMEM_VMEMMAP memory model). The current way we use to populate memmap (struct page array) has two main drawbacks: a) it consumes an additional memory until the hotadded memory itself is onlined and b) memmap might end up on a different numa node which is especially true for movable_node configuration. c) due to fragmentation we might end up populating memmap with base pages One way to mitigate all these issues is to simply allocate memmap array (which is the largest memory footprint of the physical memory hotplug) from the hot-added memory itself. SPARSEMEM_VMEMMAP memory model allows us to map any pfn range so the memory doesn't need to be online to be usable for the array. See patch 4 for more details. This feature is only usable when CONFIG_SPARSEMEM_VMEMMAP is set. [Overall design]: Implementation wise we reuse vmem_altmap infrastructure to override the default allocator used by vmemap_populate. memory_block structure gains a new field called nr_vmemmap_pages, which accounts for the number of vmemmap pages used by that memory_block. E.g: On x86_64, that is 512 vmemmap pages on small memory bloks and 4096 on large memory blocks (1GB) We also introduce new two functions: memory_block_{online,offline}. These functions take care of initializing/unitializing vmemmap pages prior to calling {online,offline}_pages, so the latter functions can remain totally untouched. More details can be found in the respective changelogs. This patch (of 8): This is a preparatory patch that introduces two new functions: memory_block_online() and memory_block_offline(). For now, these functions will only call online_pages() and offline_pages() respectively, but they will be later in charge of preparing the vmemmap pages, carrying out the initialization and proper accounting of such pages. Since memory_block struct contains all the information, pass this struct down the chain till the end functions. Link: https://lkml.kernel.org/r/20210421102701.25051-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210421102701.25051-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f35298425575..f209925a5d4e 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -169,30 +169,41 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +static int memory_block_online(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); +} + +static int memory_block_offline(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return offline_pages(start_pfn, nr_pages); +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type, int nid) +memory_block_action(struct memory_block *mem, unsigned long action) { - unsigned long start_pfn; - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(start_section_nr); - switch (action) { case MEM_ONLINE: - ret = online_pages(start_pfn, nr_pages, online_type, nid); + ret = memory_block_online(mem); break; case MEM_OFFLINE: - ret = offline_pages(start_pfn, nr_pages); + ret = memory_block_offline(mem); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, start_section_nr, action, action); + "%ld\n", __func__, mem->start_section_nr, action, action); ret = -EINVAL; } @@ -210,9 +221,7 @@ static int memory_block_change_state(struct memory_block *mem, if (to_state == MEM_OFFLINE) mem->state = MEM_GOING_OFFLINE; - ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type, mem->nid); - + ret = memory_block_action(mem, to_state); mem->state = ret ? from_state_req : to_state; return ret; -- cgit v1.2.3-58-ga151 From dd8e2f230d82ecd60504fba48bb10bf3760b674e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:36 -0700 Subject: mm,memory_hotplug: relax fully spanned sections check We want {online,offline}_pages to operate on whole memblocks, but memmap_on_memory will poke pageblock_nr_pages aligned holes in the beginning, which is a special case we want to allow. Relax the check to account for that case. Link: https://lkml.kernel.org/r/20210421102701.25051-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 729fba144c71..b8467faf2f69 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -838,9 +838,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1573,9 +1580,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); -- cgit v1.2.3-58-ga151 From f9901144e48f6a7ba186249add705d10e74738ec Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 4 May 2021 18:39:39 -0700 Subject: mm,memory_hotplug: factor out adjusting present pages into adjust_present_page_count() Let's have a single place (inspired by adjust_managed_page_count()) where we adjust present pages. In contrast to adjust_managed_page_count(), only memory onlining or offlining is allowed to modify the number of present pages. Link: https://lkml.kernel.org/r/20210421102701.25051-4-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8467faf2f69..04f01fabc150 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -829,6 +829,16 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } +static void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid) { @@ -884,11 +894,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1706,11 +1712,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); -- cgit v1.2.3-58-ga151 From a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:42 -0700 Subject: mm,memory_hotplug: allocate memmap from the added memory range Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) an existing memory is consumed for that purpose (eg: ~2MB per 128MB memory section on x86_64) This can even lead to extreme cases where system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages which has performance drawbacks. c) It might be there are no PMD_ALIGNED chunks so memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obviously things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered a garbage until it is onlined. For example hibernation wouldn't save the content of those vmmemmaps into the image so it wouldn't be restored on resume but this should be OK as there no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from zhe zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succedeed. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages() before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone become empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 72 ++++++++++++++++-- include/linux/memory.h | 8 +- include/linux/memory_hotplug.h | 15 +++- include/linux/memremap.h | 2 +- include/linux/mmzone.h | 7 +- mm/Kconfig | 5 ++ mm/memory_hotplug.c | 161 ++++++++++++++++++++++++++++++++++++++--- mm/sparse.c | 2 - 8 files changed, 250 insertions(+), 22 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f209925a5d4e..b31b3af5c490 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -173,16 +173,73 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + + /* + * Although vmemmap pages have a different lifecycle than the pages + * they describe (they remain until the memory is unplugged), doing + * their initialization and accounting at memory onlining/offlining + * stage helps to keep accounting easier to follow - e.g vmemmaps + * belong to the same zone as the memory they backed. + */ + if (nr_vmemmap_pages) { + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + if (ret) + return ret; + } + + ret = online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + return ret; + } + + /* + * Account once onlining succeeded. If the zone was unpopulated, it is + * now already properly populated. + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); - return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); + return ret; } static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + + /* + * Unaccount before offlining, such that unpopulated zone and kthreads + * can properly be torn down in offline_pages(). + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, -nr_vmemmap_pages); - return offline_pages(start_pfn, nr_pages); + ret = offline_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages); + if (ret) { + /* offline_pages() failed. Account back. */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + return ret; + } + + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + + return ret; } /* @@ -576,7 +633,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(unsigned long block_id, unsigned long state) +static int init_memory_block(unsigned long block_id, unsigned long state, + unsigned long nr_vmemmap_pages) { struct memory_block *mem; int ret = 0; @@ -593,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; + mem->nr_vmemmap_pages = nr_vmemmap_pages; ret = register_memory(mem); @@ -612,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE); + MEM_ONLINE, 0); } static void unregister_memory(struct memory_block *memory) @@ -634,7 +693,8 @@ static void unregister_memory(struct memory_block *memory) * * Called under device_hotplug_lock. */ -int create_memory_block_devices(unsigned long start, unsigned long size) +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -647,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20..97e92e8b556a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7288aa5ef73b..28f32fd00fe9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -55,6 +55,14 @@ typedef int __bitwise mhp_t; */ #define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca..45a79da89c5f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e8922a67d1a4..917bd6c604d5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -436,6 +436,11 @@ enum zone_type { * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. + * 7. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) @@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { diff --git a/mm/Kconfig b/mm/Kconfig index fe4897c3c81b..02d44e3420f5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -188,6 +188,11 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 04f01fabc150..0b3157836814 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,8 @@ #include "internal.h" #include "shuffle.h" +static bool memmap_on_memory; + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -648,9 +650,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. */ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -829,7 +838,11 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -static void adjust_present_page_count(struct zone *zone, long nr_pages) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) { unsigned long flags; @@ -839,12 +852,54 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. + */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; @@ -863,7 +918,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -1077,6 +1131,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. + * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). @@ -1086,6 +1179,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1112,13 +1206,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1767,6 +1874,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1841,6 +1956,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1853,6 +1971,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1864,7 +2007,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/sparse.c b/mm/sparse.c index 33406ea2ecc4..d3fbed26e64e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, -- cgit v1.2.3-58-ga151 From 4a3e5de9c4ec41bb0684b0d4e0c16abc39617d88 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:45 -0700 Subject: acpi,memhotplug: enable MHP_MEMMAP_ON_MEMORY when supported Let the caller check whether it can pass MHP_MEMMAP_ON_MEMORY by checking mhp_supports_memmap_on_memory(). MHP_MEMMAP_ON_MEMORY can only be set in case ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE is enabled, the architecture supports altmap, and the range to be added spans a single memory block. Link: https://lkml.kernel.org/r/20210421102701.25051-6-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/acpi_memhotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index b02fd51e5589..8cc195c4c861 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) acpi_handle handle = mem_device->device->handle; int result, num_enabled = 0; struct acpi_memory_info *info; + mhp_t mhp_flags = MHP_NONE; int node; node = acpi_get_node(handle); @@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); + if (mhp_supports_memmap_on_memory(info->length)) + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(node, info->start_addr, info->length, - MHP_NONE); + mhp_flags); /* * If the memory block has been used by the kernel, add_memory() -- cgit v1.2.3-58-ga151 From e3a9d9fcc3315993de2e9fcd7ea82fab84433815 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:48 -0700 Subject: mm,memory_hotplug: add kernel boot option to enable memmap_on_memory Self stored memmap leads to a sparse memory situation which is unsuitable for workloads that requires large contiguous memory chunks, so make this an opt-in which needs to be explicitly enabled. To control this, let memory_hotplug have its own memory space, as suggested by David, so we can add memory_hotplug.memmap_on_memory parameter. Link: https://lkml.kernel.org/r/20210421102701.25051-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ mm/Makefile | 5 ++++- mm/memory_hotplug.c | 10 +++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1c0a3cf6fcc9..d93fbc1c1917 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2801,6 +2801,23 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. + memory_hotplug.memmap_on_memory + [KNL,X86,ARM] Boolean flag to enable this feature. + Format: {on | off (default)} + When enabled, runtime hotplugged memory will + allocate its internal metadata (struct pages) + from the hotadded memory which will allow to + hotadd a lot of memory without requiring + additional memory to do so. + This feature is disabled by default because it + has some implication on large (e.g. GB) + allocations in some configurations (e.g. small + memory blocks). + The state of the flag can be read in + /sys/module/memory_hotplug/parameters/memmap_on_memory. + Note that even when enabled, there are a few cases where + the feature is not effective. + memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 diff --git a/mm/Makefile b/mm/Makefile index 809033d77710..bf71e295e9f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0b3157836814..f7e46f54a228 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,15 @@ #include "internal.h" #include "shuffle.h" -static bool memmap_on_memory; + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif /* * online_page_callback contains pointer to current page onlining function. -- cgit v1.2.3-58-ga151 From f91ef2223dc425e2e8759a625cffd48dce3503de Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:51 -0700 Subject: x86/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable x86_64 platform to use the MHP_MEMMAP_ON_MEMORY feature. Link: https://lkml.kernel.org/r/20210421102701.25051-8-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0bbc157e4bc7..0045e1b44190 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2432,6 +2432,9 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA -- cgit v1.2.3-58-ga151 From ca6e51d592d20180374366e71bb0972de002d509 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:54 -0700 Subject: arm64/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable arm64 platform to use the MHP_MEMMAP_ON_MEMORY feature. Link: https://lkml.kernel.org/r/20210421102701.25051-9-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e12fad2e2a15..f0b17d758912 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -316,6 +316,9 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config SMP def_bool y -- cgit v1.2.3-58-ga151 From 79cd420248c776005d534416bfc9b04696e6c729 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Tue, 4 May 2021 18:39:57 -0700 Subject: mm/zswap.c: switch from strlcpy to strscpy strlcpy is marked as deprecated in Documentation/process/deprecated.rst, and there is no functional difference when the caller expects truncation (when not checking the return value). strscpy is relatively better as it also avoids scanning the whole source string. Link: https://lkml.kernel.org/r/1614227981-20367-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 578d9f256920..20763267a219 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); - strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { -- cgit v1.2.3-58-ga151 From ecfc2bda7aafc5c87b69a3d7a1fc1016dd21d5a7 Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Tue, 4 May 2021 18:40:00 -0700 Subject: mm/zsmalloc: use BUG_ON instead of if condition followed by BUG. It can be optimized at compile time. Link: https://lkml.kernel.org/r/1616727798-9110-1-git-send-email-zhouchuangao@vivo.com Signed-off-by: zhouchuangao Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 30c358b72025..58697f7a43f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); @@ -2035,8 +2034,7 @@ unpin_objects: head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } } -- cgit v1.2.3-58-ga151 From 28961998f858114e51d2ae862065b858afcfa2b2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:03 -0700 Subject: iov_iter: lift memzero_page() to highmem.h Patch series "btrfs: Convert kmap/memset/kunmap to memzero_user()". Lifting memzero_user(), convert it to kmap_local_page() and then use it in btrfs. This patch (of 3): memzero_page() can replace the kmap/memset/kunmap pattern in other places in the code. While zero_user() has the same interface it is not the same call and its use should be limited and some of those calls may be better converted from zero_user() to memzero_page().[1] But that is not addressed in this series. Lift memzero_page() to highmem. [1] https://lore.kernel.org/lkml/CAHk-=wijdojzo56FzYqE5TOYw2Vws7ik3LEMGj9SPQaJJ+Z73Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210309212137.2610186-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20210309212137.2610186-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Chaitanya Kulkarni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 7 +++++++ lib/iov_iter.c | 8 +------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44170f312ae7..832b49b50c7b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset, kunmap_local(to); } +static inline void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + #endif /* _LINUX_HIGHMEM_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 61228a6c69f8..c701b7a187f2 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -static void memzero_page(struct page *page, size_t offset, size_t len) -{ - char *addr = kmap_atomic(page); - memset(addr + offset, 0, len); - kunmap_atomic(addr); -} - static inline bool allocated(struct pipe_buffer *buf) { return buf->ops == &default_pipe_buf_ops; -- cgit v1.2.3-58-ga151 From d048b9c2a737eb791a5e9506930f72b02efb8b24 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:07 -0700 Subject: btrfs: use memzero_page() instead of open coded kmap pattern There are many places where kmap/memset/kunmap patterns occur. Use the newly lifted memzero_page() to eliminate direct uses of kmap and leverage the new core functions use of kmap_local_page(). The development of this patch was aided by the following coccinelle script: // // SPDX-License-Identifier: GPL-2.0-only // Find kmap/memset/kunmap pattern and replace with memset*page calls // // NOTE: Offsets and other expressions may be more complex than what the script // will automatically generate. Therefore a catchall rule is provided to find // the pattern which then must be evaluated by hand. // // Confidence: Low // Copyright: (C) 2021 Intel Corporation // URL: http://coccinelle.lip6.fr/ // Comments: // Options: // // Then the memset pattern // @ memset_rule1 @ expression page, V, L, Off; identifier ptr; type VP; @@ ( -VP ptr = kmap(page); | -ptr = kmap(page); | -VP ptr = kmap_atomic(page); | -ptr = kmap_atomic(page); ) <+... ( -memset(ptr, 0, L); +memzero_page(page, 0, L); | -memset(ptr + Off, 0, L); +memzero_page(page, Off, L); | -memset(ptr, V, L); +memset_page(page, V, 0, L); | -memset(ptr + Off, V, L); +memset_page(page, V, Off, L); ) ...+> ( -kunmap(page); | -kunmap_atomic(ptr); ) // Remove any pointers left unused @ depends on memset_rule1 @ identifier memset_rule1.ptr; type VP, VP1; @@ -VP ptr; ... when != ptr; ? VP1 ptr; // // Catch all // @ memset_rule2 @ expression page; identifier ptr; expression GenTo, GenSize, GenValue; type VP; @@ ( -VP ptr = kmap(page); | -ptr = kmap(page); | -VP ptr = kmap_atomic(page); | -ptr = kmap_atomic(page); ) <+... ( // // Some call sites have complex expressions within the memset/memcpy // The follow are catch alls which need to be evaluated by hand. // -memset(GenTo, 0, GenSize); +memzero_pageExtra(page, GenTo, GenSize); | -memset(GenTo, GenValue, GenSize); +memset_pageExtra(page, GenValue, GenTo, GenSize); ) ...+> ( -kunmap(page); | -kunmap_atomic(ptr); ) // Remove any pointers left unused @ depends on memset_rule2 @ identifier memset_rule2.ptr; type VP, VP1; @@ -VP ptr; ... when != ptr; ? VP1 ptr; // Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: David Sterba Cc: Alexander Viro Cc: Chaitanya Kulkarni Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/compression.c | 5 +---- fs/btrfs/extent_io.c | 22 ++++------------------ fs/btrfs/inode.c | 33 ++++++++++----------------------- fs/btrfs/reflink.c | 6 +----- fs/btrfs/zlib.c | 5 +---- fs/btrfs/zstd.c | 5 +---- 6 files changed, 18 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 17f93fd28f7e..2bea01d23a5b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -591,16 +591,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, free_extent_map(em); if (page->index == end_index) { - char *userpage; size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; zeros = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); + memzero_page(page, zero_offset, zeros); flush_dcache_page(page); - kunmap_atomic(userpage); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f2d1bb234377..074a78a202b8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } if (page->index == last_byte >> PAGE_SHIFT) { - char *userpage; size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); + memzero_page(page, zero_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); } } begin_page_read(fs_info, page); @@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 disk_bytenr; if (cur >= last_byte) { - char *userpage; struct extent_state *cached = NULL; iosize = PAGE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_SIZE - pg_offset); - kunmap_atomic(userpage); + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); flush_dcache_page(page); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b21d491b3adc..4af336008b12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,17 +646,12 @@ again: if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; - char *kaddr; /* zero the tail end of the last page, we might be * sending it down to disk */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_SIZE - offset); - kunmap_atomic(kaddr); - } + if (offset) + memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } @@ -4833,7 +4828,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4925,15 +4919,13 @@ again: if (offset != blocksize) { if (!len) len = blocksize - offset; - kaddr = kmap(page); if (front) - memset(kaddr + (block_start - page_offset(page)), - 0, offset); + memzero_page(page, (block_start - page_offset(page)), + offset); else - memset(kaddr + (block_start - page_offset(page)) + offset, - 0, len); + memzero_page(page, (block_start - page_offset(page)) + offset, + len); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -6832,11 +6824,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) { - char *map = kmap(page); - memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); - kunmap(page); - } + if (max_size + pg_offset < PAGE_SIZE) + memzero_page(page, pg_offset + max_size, + PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } @@ -8506,7 +8496,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; unsigned long zero_start; loff_t size; vm_fault_t ret; @@ -8620,10 +8609,8 @@ again: zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); + memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f4ec06b53aa0..3928ecc40d7b 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) { - char *map; - - map = kmap(page); - memset(map + datal, 0, block_size - datal); + memzero_page(page, datal, block_size - datal); flush_dcache_page(page); - kunmap(page); } SetPageUptodate(page); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d524acf7b3e5..c3fa7d3fa770 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes_left; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes_left = destlen; @@ -455,9 +454,7 @@ next: * end of the inline extent (destlen) to the end of the page */ if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8e9626d63976..3e26b466476a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = 0; finish: if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } -- cgit v1.2.3-58-ga151 From 9727688dbf7ea9c3e1dc06885c6f3ba281feb1a8 Mon Sep 17 00:00:00 2001 From: songqiang Date: Tue, 4 May 2021 18:40:09 -0700 Subject: mm/highmem.c: fix coding style issue Delete/add some blank lines and some blank spaces Link: https://lkml.kernel.org/r/20210311095015.14277-1-songqiang@uniontech.com Signed-off-by: songqiang Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 6ef8f5e05e7e..e389337e00b4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int __nr_free_highpages (void) +unsigned int __nr_free_highpages(void) { struct zone *zone; unsigned int pages = 0; @@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void) static int pkmap_count[LAST_PKMAP]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); -pte_t * pkmap_page_table; +pte_t *pkmap_page_table; /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr) if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); } @@ -278,9 +279,8 @@ void *kmap_high(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); unlock_kmap(); - return (void*) vaddr; + return (void *) vaddr; } - EXPORT_SYMBOL(kmap_high); #ifdef ARCH_NEEDS_KMAP_HIGH_GET @@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; } unlock_kmap_any(flags); - return (void*) vaddr; + return (void *) vaddr; } #endif @@ -737,7 +737,6 @@ done: spin_unlock_irqrestore(&pas->lock, flags); return ret; } - EXPORT_SYMBOL(page_address); /** -- cgit v1.2.3-58-ga151 From 68d68ff6ebbf69d02511dd48f16b3795671c9b0b Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Tue, 4 May 2021 18:40:12 -0700 Subject: mm/mempool: minor coding style tweaks Various coding style tweaks to various files under mm/ [daizhiyuan@phytium.com.cn: mm/swapfile: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614223624-16055-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/sparse: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227288-19363-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmscan: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227649-19853-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/compaction: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228218-20770-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/oom_kill: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228360-21168-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/shmem: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228504-21491-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/page_alloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228613-21754-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/filemap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228936-22337-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mlock: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613956588-2453-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/frontswap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613962668-15045-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmalloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613963379-15988-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/memory_hotplug: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613971784-24878-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mempolicy: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613972228-25501-1-git-send-email-daizhiyuan@phytium.com.cn Link: https://lkml.kernel.org/r/1614222374-13805-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/filemap.c | 8 ++++---- mm/frontswap.c | 12 ++++++++---- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/mempool.c | 2 +- mm/mlock.c | 4 ++-- mm/oom_kill.c | 2 +- mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- mm/sparse.c | 2 +- mm/swapfile.c | 4 ++-- mm/vmalloc.c | 2 +- mm/vmscan.c | 2 +- 14 files changed, 27 insertions(+), 23 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 598dffbd5c8e..3a6c6b821f80 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2885,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) */ static int kcompactd(void *p) { - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; unsigned int proactive_defer = 0; diff --git a/mm/filemap.c b/mm/filemap.c index ecc5f8a4c488..7fadf211643c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3267,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = { /* This is used for a general mmap of a disk file */ -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; @@ -3292,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } -int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } @@ -3724,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write); ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written = 0; ssize_t err; diff --git a/mm/frontswap.c b/mm/frontswap.c index 2183a56c7874..130e301c5ac0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -60,16 +60,20 @@ static u64 frontswap_succ_stores; static u64 frontswap_failed_stores; static u64 frontswap_invalidates; -static inline void inc_frontswap_loads(void) { +static inline void inc_frontswap_loads(void) +{ data_race(frontswap_loads++); } -static inline void inc_frontswap_succ_stores(void) { +static inline void inc_frontswap_succ_stores(void) +{ data_race(frontswap_succ_stores++); } -static inline void inc_frontswap_failed_stores(void) { +static inline void inc_frontswap_failed_stores(void) +{ data_race(frontswap_failed_stores++); } -static inline void inc_frontswap_invalidates(void) { +static inline void inc_frontswap_invalidates(void) +{ data_race(frontswap_invalidates++); } #else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f7e46f54a228..70620d0dd923 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -834,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, +struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c0343c742bed..3ebe2cfc64af 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } @@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { - int s,d; + int s, d; int source = NUMA_NO_NODE; int dest = 0; diff --git a/mm/mempool.c b/mm/mempool.c index fe19d290a301..a258cf4de575 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..df590fda5688 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; - struct vm_area_struct * vma, * prev; + struct vm_area_struct *vma, *prev; int error; VM_BUG_ON(offset_in_page(start)); @@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - struct vm_area_struct * vma, * prev = NULL; + struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fa1cf18bac97..3df2ac6b8686 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) if (oom_group) { mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, - (void*)message); + (void *)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b012805a11ad..bcdc0c6f21f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8808,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; - ret =0; + ret = 0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES diff --git a/mm/shmem.c b/mm/shmem.c index 162d8f8993bb..a08cedefbfaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3508,7 +3508,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data) } } if (*this_char) { - char *value = strchr(this_char,'='); + char *value = strchr(this_char, '='); size_t len = 0; int err; diff --git a/mm/sparse.c b/mm/sparse.c index d3fbed26e64e..b2ada9dc00cb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (unlikely(!mem_section)) { unsigned long size, align; - size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); if (!mem_section) diff --git a/mm/swapfile.c b/mm/swapfile.c index 084a5b9a18e5..149e77454e3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v) unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } @@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d33894d7b27a..9c539f0730a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3083,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node); * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 44c49acf10c4..5199b9696bab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4059,7 +4059,7 @@ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); -- cgit v1.2.3-58-ga151 From 0c4ff27a0e541bcee167612fc9065623d75314a3 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Tue, 4 May 2021 18:40:15 -0700 Subject: mm/process_vm_access.c: remove duplicate include 'linux/compat.h' included in 'process_vm_access.c' is duplicated. Link: https://lkml.kernel.org/r/20210306132122.220431-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index f5fee9cf90f8..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-58-ga151 From 94868a1e127bbe0e03a4467f27196cd668cbc344 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:18 -0700 Subject: kfence: zero guard page after out-of-bounds access After an out-of-bounds accesses, zero the guard page before re-protecting in kfence_guarded_free(). On one hand this helps make the failure mode of subsequent out-of-bounds accesses more deterministic, but could also prevent certain information leaks. Link: https://lkml.kernel.org/r/20210312121653.348518-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d53c91f881a4..768dbd58170d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -372,6 +372,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { + memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } -- cgit v1.2.3-58-ga151 From 407f1d8c1b5f3ec66a6a3eb835d3b81c76440f4e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:21 -0700 Subject: kfence: await for allocation using wait_event Patch series "kfence: optimize timer scheduling", v2. We have observed that mostly-idle systems with KFENCE enabled wake up otherwise idle CPUs, preventing such to enter a lower power state. Debugging revealed that KFENCE spends too much active time in toggle_allocation_gate(). While the first version of KFENCE was using all the right bits to be scheduling optimal, and thus power efficient, by simply using wait_event() + wake_up(), that code was unfortunately removed. As KFENCE was exposed to various different configs and tests, the scheduling optimal code slowly disappeared. First because of hung task warnings, and finally because of deadlocks when an allocation is made by timer code with debug objects enabled. Clearly, the "fixes" were not too friendly for devices that want to be power efficient. Therefore, let's try a little harder to fix the hung task and deadlock problems that we have with wait_event() + wake_up(), while remaining as scheduling friendly and power efficient as possible. Crucially, we need to defer the wake_up() to an irq_work, avoiding any potential for deadlock. The result with this series is that on the devices where we observed a power regression, power usage returns back to baseline levels. This patch (of 3): On mostly-idle systems, we have observed that toggle_allocation_gate() is a cause of frequent wake-ups, preventing an otherwise idle CPU to go into a lower power state. A late change in KFENCE's development, due to a potential deadlock [1], required changing the scheduling-friendly wait_event_timeout() and wake_up() to an open-coded wait-loop using schedule_timeout(). [1] https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com To avoid unnecessary wake-ups, switch to using wait_event_timeout(). Unfortunately, we still cannot use a version with direct wake_up() in __kfence_alloc() due to the same potential for deadlock as in [1]. Instead, add a level of indirection via an irq_work that is scheduled if we determine that the kfence_timer requires a wake_up(). Link: https://lkml.kernel.org/r/20210421105132.3965998-1-elver@google.com Link: https://lkml.kernel.org/r/20210421105132.3965998-2-elver@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Cc: Mark Rutland Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kfence | 1 + mm/kfence/core.c | 45 +++++++++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 78f50ccb3b45..e641add33947 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -7,6 +7,7 @@ menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) select STACKTRACE + select IRQ_WORK help KFENCE is a low-overhead sampling-based detector of heap out-of-bounds access, use-after-free, and invalid-free errors. KFENCE is designed diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 768dbd58170d..235d726f88bc 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -587,6 +588,17 @@ late_initcall(kfence_debugfs_init); /* === Allocation Gate Timer ================================================ */ +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + +static void wake_up_kfence_timer(struct irq_work *work) +{ + wake_up(&allocation_wait); +} +static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); +#endif + /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a @@ -604,25 +616,13 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - /* Enable static key, and await allocation to happen. */ atomic_set(&kfence_allocation_gate, 0); #ifdef CONFIG_KFENCE_STATIC_KEYS + /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - /* - * Await an allocation. Timeout after 1 second, in case the kernel stops - * doing allocations, to avoid stalling this worker task for too long. - */ - { - unsigned long end_wait = jiffies + HZ; - - do { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&kfence_allocation_gate) != 0) - break; - schedule_timeout(1); - } while (time_before(jiffies, end_wait)); - __set_current_state(TASK_RUNNING); - } + + wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), HZ); + /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif @@ -729,6 +729,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) */ if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* + * waitqueue_active() is fully ordered after the update of + * kfence_allocation_gate per atomic_inc_return(). + */ + if (waitqueue_active(&allocation_wait)) { + /* + * Calling wake_up() here may deadlock when allocations happen + * from within timer code. Use an irq_work to defer it. + */ + irq_work_queue(&wake_up_kfence_timer_work); + } +#endif if (!READ_ONCE(kfence_enabled)) return NULL; -- cgit v1.2.3-58-ga151 From 37c9284f6932b915043717703d6496dfd59c85f5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:24 -0700 Subject: kfence: maximize allocation wait timeout duration The allocation wait timeout was initially added because of warnings due to CONFIG_DETECT_HUNG_TASK=y [1]. While the 1 sec timeout is sufficient to resolve the warnings (given the hung task timeout must be 1 sec or larger) it may cause unnecessary wake-ups if the system is idle: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com Fix it by computing the timeout duration in terms of the current sysctl_hung_task_timeout_secs value. Link: https://lkml.kernel.org/r/20210421105132.3965998-3-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Hillf Danton Cc: Jann Horn Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 235d726f88bc..9742649f3f88 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -621,7 +622,16 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), HZ); + if (sysctl_hung_task_timeout_secs) { + /* + * During low activity with no allocations we might wait a + * while; let's avoid the hung task warning. + */ + wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); + } else { + wait_event(allocation_wait, atomic_read(&kfence_allocation_gate)); + } /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); -- cgit v1.2.3-58-ga151 From 36f0b35d0894576fe63268ede80d9f5aa140be09 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:27 -0700 Subject: kfence: use power-efficient work queue to run delayed work Use the power-efficient work queue, to avoid the pathological case where we keep pinning ourselves on the same possibly idle CPU on systems that want to be power-efficient (https://lwn.net/Articles/731052/). Link: https://lkml.kernel.org/r/20210421105132.3965998-4-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Hillf Danton Cc: Jann Horn Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 9742649f3f88..e18fbbd5d9b4 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -636,7 +636,8 @@ static void toggle_allocation_gate(struct work_struct *work) /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif - schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, + msecs_to_jiffies(kfence_sample_interval)); } static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); @@ -665,7 +666,7 @@ void __init kfence_init(void) } WRITE_ONCE(kfence_enabled, true); - schedule_delayed_work(&kfence_timer, 0); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); -- cgit v1.2.3-58-ga151