summaryrefslogtreecommitdiff
path: root/mm/rmap.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-19 09:21:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-19 09:21:03 -0700
commit61307b7be41a1f1039d1d1368810a1d92cb97b44 (patch)
tree639e233e177f8618cd5f86daeb7efc6b095890f0 /mm/rmap.c
parent0450d2083be6bdcd18c9535ac50c55266499b2df (diff)
parent76edc534cc289308130272a2ac28694fc9b72a03 (diff)
Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton: "The usual shower of singleton fixes and minor series all over MM, documented (hopefully adequately) in the respective changelogs. Notable series include: - Lucas Stach has provided some page-mapping cleanup/consolidation/ maintainability work in the series "mm/treewide: Remove pXd_huge() API". - In the series "Allow migrate on protnone reference with MPOL_PREFERRED_MANY policy", Donet Tom has optimized mempolicy's MPOL_PREFERRED_MANY mode, yielding almost doubled performance in one test. - In their series "Memory allocation profiling" Kent Overstreet and Suren Baghdasaryan have contributed a means of determining (via /proc/allocinfo) whereabouts in the kernel memory is being allocated: number of calls and amount of memory. - Matthew Wilcox has provided the series "Various significant MM patches" which does a number of rather unrelated things, but in largely similar code sites. - In his series "mm: page_alloc: freelist migratetype hygiene" Johannes Weiner has fixed the page allocator's handling of migratetype requests, with resulting improvements in compaction efficiency. - In the series "make the hugetlb migration strategy consistent" Baolin Wang has fixed a hugetlb migration issue, which should improve hugetlb allocation reliability. - Liu Shixin has hit an I/O meltdown caused by readahead in a memory-tight memcg. Addressed in the series "Fix I/O high when memory almost met memcg limit". - In the series "mm/filemap: optimize folio adding and splitting" Kairui Song has optimized pagecache insertion, yielding ~10% performance improvement in one test. - Baoquan He has cleaned up and consolidated the early zone initialization code in the series "mm/mm_init.c: refactor free_area_init_core()". - Baoquan has also redone some MM initializatio code in the series "mm/init: minor clean up and improvement". - MM helper cleanups from Christoph Hellwig in his series "remove follow_pfn". - More cleanups from Matthew Wilcox in the series "Various page->flags cleanups". - Vlastimil Babka has contributed maintainability improvements in the series "memcg_kmem hooks refactoring". - More folio conversions and cleanups in Matthew Wilcox's series: "Convert huge_zero_page to huge_zero_folio" "khugepaged folio conversions" "Remove page_idle and page_young wrappers" "Use folio APIs in procfs" "Clean up __folio_put()" "Some cleanups for memory-failure" "Remove page_mapping()" "More folio compat code removal" - David Hildenbrand chipped in with "fs/proc/task_mmu: convert hugetlb functions to work on folis". - Code consolidation and cleanup work related to GUP's handling of hugetlbs in Peter Xu's series "mm/gup: Unify hugetlb, part 2". - Rick Edgecombe has developed some fixes to stack guard gaps in the series "Cover a guard gap corner case". - Jinjiang Tu has fixed KSM's behaviour after a fork+exec in the series "mm/ksm: fix ksm exec support for prctl". - Baolin Wang has implemented NUMA balancing for multi-size THPs. This is a simple first-cut implementation for now. The series is "support multi-size THP numa balancing". - Cleanups to vma handling helper functions from Matthew Wilcox in the series "Unify vma_address and vma_pgoff_address". - Some selftests maintenance work from Dev Jain in the series "selftests/mm: mremap_test: Optimizations and style fixes". - Improvements to the swapping of multi-size THPs from Ryan Roberts in the series "Swap-out mTHP without splitting". - Kefeng Wang has significantly optimized the handling of arm64's permission page faults in the series "arch/mm/fault: accelerate pagefault when badaccess" "mm: remove arch's private VM_FAULT_BADMAP/BADACCESS" - GUP cleanups from David Hildenbrand in "mm/gup: consistently call it GUP-fast". - hugetlb fault code cleanups from Vishal Moola in "Hugetlb fault path to use struct vm_fault". - selftests build fixes from John Hubbard in the series "Fix selftests/mm build without requiring "make headers"". - Memory tiering fixes/improvements from Ho-Ren (Jack) Chuang in the series "Improved Memory Tier Creation for CPUless NUMA Nodes". Fixes the initialization code so that migration between different memory types works as intended. - David Hildenbrand has improved follow_pte() and fixed an errant driver in the series "mm: follow_pte() improvements and acrn follow_pte() fixes". - David also did some cleanup work on large folio mapcounts in his series "mm: mapcount for large folios + page_mapcount() cleanups". - Folio conversions in KSM in Alex Shi's series "transfer page to folio in KSM". - Barry Song has added some sysfs stats for monitoring multi-size THP's in the series "mm: add per-order mTHP alloc and swpout counters". - Some zswap cleanups from Yosry Ahmed in the series "zswap same-filled and limit checking cleanups". - Matthew Wilcox has been looking at buffer_head code and found the documentation to be lacking. The series is "Improve buffer head documentation". - Multi-size THPs get more work, this time from Lance Yang. His series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free" optimizes the freeing of these things. - Kemeng Shi has added more userspace-visible writeback instrumentation in the series "Improve visibility of writeback". - Kemeng Shi then sent some maintenance work on top in the series "Fix and cleanups to page-writeback". - Matthew Wilcox reduces mmap_lock traffic in the anon vma code in the series "Improve anon_vma scalability for anon VMAs". Intel's test bot reported an improbable 3x improvement in one test. - SeongJae Park adds some DAMON feature work in the series "mm/damon: add a DAMOS filter type for page granularity access recheck" "selftests/damon: add DAMOS quota goal test" - Also some maintenance work in the series "mm/damon/paddr: simplify page level access re-check for pageout" "mm/damon: misc fixes and improvements" - David Hildenbrand has disabled some known-to-fail selftests ni the series "selftests: mm: cow: flag vmsplice() hugetlb tests as XFAIL". - memcg metadata storage optimizations from Shakeel Butt in "memcg: reduce memory consumption by memcg stats". - DAX fixes and maintenance work from Vishal Verma in the series "dax/bus.c: Fixups for dax-bus locking"" * tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (426 commits) memcg, oom: cleanup unused memcg_oom_gfp_mask and memcg_oom_order selftests/mm: hugetlb_madv_vs_map: avoid test skipping by querying hugepage size at runtime mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_wp mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_fault selftests: cgroup: add tests to verify the zswap writeback path mm: memcg: make alloc_mem_cgroup_per_node_info() return bool mm/damon/core: fix return value from damos_wmark_metric_value mm: do not update memcg stats for NR_{FILE/SHMEM}_PMDMAPPED selftests: cgroup: remove redundant enabling of memory controller Docs/mm/damon/maintainer-profile: allow posting patches based on damon/next tree Docs/mm/damon/maintainer-profile: change the maintainer's timezone from PST to PT Docs/mm/damon/design: use a list for supported filters Docs/admin-guide/mm/damon/usage: fix wrong schemes effective quota update command Docs/admin-guide/mm/damon/usage: fix wrong example of DAMOS filter matching sysfs file selftests/damon: classify tests for functionalities and regressions selftests/damon/_damon_sysfs: use 'is' instead of '==' for 'None' selftests/damon/_damon_sysfs: find sysfs mount point from /proc/mounts selftests/damon/_damon_sysfs: check errors from nr_schemes file reads mm/damon/core: initialize ->esz_bp from damos_quota_init_priv() selftests/damon: add a test for DAMOS quota goal ...
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c111
1 files changed, 60 insertions, 51 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 3746a5531018..e8fc5ecb59b2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
- * page->flags PG_locked (lock_page)
+ * folio_lock
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* vma_start_write
* mapping->i_mmap_rwsem
@@ -50,7 +50,7 @@
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* vma_lock (hugetlb specific lock for pmd_sharing)
* mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
- * page->flags PG_locked (lock_page)
+ * folio_lock
*/
#include <linux/mm.h>
@@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* for the new allocation. At the same time, we do not want
* to do any locking for the common case of already having
* an anon_vma.
- *
- * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
+ mmap_assert_locked(mm);
might_sleep();
avc = anon_vma_chain_alloc(GFP_KERNEL);
@@ -775,6 +774,8 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
struct folio *folio = page_folio(page);
+ pgoff_t pgoff;
+
if (folio_test_anon(folio)) {
struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
@@ -790,7 +791,9 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
}
- return vma_address(page, vma);
+ /* The !page__anon_vma above handles KSM folios */
+ pgoff = folio->index + folio_page_idx(folio, page);
+ return vma_address(vma, pgoff, 1);
}
/*
@@ -961,7 +964,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags)
{
- int we_locked = 0;
+ bool we_locked = false;
struct folio_referenced_arg pra = {
.mapcount = folio_mapcount(folio),
.memcg = memcg,
@@ -1128,56 +1131,38 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
if (invalid_mkclean_vma(vma, NULL))
return 0;
- pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ pvmw.address = vma_address(vma, pgoff, nr_pages);
VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
return page_vma_mkclean_one(&pvmw);
}
-int folio_total_mapcount(struct folio *folio)
-{
- int mapcount = folio_entire_mapcount(folio);
- int nr_pages;
- int i;
-
- /* In the common case, avoid the loop when no pages mapped by PTE */
- if (folio_nr_pages_mapped(folio) == 0)
- return mapcount;
- /*
- * Add all the PTE mappings of those pages mapped by PTE.
- * Limit the loop to folio_nr_pages_mapped()?
- * Perhaps: given all the raciness, that may be a good or a bad idea.
- */
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
-
- /* But each of those _mapcounts was based on -1 */
- mapcount += nr_pages;
- return mapcount;
-}
-
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, enum rmap_level level,
int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
+ const int orig_nr_pages = nr_pages;
int first, nr = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
+ if (!folio_test_large(folio)) {
+ nr = atomic_inc_and_test(&page->_mapcount);
+ break;
+ }
+
do {
first = atomic_inc_and_test(&page->_mapcount);
- if (first && folio_test_large(folio)) {
+ if (first) {
first = atomic_inc_return_relaxed(mapped);
- first = (first < ENTIRELY_MAPPED);
+ if (first < ENTIRELY_MAPPED)
+ nr++;
}
-
- if (first)
- nr++;
} while (page++, --nr_pages > 0);
+ atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
@@ -1194,6 +1179,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
+ atomic_inc(&folio->_large_mapcount);
break;
}
return nr;
@@ -1429,10 +1415,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
SetPageAnonExclusive(page);
}
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_large_mapcount, nr - 1);
atomic_set(&folio->_nr_pages_mapped, nr);
} else {
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_large_mapcount, 0);
atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
SetPageAnonExclusive(&folio->page);
__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
@@ -1445,13 +1435,14 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
{
+ pg_data_t *pgdat = folio_pgdat(folio);
int nr, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
if (nr_pmdmapped)
- __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
+ __mod_node_page_state(pgdat, folio_test_swapbacked(folio) ?
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
@@ -1503,25 +1494,34 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
enum rmap_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
+ pg_data_t *pgdat = folio_pgdat(folio);
int last, nr = 0, nr_pmdmapped = 0;
+ bool partially_mapped = false;
enum node_stat_item idx;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
+ if (!folio_test_large(folio)) {
+ nr = atomic_add_negative(-1, &page->_mapcount);
+ break;
+ }
+
+ atomic_sub(nr_pages, &folio->_large_mapcount);
do {
last = atomic_add_negative(-1, &page->_mapcount);
- if (last && folio_test_large(folio)) {
+ if (last) {
last = atomic_dec_return_relaxed(mapped);
- last = (last < ENTIRELY_MAPPED);
+ if (last < ENTIRELY_MAPPED)
+ nr++;
}
-
- if (last)
- nr++;
} while (page++, --nr_pages > 0);
+
+ partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
+ atomic_dec(&folio->_large_mapcount);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
@@ -1536,17 +1536,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = 0;
}
}
+
+ partially_mapped = nr < nr_pmdmapped;
break;
}
if (nr_pmdmapped) {
+ /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */
if (folio_test_anon(folio))
- idx = NR_ANON_THPS;
- else if (folio_test_swapbacked(folio))
- idx = NR_SHMEM_PMDMAPPED;
+ __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped);
else
- idx = NR_FILE_PMDMAPPED;
- __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
+ __mod_node_page_state(pgdat,
+ folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED,
+ -nr_pmdmapped);
}
if (nr) {
idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
@@ -1556,10 +1559,12 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
* Queue anon large folio for deferred split if at least one
* page of the folio is unmapped and at least one page
* is still mapped.
+ *
+ * Check partially_mapped first to ensure it is a large folio.
*/
- if (folio_test_large(folio) && folio_test_anon(folio))
- if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped)
- deferred_split_folio(folio);
+ if (folio_test_anon(folio) && partially_mapped &&
+ list_empty(&folio->_deferred_list))
+ deferred_split_folio(folio);
}
/*
@@ -2588,7 +2593,8 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start,
+ folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2649,7 +2655,8 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start,
+ folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2702,6 +2709,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
if (flags & RMAP_EXCLUSIVE)
SetPageAnonExclusive(&folio->page);
VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
@@ -2716,6 +2724,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio,
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_large_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
__folio_set_anon(folio, vma, address, true);
SetPageAnonExclusive(&folio->page);