From 7b086755fb8cdbb6b3e45a1bbddc00e7f9b1dc03 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 11 Sep 2023 14:11:08 -0400
Subject: mm: page_alloc: fix CMA and HIGHATOMIC landing on the wrong buddy list

Commit 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock")
bypasses the pcplist on lock contention and returns the page directly to
the buddy list of the page's migratetype.

For pages that don't have their own pcplist, such as CMA and HIGHATOMIC,
the migratetype is temporarily updated such that the page can hitch a
ride on the MOVABLE pcplist. Their true type is later reassessed when
flushing in free_pcppages_bulk(). However, when lock contention is
detected after the type was already overridden, the bypass will then put
the page on the wrong buddy list.

Once on the MOVABLE buddy list, the page becomes eligible for fallbacks
and even stealing. In the case of HIGHATOMIC, otherwise ineligible
allocations can dip into the highatomic reserves. In the case of CMA,
the page can be lost from the CMA region permanently.

Use a separate pcpmigratetype variable for the pcplist override. Use the
original migratetype when going directly to the buddy. This fixes the
bug and should make the intentions more obvious in the code.

Originally sent here to address the HIGHATOMIC case:
https://lore.kernel.org/lkml/20230821183733.106619-4-hannes@cmpxchg.org/

Changelog updated in response to the CMA-specific bug report.

[mgorman@techsingularity.net: updated changelog]
Link: https://lkml.kernel.org/r/20230911181108.GA104295@cmpxchg.org
Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock")
Signed-off-by: Johannes Weiner
Reported-by: Joe Liu
Reviewed-by: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0c5be12f9336..95546f376302 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2400,7 +2400,7 @@ void free_unref_page(struct page *page, unsigned int order)
 	struct per_cpu_pages *pcp;
 	struct zone *zone;
 	unsigned long pfn = page_to_pfn(page);
-	int migratetype;
+	int migratetype, pcpmigratetype;
 
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
@@ -2408,24 +2408,24 @@ void free_unref_page(struct page *page, unsigned int order)
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
 	 * Place ISOLATE pages on the isolated list because they are being
-	 * offlined but treat HIGHATOMIC as movable pages so we can get those
-	 * areas back if necessary. Otherwise, we may have to free
+	 * offlined but treat HIGHATOMIC and CMA as movable pages so we can
+	 * get those areas back if necessary. Otherwise, we may have to free
 	 * excessively into the page allocator
 	 */
-	migratetype = get_pcppage_migratetype(page);
+	migratetype = pcpmigratetype = get_pcppage_migratetype(page);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
 			free_one_page(page_zone(page), page, pfn, order,
 				      migratetype, FPI_NONE);
 			return;
 		}
-		migratetype = MIGRATE_MOVABLE;
+		pcpmigratetype = MIGRATE_MOVABLE;
 	}
 
 	zone = page_zone(page);
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		free_unref_page_commit(zone, pcp, page, pcpmigratetype, order);
 		pcp_spin_unlock(pcp);
 	} else {
 		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
-- cgit v1.2.3-58-ga151


From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 14 Sep 2023 11:21:39 -0400
Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement

Breno and Josef report a deadlock scenario from cgroup reclaim
re-entering the filesystem:

[ 361.546690] ======================================================
[ 361.559210] WARNING: possible circular locking dependency detected
[ 361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S E
[ 361.589704] ------------------------------------------------------
[ 361.602277] find/9315 is trying to acquire lock:
[ 361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0
[ 361.631437]
[ 361.631437] but task is already holding lock:
[ 361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40
[ 362.904457]  mutex_lock_nested+0x1c/0x30
[ 362.912414]  __btrfs_release_delayed_node+0x68/0x4f0
[ 362.922460]  btrfs_evict_inode+0x301/0x770
[ 362.982726]  evict+0x17c/0x380
[ 362.988944]  prune_icache_sb+0x100/0x1d0
[ 363.005559]  super_cache_scan+0x1f8/0x260
[ 363.013695]  do_shrink_slab+0x2a2/0x540
[ 363.021489]  shrink_slab_memcg+0x237/0x3d0
[ 363.050606]  shrink_slab+0xa7/0x240
[ 363.083382]  shrink_node_memcgs+0x262/0x3b0
[ 363.091870]  shrink_node+0x1a4/0x720
[ 363.099150]  shrink_zones+0x1f6/0x5d0
[ 363.148798]  do_try_to_free_pages+0x19b/0x5e0
[ 363.157633]  try_to_free_mem_cgroup_pages+0x266/0x370
[ 363.190575]  reclaim_high+0x16f/0x1f0
[ 363.208409]  mem_cgroup_handle_over_high+0x10b/0x270
[ 363.246678]  try_charge_memcg+0xaf2/0xc70
[ 363.304151]  charge_memcg+0xf0/0x350
[ 363.320070]  __mem_cgroup_charge+0x28/0x40
[ 363.328371]  __filemap_add_folio+0x870/0xd50
[ 363.371303]  filemap_add_folio+0xdd/0x310
[ 363.399696]  __filemap_get_folio+0x2fc/0x7d0
[ 363.419086]  pagecache_get_page+0xe/0x30
[ 363.427048]  alloc_extent_buffer+0x1cd/0x6a0
[ 363.435704]  read_tree_block+0x43/0xc0
[ 363.443316]  read_block_for_search+0x361/0x510
[ 363.466690]  btrfs_search_slot+0xc8c/0x1520

This is caused by mem_cgroup_handle_over_high() not respecting the
gfp_mask of the allocation context. We used to only call this function
on resume to userspace, where no locks were held. But c9afe31ec443
("memcg: synchronously enforce memory.high for large overcharges") added
a call from the allocation context without considering the gfp mask.
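The fix, in a minimal sketch (illustrative only: reclaim() stands in for
the reclaim_high()/try_to_free_mem_cgroup_pages() call chain, and the
helper names are not the kernel API):

	/* Before: the over-high path always reclaimed with GFP_KERNEL. */
	static void over_high_old(struct mem_cgroup *memcg, unsigned long nr)
	{
		/* fs shrinkers may run and re-enter the filesystem */
		reclaim(memcg, nr, GFP_KERNEL);
	}

	/* After: the caller's constraints are plumbed down. */
	static void over_high_new(struct mem_cgroup *memcg, unsigned long nr,
				  gfp_t gfp_mask)
	{
		/* without __GFP_FS, reclaim skips the fs shrinkers */
		reclaim(memcg, nr, gfp_mask);
	}

The userspace-resume caller keeps passing GFP_KERNEL, since no locks are
held there; only the in-allocation call site inherits the caller's mask.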
Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org
Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges")
Signed-off-by: Johannes Weiner
Reported-by: Breno Leitao
Reported-by: Josef Bacik
Acked-by: Shakeel Butt
Acked-by: Michal Hocko
Cc: Roman Gushchin
Cc: Muchun Song
Cc: [5.17+]
Signed-off-by: Andrew Morton
---
 include/linux/memcontrol.h       | 4 ++--
 include/linux/resume_user_mode.h | 2 +-
 mm/memcontrol.c                  | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab94ad4597d0..e4e24da16d2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 	return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
 }
 
-void mem_cgroup_handle_over_high(void);
+void mem_cgroup_handle_over_high(gfp_t gfp_mask);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
@@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void)
 	rcu_read_unlock();
 }
 
-static inline void mem_cgroup_handle_over_high(void)
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
 {
 }
 
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index 285189454449..f8f3e958e9cf 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	}
 #endif
 
-	mem_cgroup_handle_over_high();
+	mem_cgroup_handle_over_high(GFP_KERNEL);
 	blkcg_maybe_throttle_current();
 	rseq_handle_notify_resume(NULL, regs);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a4d3282493b6..d13dde2f8b56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2555,7 +2555,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
-void mem_cgroup_handle_over_high(void)
+void mem_cgroup_handle_over_high(gfp_t gfp_mask)
 {
 	unsigned long penalty_jiffies;
 	unsigned long pflags;
@@ -2583,7 +2583,7 @@ retry_reclaim:
 	 */
 	nr_reclaimed = reclaim_high(memcg,
 				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
-				    GFP_KERNEL);
+				    gfp_mask);
 
 	/*
 	 * memory.high is breached and reclaim is unable to keep up. Throttle
@@ -2819,7 +2819,7 @@ done_restock:
 	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
 	    !(current->flags & PF_MEMALLOC) &&
 	    gfpflags_allow_blocking(gfp_mask)) {
-		mem_cgroup_handle_over_high();
+		mem_cgroup_handle_over_high(gfp_mask);
 	}
 	return 0;
 }
-- cgit v1.2.3-58-ga151


From c8be03806738c86521dbf1e0503bc90855fb99a3 Mon Sep 17 00:00:00 2001
From: Yin Fengwei
Date: Thu, 14 Sep 2023 21:47:41 +0800
Subject: filemap: add filemap_map_order0_folio() to handle order0 folio

The kernel test robot reported regressions for several benchmarks [1].
The regressions are related to commit de74976eb65151a2f568e477fc2e0032df5b22b4
("filemap: add filemap_map_folio_range()").

It turned out that filemap_map_folio_range() causes these regressions
when it handles order-0 folios. Add filemap_map_order0_folio() to handle
order-0 folios separately.
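One detail worth noting in the diff below: the mmap_miss readahead
bookkeeping moves out of the mapping helpers. Both helpers now only
increment a caller-provided counter of the pages they map, and
filemap_map_pages() folds the total into file->f_ra.mmap_miss once at
the end, clamping at zero. In outline (simplified from the diff, not the
verbatim code):

	unsigned int mmap_miss = 0, mmap_miss_saved;

	/* ... the helpers do (*mmap_miss)++ for each page they map ... */

	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
	if (mmap_miss >= mmap_miss_saved)
		WRITE_ONCE(file->f_ra.mmap_miss, 0);
	else
		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

This touches file->f_ra.mmap_miss once per fault-around instead of once
per mapped folio.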
The benefits come from two perspectives:
  - the code size is smaller (around 126 bytes)
  - no loop

Testing showed that the regressions reported by 0day [1] are all fixed:

commit 9f1f5b60e76d44fa: parent commit of de74976eb65151a2
commit fbdf9263a3d7fdbd: latest mm-unstable commit
commit 7fbfe2003f84686d: this fixing patch

9f1f5b60e76d44fa fbdf9263a3d7fdbd 7fbfe2003f84686d
---------------- --------------------------- ---------------------------
   3843810 -21.4%    3020268  +4.6%    4018708  stress-ng.bad-altstack.ops
     64061 -21.4%      50336  +4.6%      66977  stress-ng.bad-altstack.ops_per_sec
   1709026 -14.4%    1462102  +2.4%    1750757  stress-ng.fork.ops
     28483 -14.4%      24368  +2.4%      29179  stress-ng.fork.ops_per_sec
   3685088 -53.6%    1710976  +0.5%    3702454  stress-ng.zombie.ops
     56732 -65.3%      19667  +0.7%      57107  stress-ng.zombie.ops_per_sec
     61874 -12.1%      54416  +0.4%      62136  vm-scalability.median
  13527663 -11.7%   11942117  -0.1%   13513946  vm-scalability.throughput
 4.066e+09 -11.7%   3.59e+09  -0.1%  4.061e+09  vm-scalability.workload

[1]: https://lore.kernel.org/oe-lkp/72e017b9-deb6-44fa-91d6-716ee2c39cbc@intel.com/T/#m7d2bba30f75a9cee8eab07e5809abd9b3b206c84

Link: https://lkml.kernel.org/r/20230914134741.1937654-1-fengwei.yin@intel.com
Fixes: de74976eb65151a2f568e477fc2e0032df5b22b4 ("filemap: add filemap_map_folio_range()")
Signed-off-by: Yin Fengwei
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202309111556.b2aa3d7a-oliver.sang@intel.com
Cc: Feng Tang
Cc: Huang Ying
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 69 ++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 582f5317ff71..4ea4387053e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3475,13 +3475,11 @@ skip:
  */
 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			struct folio *folio, unsigned long start,
-			unsigned long addr, unsigned int nr_pages)
+			unsigned long addr, unsigned int nr_pages,
+			unsigned int *mmap_miss)
 {
 	vm_fault_t ret = 0;
-	struct vm_area_struct *vma = vmf->vma;
-	struct file *file = vma->vm_file;
 	struct page *page = folio_page(folio, start);
-	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
 	unsigned int count = 0;
 	pte_t *old_ptep = vmf->pte;
 
@@ -3489,8 +3487,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 		if (PageHWPoison(page + count))
 			goto skip;
 
-		if (mmap_miss > 0)
-			mmap_miss--;
+		(*mmap_miss)++;
 
 		/*
 		 * NOTE: If there're PTE markers, we'll leave them to be
@@ -3525,7 +3522,35 @@ skip:
 	}
 
 	vmf->pte = old_ptep;
-	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+
+	return ret;
+}
+
+static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
+		struct folio *folio, unsigned long addr,
+		unsigned int *mmap_miss)
+{
+	vm_fault_t ret = 0;
+	struct page *page = &folio->page;
+
+	if (PageHWPoison(page))
+		return ret;
+
+	(*mmap_miss)++;
+
+	/*
+	 * NOTE: If there're PTE markers, we'll leave them to be
+	 * handled in the specific fault path, and it'll prohibit
+	 * the fault-around logic.
+	 */
+	if (!pte_none(ptep_get(vmf->pte)))
+		return ret;
+
+	if (vmf->address == addr)
+		ret = VM_FAULT_NOPAGE;
+
+	set_pte_range(vmf, folio, page, 1, addr);
+	folio_ref_inc(folio);
 
 	return ret;
 }
 
@@ -3541,7 +3566,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct folio *folio;
 	vm_fault_t ret = 0;
-	int nr_pages = 0;
+	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved;
 
 	rcu_read_lock();
 	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
@@ -3569,25 +3594,27 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		end = folio->index + folio_nr_pages(folio) - 1;
 		nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
 
-		/*
-		 * NOTE: If there're PTE markers, we'll leave them to be
-		 * handled in the specific fault path, and it'll prohibit the
-		 * fault-around logic.
-		 */
-		if (!pte_none(ptep_get(vmf->pte)))
-			goto unlock;
-
-		ret |= filemap_map_folio_range(vmf, folio,
-			xas.xa_index - folio->index, addr, nr_pages);
+		if (!folio_test_large(folio))
+			ret |= filemap_map_order0_folio(vmf,
+					folio, addr, &mmap_miss);
+		else
+			ret |= filemap_map_folio_range(vmf, folio,
+					xas.xa_index - folio->index, addr,
+					nr_pages, &mmap_miss);
 
-unlock:
 		folio_unlock(folio);
 		folio_put(folio);
-		folio = next_uptodate_folio(&xas, mapping, end_pgoff);
-	} while (folio);
+	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
 	rcu_read_unlock();
+
+	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
+	if (mmap_miss >= mmap_miss_saved)
+		WRITE_ONCE(file->f_ra.mmap_miss, 0);
+	else
+		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
+
 	return ret;
 }
 EXPORT_SYMBOL(filemap_map_pages);
-- cgit v1.2.3-58-ga151