diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-28 10:28:11 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-28 10:28:11 -0700 |
commit | 6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch) | |
tree | 2c425707f78642625dbe2c824c7fded2021e3dc7 /include/linux/pgtable.h | |
parent | 6aeadf7896bff4ca230702daba8788455e6b866e (diff) | |
parent | acc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff) |
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:
- Yosry Ahmed brought back some cgroup v1 stats in OOM logs
- Yosry has also eliminated cgroup's atomic rstat flushing
- Nhat Pham adds the new cachestat() syscall. It provides userspace
with the ability to query pagecache status - a similar concept to
mincore() but more powerful and with improved usability
- Mel Gorman provides more optimizations for compaction, reducing the
prevalence of page rescanning
- Lorenzo Stoakes has done some maintanance work on the
get_user_pages() interface
- Liam Howlett continues with cleanups and maintenance work to the
maple tree code. Peng Zhang also does some work on maple tree
- Johannes Weiner has done some cleanup work on the compaction code
- David Hildenbrand has contributed additional selftests for
get_user_pages()
- Thomas Gleixner has contributed some maintenance and optimization
work for the vmalloc code
- Baolin Wang has provided some compaction cleanups,
- SeongJae Park continues maintenance work on the DAMON code
- Huang Ying has done some maintenance on the swap code's usage of
device refcounting
- Christoph Hellwig has some cleanups for the filemap/directio code
- Ryan Roberts provides two patch series which yield some
rationalization of the kernel's access to pte entries - use the
provided APIs rather than open-coding accesses
- Lorenzo Stoakes has some fixes to the interaction between pagecache
and directio access to file mappings
- John Hubbard has a series of fixes to the MM selftesting code
- ZhangPeng continues the folio conversion campaign
- Hugh Dickins has been working on the pagetable handling code, mainly
with a view to reducing the load on the mmap_lock
- Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
from 128 to 8
- Domenico Cerasuolo has improved the zswap reclaim mechanism by
reorganizing the LRU management
- Matthew Wilcox provides some fixups to make gfs2 work better with the
buffer_head code
- Vishal Moola also has done some folio conversion work
- Matthew Wilcox has removed the remnants of the pagevec code - their
functionality is migrated over to struct folio_batch
* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
mm/hugetlb: remove hugetlb_set_page_subpool()
mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
hugetlb: revert use of page_cache_next_miss()
Revert "page cache: fix page_cache_next/prev_miss off by one"
mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
mm: memcg: rename and document global_reclaim()
mm: kill [add|del]_page_to_lru_list()
mm: compaction: convert to use a folio in isolate_migratepages_block()
mm: zswap: fix double invalidate with exclusive loads
mm: remove unnecessary pagevec includes
mm: remove references to pagevec
mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
mm: remove struct pagevec
net: convert sunrpc from pagevec to folio_batch
i915: convert i915_gpu_error to use a folio_batch
pagevec: rename fbatch_count()
mm: remove check_move_unevictable_pages()
drm: convert drm_gem_put_pages() to use a folio_batch
i915: convert shmem_sg_free_table() to use a folio_batch
scatterlist: add sg_set_folio()
...
Diffstat (limited to 'include/linux/pgtable.h')
-rw-r--r-- | include/linux/pgtable.h | 176 |
1 files changed, 39 insertions, 137 deletions
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c5a51481bbb9..5063b482e34f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -94,14 +94,22 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) #define pte_offset_kernel pte_offset_kernel #endif -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte)) +#ifdef CONFIG_HIGHPTE +#define __pte_map(pmd, address) \ + ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) +#define pte_unmap(pte) do { \ + kunmap_local((pte)); \ + /* rcu_read_unlock() to be added later */ \ +} while (0) #else -#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) ((void)(pte)) /* NOP */ +static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) +{ + return pte_offset_kernel(pmd, address); +} +static inline void pte_unmap(pte_t *pte) +{ + /* rcu_read_unlock() to be added later */ +} #endif /* Find an entry in the second-level page table.. */ @@ -204,12 +212,26 @@ static inline int pudp_set_access_flags(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#endif + +#ifndef pmdp_get +static inline pmd_t pmdp_get(pmd_t *pmdp) +{ + return READ_ONCE(*pmdp); +} +#endif + #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; @@ -296,7 +318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; @@ -309,20 +331,6 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, ptep_get_and_clear(mm, addr, ptep); } -#ifndef ptep_get -static inline pte_t ptep_get(pte_t *ptep) -{ - return READ_ONCE(*ptep); -} -#endif - -#ifndef pmdp_get -static inline pmd_t pmdp_get(pmd_t *pmdp) -{ - return READ_ONCE(*pmdp); -} -#endif - #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures @@ -511,7 +519,7 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte = *ptep; + pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif @@ -591,6 +599,10 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif +#ifndef arch_needs_pgtable_deposit +#define arch_needs_pgtable_deposit() (false) +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an @@ -1292,9 +1304,10 @@ static inline int pud_trans_huge(pud_t pud) } #endif -/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ -static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) +static inline int pud_trans_unstable(pud_t *pud) { +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) @@ -1303,121 +1316,10 @@ static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) pud_clear_bad(pud); return 1; } - return 0; -} - -/* See pmd_trans_unstable for discussion. */ -static inline int pud_trans_unstable(pud_t *pud) -{ -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); -#else - return 0; -#endif -} - -#ifndef arch_needs_pgtable_deposit -#define arch_needs_pgtable_deposit() (false) -#endif -/* - * This function is meant to be used by sites walking pagetables with - * the mmap_lock held in read mode to protect against MADV_DONTNEED and - * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd - * into a null pmd and the transhuge page fault can convert a null pmd - * into an hugepmd or into a regular pmd (if the hugepage allocation - * fails). While holding the mmap_lock in read mode the pmd becomes - * stable and stops changing under us only if it's not null and not a - * transhuge pmd. When those races occurs and this function makes a - * difference vs the standard pmd_none_or_clear_bad, the result is - * undefined so behaving like if the pmd was none is safe (because it - * can return none anyway). The compiler level barrier() is critically - * important to compute the two checks atomically on the same pmdval. - * - * For 32bit kernels with a 64bit large pmd_t this automatically takes - * care of reading the pmd atomically to avoid SMP race conditions - * against pmd_populate() when the mmap_lock is hold for reading by the - * caller (a special atomic read not done by "gcc" as in the generic - * version above, is also needed when THP is disabled because the page - * fault can populate the pmd from under us). - */ -static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) -{ - pmd_t pmdval = pmdp_get_lockless(pmd); - /* - * The barrier will stabilize the pmdval in a register or on - * the stack so that it will stop changing under the code. - * - * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, - * pmdp_get_lockless is allowed to return a not atomic pmdval - * (for example pointing to an hugepage that has never been - * mapped in the pmd). The below checks will only care about - * the low part of the pmd with 32bit PAE x86 anyway, with the - * exception of pmd_none(). So the important thing is that if - * the low part of the pmd is found null, the high part will - * be also null or the pmd_none() check below would be - * confused. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); #endif - /* - * !pmd_present() checks for pmd migration entries - * - * The complete check uses is_pmd_migration_entry() in linux/swapops.h - * But using that requires moving current function and pmd_trans_unstable() - * to linux/swapops.h to resolve dependency, which is too much code move. - * - * !pmd_present() is equivalent to is_pmd_migration_entry() currently, - * because !pmd_present() pages can only be under migration not swapped - * out. - * - * pmd_none() is preserved for future condition checks on pmd migration - * entries and not confusing with this function name, although it is - * redundant with !pmd_present(). - */ - if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || - (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) - return 1; - if (unlikely(pmd_bad(pmdval))) { - pmd_clear_bad(pmd); - return 1; - } return 0; } -/* - * This is a noop if Transparent Hugepage Support is not built into - * the kernel. Otherwise it is equivalent to - * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in - * places that already verified the pmd is not none and they want to - * walk ptes while holding the mmap sem in read mode (write mode don't - * need this). If THP is not enabled, the pmd can't go away under the - * code even if MADV_DONTNEED runs, but if THP is enabled we need to - * run a pmd_trans_unstable before walking the ptes after - * split_huge_pmd returns (because it may have run when the pmd become - * null, but then a page fault can map in a THP and not a regular page). - */ -static inline int pmd_trans_unstable(pmd_t *pmd) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - return pmd_none_or_trans_huge_or_clear_bad(pmd); -#else - return 0; -#endif -} - -/* - * the ordering of these checks is important for pmds with _page_devmap set. - * if we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. - */ -static inline int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but |