diff options
Diffstat (limited to 'mm/internal.h')
-rw-r--r-- | mm/internal.h | 276 |
1 files changed, 232 insertions, 44 deletions
diff --git a/mm/internal.h b/mm/internal.h index 6d4ca98f3844..7920a8b7982e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. + */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -141,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio) return ret; } -static inline bool page_evictable(struct page *page) -{ - bool ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. 
@@ -180,8 +187,8 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, /* * in mm/vmscan.c: */ -int isolate_lru_page(struct page *page); -int folio_isolate_lru(struct folio *folio); +bool isolate_lru_page(struct page *page); +bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); @@ -378,6 +385,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. + */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + /* + * When hugetlb dissolves a folio, we need to clear the tail + * page, rather than setting nr_pages to 1. + */ + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -422,7 +448,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. 
+ */ bool alloc_contig; /* alloc_contig_range allocation */ }; @@ -492,14 +522,13 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* - * mlock_vma_page() and munlock_vma_page(): + * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -522,24 +551,19 @@ static inline void mlock_vma_folio(struct folio *folio, mlock_folio(folio); } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - mlock_vma_folio(page_folio(page), vma, compound); -} - -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); + +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t 
maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -624,14 +648,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } @@ -735,8 +755,13 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 @@ -744,8 +769,12 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. 
*/ +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; @@ -833,6 +862,87 @@ int migrate_device_coherent_page(struct page *page); * mm/gup.c */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); + +enum { + /* mark page accessed */ + FOLL_TOUCH = 1 << 16, + /* a retry, previous pass started an IO */ + FOLL_TRIED = 1 << 17, + /* we are working on non-current tsk/mm */ + FOLL_REMOTE = 1 << 18, + /* pages must be released via unpin_user_page */ + FOLL_PIN = 1 << 19, + /* gup_fast: prevent fall-back to slow gup */ + FOLL_FAST_ONLY = 1 << 20, + /* allow unlocking the mmap lock */ + FOLL_UNLOCKABLE = 1 << 21, +}; + +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. + */ +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. 
+ */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pinning: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. + */ + return !PageAnonExclusive(page); +} extern bool mirrored_kernelcore; @@ -854,4 +964,82 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, 
vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ |