diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-02 12:08:10 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-02 12:08:10 -0700 |
commit | 71bd9341011f626d692aabe024f099820f02c497 (patch) | |
tree | a1c27fd8f17daff36e380800c5b69769d0d9cc99 /mm | |
parent | 3dbdb38e286903ec220aaf1fb29a8d94297da246 (diff) | |
parent | b869d5be0acf0e125e69adcffdca04000dc5b17c (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"190 patches.
Subsystems affected by this patch series: mm (hugetlb, userfaultfd,
vmscan, kconfig, proc, z3fold, zbud, ras, mempolicy, memblock,
migration, thp, nommu, kconfig, madvise, memory-hotplug, zswap,
zsmalloc, zram, cleanups, kfence, and hmm), procfs, sysctl, misc,
core-kernel, lib, lz4, checkpatch, init, kprobes, nilfs2, hfs,
signals, exec, kcov, selftests, compress/decompress, and ipc"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits)
ipc/util.c: use binary search for max_idx
ipc/sem.c: use READ_ONCE()/WRITE_ONCE() for use_global_lock
ipc: use kmalloc for msg_queue and shmid_kernel
ipc sem: use kvmalloc for sem_undo allocation
lib/decompressors: remove set but not used variabled 'level'
selftests/vm/pkeys: exercise x86 XSAVE init state
selftests/vm/pkeys: refill shadow register after implicit kernel write
selftests/vm/pkeys: handle negative sys_pkey_alloc() return code
selftests/vm/pkeys: fix alloc_random_pkey() to make it really, really random
kcov: add __no_sanitize_coverage to fix noinstr for all architectures
exec: remove checks in __register_bimfmt()
x86: signal: don't do sas_ss_reset() until we are certain that sigframe won't be abandoned
hfsplus: report create_date to kstat.btime
hfsplus: remove unnecessary oom message
nilfs2: remove redundant continue statement in a while-loop
kprobes: remove duplicated strong free_insn_page in x86 and s390
init: print out unknown kernel parameters
checkpatch: do not complain about positive return values starting with EPOLL
checkpatch: improve the indented label test
checkpatch: scripts/spdxcheck.py now requires python3
...
Diffstat (limited to 'mm')
45 files changed, 2919 insertions, 1411 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ded98fb859ab..a02498c0e13d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,6 +96,9 @@ config HAVE_FAST_GUP depends on MMU bool +config HOLES_IN_ZONE + bool + # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. @@ -671,6 +674,7 @@ config ZPOOL config ZBUD tristate "Low (Up to 2x) density storage for compressed pages" + depends on ZPOOL help A special purpose allocator for storing compressed pages. It is designed to store up to two compressed pages per physical @@ -757,6 +761,18 @@ config ARCH_HAS_CACHE_LINE_SIZE config ARCH_HAS_PTE_DEVMAP bool +config ARCH_HAS_ZONE_DMA_SET + bool + +config ZONE_DMA + bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET + default y if ARM64 || X86 + +config ZONE_DMA32 + bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET + depends on !X86_32 + default y if ARM64 + config ZONE_DEVICE bool "Device memory (pmem, HMM, etc...) 
hotplug support" depends on MEMORY_HOTPLUG diff --git a/mm/Makefile b/mm/Makefile index bf71e295e9f6..74b47c354682 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o @@ -125,3 +126,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o +obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c new file mode 100644 index 000000000000..5b152dba7344 --- /dev/null +++ b/mm/bootmem_info.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Bootmem core functions. + * + * Copyright (c) 2020, Bytedance. 
+ * + * Author: Muchun Song <songmuchun@bytedance.com> + * + */ +#include <linux/mm.h> +#include <linux/compiler.h> +#include <linux/memblock.h> +#include <linux/bootmem_info.h> +#include <linux/memory_hotplug.h> + +void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) +{ + page->freelist = (void *)type; + SetPagePrivate(page); + set_page_private(page, info); + page_ref_inc(page); +} + +void put_page_bootmem(struct page *page) +{ + unsigned long type; + + type = (unsigned long) page->freelist; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); + + if (page_ref_dec_return(page) == 1) { + page->freelist = NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + INIT_LIST_HEAD(&page->lru); + free_reserved_page(page); + } +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... 
+ */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); + +} +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +void __init register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + pfn = pgdat->node_start_pfn; + end_pfn = pgdat_end_pfn(pgdat); + + /* register section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other nodes. 
+ */ + if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } +} diff --git a/mm/compaction.c b/mm/compaction.c index 3a509fbf2bea..621508e0ecd5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1297,8 +1297,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage) if (!list_is_last(freelist, &freepage->lru)) { list_cut_before(&sublist, freelist, &freepage->lru); - if (!list_empty(&sublist)) - list_splice_tail(&sublist, freelist); + list_splice_tail(&sublist, freelist); } } @@ -1315,8 +1314,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) if (!list_is_first(freelist, &freepage->lru)) { list_cut_position(&sublist, freelist, &freepage->lru); - if (!list_empty(&sublist)) - list_splice_tail(&sublist, freelist); + list_splice_tail(&sublist, freelist); } } @@ -1380,7 +1378,7 @@ static int next_search_order(struct compact_control *cc, int order) static unsigned long fast_isolate_freepages(struct compact_control *cc) { - unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); + unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); unsigned int nr_scanned = 0; unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; @@ -1492,11 +1490,11 @@ fast_isolate_freepages(struct compact_control *cc) spin_unlock_irqrestore(&cc->zone->lock, flags); /* - * Smaller scan on next order so the total scan ig related + * Smaller scan on next order so the total scan is related * to freelist_scan_limit. 
*/ if (order_scanned >= limit) - limit = min(1U, limit >> 1); + limit = max(1U, limit >> 1); } if (!page) { @@ -2722,9 +2720,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -static ssize_t sysfs_compact_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { int nid = dev->id; @@ -2737,7 +2735,7 @@ static ssize_t sysfs_compact_node(struct device *dev, return count; } -static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node); +static DEVICE_ATTR_WO(compact); int compaction_register_node(struct node *node) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 92bfc37300df..1c922691aa61 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -91,7 +91,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm, unsigned long pfn, unsigned long vaddr, pgprot_t prot) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte; /* * Architectures optimize set_pte_at by avoiding TLB flush. @@ -248,29 +248,6 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) -{ - pmd_t pmd; - - if (!arch_vmap_pmd_supported(prot)) - return; - - pr_debug("Validating PMD huge\n"); - /* - * X86 defined pmd_set_huge() verifies that the given - * PMD is not a populated non-leaf entry. 
- */ - WRITE_ONCE(*pmdp, __pmd(0)); - WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pmd_clear_huge(pmdp)); - pmd = READ_ONCE(*pmdp); - WARN_ON(!pmd_none(pmd)); -} -#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } -#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ - static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pmd_t pmd; @@ -395,30 +372,6 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) pud = pud_mkhuge(pud); WARN_ON(!pud_leaf(pud)); } - -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) -{ - pud_t pud; - - if (!arch_vmap_pud_supported(prot)) - return; - - pr_debug("Validating PUD huge\n"); - /* - * X86 defined pud_set_huge() verifies that the given - * PUD is not a populated non-leaf entry. - */ - WRITE_ONCE(*pudp, __pud(0)); - WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pud_clear_huge(pudp)); - pud = READ_ONCE(*pudp); - WARN_ON(!pud_none(pud)); -} -#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } -#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ - #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } static void __init pud_advanced_tests(struct mm_struct *mm, @@ -428,9 +381,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm, { } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) -{ -} #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(unsigned long pfn, int idx) { } @@ -449,14 +399,51 @@ static void __init pud_advanced_tests(struct mm_struct *mm, } static void __init pmd_leaf_tests(unsigned 
long pfn, pgprot_t prot) { } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { + pmd_t pmd; + + if (!arch_vmap_pmd_supported(prot)) + return; + + pr_debug("Validating PMD huge\n"); + /* + * X86 defined pmd_set_huge() verifies that the given + * PMD is not a populated non-leaf entry. + */ + WRITE_ONCE(*pmdp, __pmd(0)); + WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); + WARN_ON(!pmd_clear_huge(pmdp)); + pmd = READ_ONCE(*pmdp); + WARN_ON(!pmd_none(pmd)); } + static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { + pud_t pud; + + if (!arch_vmap_pud_supported(prot)) + return; + + pr_debug("Validating PUD huge\n"); + /* + * X86 defined pud_set_huge() verifies that the given + * PUD is not a populated non-leaf entry. 
+ */ + WRITE_ONCE(*pudp, __pud(0)); + WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); + WARN_ON(!pud_clear_huge(pudp)); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) { @@ -791,12 +778,12 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } -#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } -#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot) { @@ -856,17 +843,17 @@ static void __init swap_migration_tests(void) * locked, otherwise it stumbles upon a BUG_ON(). 
*/ __SetPageLocked(page); - swp = make_migration_entry(page, 1); + swp = make_writable_migration_entry(page_to_pfn(page)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(!is_write_migration_entry(swp)); + WARN_ON(!is_writable_migration_entry(swp)); - make_migration_entry_read(&swp); + swp = make_readable_migration_entry(swp_offset(swp)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_write_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); - swp = make_migration_entry(page, 0); + swp = make_readable_migration_entry(page_to_pfn(page)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_write_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); __ClearPageLocked(page); __free_page(page); } @@ -1501,6 +1501,64 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* + * faultin_vma_page_range() - populate (prefault) page tables inside the + * given VMA range readable/writable + * + * This takes care of mlocking the pages, too, if VM_LOCKED is set. + * + * @vma: target vma + * @start: start address + * @end: end address + * @write: whether to prefault readable or writable + * @locked: whether the mmap_lock is still held + * + * Returns either number of processed pages in the vma, or a negative error + * code on error (see __get_user_pages()). + * + * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and + * covered by the VMA. + * + * If @locked is NULL, it may be held for read or write and will be unperturbed. + * + * If @locked is non-NULL, it must held for read only and may be released. If + * it's released, *@locked will be set to 0. 
+ */ +long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool write, int *locked) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + /* + * FOLL_TOUCH: Mark page accessed and thereby young; will also mark + * the page dirty with FOLL_WRITE -- which doesn't make a + * difference with !FOLL_FORCE, because the page is writable + * in the page table. + * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit + * a poisoned page. + * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. + * !FOLL_FORCE: Require proper access permissions. + */ + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + if (write) + gup_flags |= FOLL_WRITE; + + /* + * See check_vma_flags(): Will return -EFAULT on incompatible mappings + * or with insufficient permissions. + */ + return __get_user_pages(mm, start, nr_pages, gup_flags, + NULL, NULL, locked); +} + +/* * __mm_populate - populate and/or mlock pages within a range of address space. 
* * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap @@ -26,6 +26,8 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> +#include "internal.h" + struct hmm_vma_walk { struct hmm_range *range; unsigned long last; @@ -214,7 +216,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range, swp_entry_t entry) { return is_device_private_entry(entry) && - device_private_entry_to_page(entry)->pgmap->owner == + pfn_swap_entry_to_page(entry)->pgmap->owner == range->dev_private_owner; } @@ -255,10 +257,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, */ if (hmm_is_device_private_entry(range, entry)) { cpu_flags = HMM_PFN_VALID; - if (is_write_device_private_entry(entry)) + if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = device_private_entry_to_pfn(entry) | - cpu_flags; + *hmm_pfn = swp_offset(entry) | cpu_flags; return 0; } @@ -272,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!non_swap_entry(entry)) goto fault; + if (is_device_exclusive_entry(entry)) + goto fault; + if (is_migration_entry(entry)) { pte_unmap(ptep); hmm_vma_walk->last = addr; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6d2a0119fc58..8b731d53e9f4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_enabled(struct vm_area_struct *vma) +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file && + !inode_is_open_for_write(vma->vm_file->f_inode) && + (vma->vm_flags & VM_EXEC); +} + +bool transparent_hugepage_active(struct vm_area_struct *vma) { /* The addr is used to check if the vma size fits */ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; @@ -75,6 +82,8 @@ bool 
transparent_hugepage_enabled(struct vm_area_struct *vma) return __transparent_hugepage_enabled(vma); if (vma_is_shmem(vma)) return shmem_huge_enabled(vma); + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return file_thp_enabled(vma); return false; } @@ -1017,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, - struct vm_area_struct *vma) + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { spinlock_t *dst_ptl, *src_ptl; struct page *src_page; @@ -1026,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; /* Skip if can be re-fill on fault */ - if (!vma_is_anonymous(vma)) + if (!vma_is_anonymous(dst_vma)) return 0; pgtable = pte_alloc_one(dst_mm); @@ -1040,29 +1049,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - /* - * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA - * does not have the VM_UFFD_WP, which means that the uffd - * fork event is not enabled. 
- */ - if (!(vma->vm_flags & VM_UFFD_WP)) - pmd = pmd_clear_uffd_wp(pmd); - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (is_write_migration_entry(entry)) { - make_migration_entry_read(&entry); + if (is_writable_migration_entry(entry)) { + entry = make_readable_migration_entry( + swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; goto out_unlock; @@ -1079,17 +1085,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * a page table. */ if (is_huge_zero_pmd(pmd)) { - struct page *zero_page; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = mm_get_huge_zero_page(dst_mm); - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, - zero_page); - ret = 0; - goto out_unlock; + mm_get_huge_zero_page(dst_mm); + goto out_zero_page; } src_page = pmd_page(pmd); @@ -1102,21 +1104,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * best effort that the pinned pages won't be replaced by another * random page during the coming copy-on-write. 
*/ - if (unlikely(page_needs_cow_for_dma(vma, src_page))) { + if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) { pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); - __split_huge_pmd(vma, src_pmd, addr, false, NULL); + __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); return -EAGAIN; } get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); +out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - pmdp_set_wrprotect(src_mm, addr, src_pmd); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_clear_uffd_wp(pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); @@ -1254,11 +1258,12 @@ unlock: } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf) { pmd_t entry; unsigned long haddr; bool write = vmf->flags & FAULT_FLAG_WRITE; + pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) @@ -1275,11 +1280,12 @@ unlock: spin_unlock(vmf->ptl); } -vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1415,96 +1421,25 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct anon_vma *anon_vma = NULL; + pmd_t oldpmd = vmf->orig_pmd; + pmd_t pmd; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); + int 
page_nid = NUMA_NO_NODE; int target_nid, last_cpupid = -1; - bool page_locked; bool migrated = false; - bool was_writable; + bool was_writable = pmd_savedwrite(oldpmd); int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(pmd, *vmf->pmd))) - goto out_unlock; - - /* - * If there are potential migrations, wait for completion and retry - * without disrupting NUMA hinting information. Do not relock and - * check_same as the page may no longer be mapped. - */ - if (unlikely(pmd_trans_migrating(*vmf->pmd))) { - page = pmd_page(*vmf->pmd); - if (!get_page_unless_zero(page)) - goto out_unlock; + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } - page = pmd_page(pmd); - BUG_ON(is_huge_zero_page(page)); - page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); - count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == this_nid) { - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - flags |= TNF_FAULT_LOCAL; - } - - /* See similar comment in do_numa_page for explanation */ - if (!pmd_savedwrite(pmd)) - flags |= TNF_NO_GROUP; - - /* - * Acquire the page lock to serialise THP migrations but avoid dropping - * page_table_lock if at all possible - */ - page_locked = trylock_page(page); - target_nid = mpol_misplaced(page, vma, haddr); - /* Migration could have started since the pmd_trans_migrating check */ - if (!page_locked) { - page_nid = NUMA_NO_NODE; - if (!get_page_unless_zero(page)) - goto out_unlock; - spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); - goto out; - } else if (target_nid == NUMA_NO_NODE) { - /* There are no parallel migrations and page is in the right - * node. Clear the numa hinting info in this pmd. - */ - goto clear_pmdnuma; - } - - /* - * Page is misplaced. Page lock serialises migrations. 
Acquire anon_vma - * to serialises splits - */ - get_page(page); - spin_unlock(vmf->ptl); - anon_vma = page_lock_anon_vma_read(page); - - /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(vmf->ptl); - if (unlikely(!pmd_same(pmd, *vmf->pmd))) { - unlock_page(page); - put_page(page); - page_nid = NUMA_NO_NODE; - goto out_unlock; - } - - /* Bail if we fail to protect against THP splits for any reason */ - if (unlikely(!anon_vma)) { - put_page(page); - page_nid = NUMA_NO_NODE; - goto clear_pmdnuma; - } - /* * Since we took the NUMA fault, we must have observed the !accessible * bit. Make sure all other CPUs agree with that, to avoid them @@ -1531,43 +1466,58 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) haddr + HPAGE_PMD_SIZE); } - /* - * Migrate the THP to the requested node, returns with page unlocked - * and access rights restored. - */ + pmd = pmd_modify(oldpmd, vma->vm_page_prot); + page = vm_normal_page_pmd(vma, haddr, pmd); + if (!page) + goto out_map; + + /* See similar comment in do_numa_page for explanation */ + if (!was_writable) + flags |= TNF_NO_GROUP; + + page_nid = page_to_nid(page); + last_cpupid = page_cpupid_last(page); + target_nid = numa_migrate_prep(page, vma, haddr, page_nid, + &flags); + + if (target_nid == NUMA_NO_NODE) { + put_page(page); + goto out_map; + } + spin_unlock(vmf->ptl); - migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - vmf->pmd, pmd, vmf->address, page, target_nid); + migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } else + } else { flags |= TNF_MIGRATE_FAIL; - - goto out; -clear_pmdnuma: - BUG_ON(!PageLocked(page)); - was_writable = pmd_savedwrite(pmd); - pmd = pmd_modify(pmd, vma->vm_page_prot); - pmd = pmd_mkyoung(pmd); - if (was_writable) - pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - unlock_page(page); 
-out_unlock: - spin_unlock(vmf->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + goto out; + } + goto out_map; + } out: - if (anon_vma) - page_unlock_anon_vma_read(anon_vma); - if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); return 0; + +out_map: + /* Restore the PMD */ + pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); + if (was_writable) + pmd = pmd_mkwrite(pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); + goto out; } /* @@ -1604,7 +1554,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this page, we couldn't discard * the page unless they all do MADV_FREE so let's skip the page. */ - if (page_mapcount(page) != 1) + if (total_mapcount(page) != 1) goto out; if (!trylock_page(page)) @@ -1677,12 +1627,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = NULL; int flush_needed = 1; @@ -1697,7 +1644,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -1796,6 +1743,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, * Returns * - 0 if PMD could not be locked * - 1 if PMD was locked but 
protections unchanged and TLB flush unnecessary + * or if prot_numa but THP migration is not supported * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, @@ -1810,6 +1758,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + if (prot_numa && !thp_migration_supported()) + return 1; + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; @@ -1822,16 +1773,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pmd_to_swp_entry(*pmd); VM_BUG_ON(!is_pmd_migration_entry(*pmd)); - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write */ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); + if (pmd_swp_uffd_wp(*pmd)) + newpmd = pmd_swp_mkuffd_wp(newpmd); set_pmd_at(mm, addr, pmd, newpmd); } goto unlock; @@ -2060,7 +2014,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } else { page = pmd_page(old_pmd); if (!PageDirty(page) && pmd_dirty(old_pmd)) @@ -2114,8 +2068,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = migration_entry_to_page(entry); - write = is_write_migration_entry(entry); + page = pfn_swap_entry_to_page(entry); + write = is_writable_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); @@ -2147,7 +2101,12 @@ static void __split_huge_pmd_locked(struct 
vm_area_struct *vma, pmd_t *pmd, */ if (freeze || pmd_migration) { swp_entry_t swp_entry; - swp_entry = make_migration_entry(page + i, write); + if (write) + swp_entry = make_writable_migration_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_migration_entry( + page_to_pfn(page + i)); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); @@ -2350,15 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | - TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | + TTU_SYNC; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Anon pages need migration entries to preserve them, but file + * pages can simply be left unmapped, then faulted back on demand. + * If that is ever changed (perhaps for mlock), update remap_page(). + */ if (PageAnon(page)) - ttu_flags |= TTU_SPLIT_FREEZE; - - try_to_unmap(page, ttu_flags); + try_to_migrate(page, ttu_flags); + else + try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK); VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } @@ -2366,6 +2330,10 @@ static void unmap_page(struct page *page) static void remap_page(struct page *page, unsigned int nr) { int i; + + /* If TTU_SPLIT_FREEZE is ever extended to file, remove this check */ + if (!PageAnon(page)) + return; if (PageTransHuge(page)) { remove_migration_ptes(page, page, true); } else { @@ -2870,7 +2838,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &ds_queue->split_queue) { - page = list_entry((void *)pos, struct page, mapping); + page = list_entry((void *)pos, struct page, deferred_list); page = compound_head(page); if (get_page_unless_zero(page)) { list_move(page_deferred_list(page), &list); @@ -2885,7 
+2853,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { - page = list_entry((void *)pos, struct page, mapping); + page = list_entry((void *)pos, struct page, deferred_list); if (!trylock_page(page)) goto next; /* split_huge_page() removes page from list on success */ @@ -3144,7 +3112,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, tok = strsep(&buf, ","); if (tok) { - strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + strcpy(file_path, tok); } else { ret = -EINVAL; goto out; @@ -3214,7 +3182,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdval = pmdp_invalidate(vma, address, pvmw->pmd); if (pmd_dirty(pmdval)) set_page_dirty(page); - entry = make_migration_entry(page, pmd_write(pmdval)); + if (pmd_write(pmdval)) + entry = make_writable_migration_entry(page_to_pfn(page)); + else + entry = make_readable_migration_entry(page_to_pfn(page)); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); @@ -3240,8 +3211,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_write_migration_entry(entry)) + if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 103f1187043f..924553aa8f78 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,6 +30,7 @@ #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> +#include <linux/migrate.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -41,6 +42,7 @@ #include <linux/node.h> #include <linux/page_owner.h> #include 
"internal.h" +#include "hugetlb_vmemmap.h" int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -1318,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned int order); #else /* !CONFIG_CONTIG_ALLOC */ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) @@ -1375,7 +1375,40 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->nr_huge_pages_node[nid]--; } -static void update_and_free_page(struct hstate *h, struct page *page) +static void add_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int zeroed; + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + + lockdep_assert_held(&hugetlb_lock); + + INIT_LIST_HEAD(&page->lru); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + + if (adjust_surplus) { + h->surplus_huge_pages++; + h->surplus_huge_pages_node[nid]++; + } + + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_private(page, 0); + SetHPageVmemmapOptimized(page); + + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the last reference. 
+ */ + zeroed = put_page_testzero(page); + VM_BUG_ON_PAGE(!zeroed, page); + arch_clear_hugepage_flags(page); + enqueue_huge_page(h, page); +} + +static void __update_and_free_page(struct hstate *h, struct page *page) { int i; struct page *subpage = page; @@ -1383,6 +1416,18 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; + if (alloc_huge_page_vmemmap(h, page)) { + spin_lock_irq(&hugetlb_lock); + /* + * If we cannot allocate vmemmap pages, just refuse to free the + * page and put the page back on the hugetlb free list and treat + * as a surplus page. + */ + add_hugetlb_page(h, page, true); + spin_unlock_irq(&hugetlb_lock); + return; + } + for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1398,12 +1443,79 @@ static void update_and_free_page(struct hstate *h, struct page *page) } } +/* + * As update_and_free_page() can be called under any context, so we cannot + * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the + * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate + * the vmemmap pages. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to be + * freed and frees them one-by-one. As the page->mapping pointer is going + * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node + * structure of a lockless linked list of huge pages to be freed. 
+ */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + + node = llist_del_all(&hpage_freelist); + + while (node) { + struct page *page; + struct hstate *h; + + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + page->mapping = NULL; + /* + * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() + * is going to trigger because a previous call to + * remove_hugetlb_page() will set_compound_page_dtor(page, + * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + */ + h = size_to_hstate(page_size(page)); + + __update_and_free_page(h, page); + + cond_resched(); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +static inline void flush_free_hpage_work(struct hstate *h) +{ + if (free_vmemmap_pages_per_hpage(h)) + flush_work(&free_hpage_work); +} + +static void update_and_free_page(struct hstate *h, struct page *page, + bool atomic) +{ + if (!HPageVmemmapOptimized(page) || !atomic) { + __update_and_free_page(h, page); + return; + } + + /* + * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. + * + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the workfn + * hasn't retrieved the list yet. 
+ */ + if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + schedule_work(&free_hpage_work); +} + static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page); + update_and_free_page(h, page, false); cond_resched(); } } @@ -1470,12 +1582,12 @@ void free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page); + update_and_free_page(h, page, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page); + update_and_free_page(h, page, true); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); @@ -1493,8 +1605,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct page *page) +static void __prep_new_huge_page(struct hstate *h, struct page *page) { + free_huge_page_vmemmap(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -1504,15 +1617,15 @@ static void __prep_new_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - __prep_new_huge_page(page); + __prep_new_huge_page(h, page); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -static void prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool prep_compound_gigantic_page(struct page *page, unsigned int order) { - int i; + int i, j; int nr_pages = 1 << order; struct page *p = page + 1; @@ -1534,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * after 
get_user_pages(). */ __ClearPageReserved(p); + /* + * Subtle and very unlikely + * + * Gigantic 'page allocators' such as memblock or cma will + * return a set of pages with each page ref counted. We need + * to turn this set of pages into a compound page with tail + * page ref counts set to zero. Code such as speculative page + * cache adding could take a ref on a 'to be' tail page. + * We need to respect any increased ref count, and only set + * the ref count to zero if count is currently 1. If count + * is not 1, we call synchronize_rcu in the hope that a rcu + * grace period will cause ref count to drop and then retry. + * If count is still inflated on retry we return an error and + * must discard the pages. + */ + if (!page_ref_freeze(p, 1)) { + pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); + synchronize_rcu(); + if (!page_ref_freeze(p, 1)) + goto out_error; + } set_page_count(p, 0); set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); atomic_set(compound_pincount_ptr(page), 0); + return true; + +out_error: + /* undo tail page modifications made above */ + p = page + 1; + for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { + clear_compound_head(p); + set_page_refcounted(p); + } + /* need to clear PG_reserved on remaining tail pages */ + for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + __ClearPageReserved(p); + set_compound_order(page, 0); + page[1].compound_nr = 0; + __ClearPageHead(page); + return false; } /* @@ -1658,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, nodemask_t *node_alloc_noretry) { struct page *page; + bool retry = false; +retry: if (hstate_is_gigantic(h)) page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else @@ -1667,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, if (!page) return NULL; - if (hstate_is_gigantic(h)) - prep_compound_gigantic_page(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + if 
(!prep_compound_gigantic_page(page, huge_page_order(h))) { + /* + * Rare failure to convert pages to compound page. + * Free pages and try again - ONCE! + */ + free_gigantic_page(page, huge_page_order(h)); + if (!retry) { + retry = true; + goto retry; + } + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + return NULL; + } + } prep_new_huge_page(h, page, page_to_nid(page)); return page; @@ -1737,10 +1902,14 @@ static struct page *remove_pool_huge_page(struct hstate *h, * nothing for in-use hugepages and non-hugepages. * This function returns values like below: * - * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use - * (allocated or reserved.) - * 0: successfully dissolved free hugepages or the page is not a - * hugepage (considered as already dissolved) + * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages + * when the system is under memory pressure and the feature of + * freeing unused vmemmap pages associated with each hugetlb page + * is enabled. + * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use + * (allocated or reserved.) + * 0: successfully dissolved free hugepages or the page is not a + * hugepage (considered as already dissolved) */ int dissolve_free_huge_page(struct page *page) { @@ -1782,19 +1951,38 @@ retry: goto retry; } - /* - * Move PageHWPoison flag from head page to the raw error page, - * which makes any subpages rather than the error page reusable. - */ - if (PageHWPoison(head) && page != head) { - SetPageHWPoison(page); - ClearPageHWPoison(head); - } remove_hugetlb_page(h, head, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, head); - return 0; + + /* + * Normally update_and_free_page will allocate required vmemmmap + * before freeing the page. update_and_free_page will fail to + * free the page if it can not allocate required vmemmap. We + * need to adjust max_huge_pages if the page is not freed. 
+ * Attempt to allocate vmemmmap here so that we can take + * appropriate action on failure. + */ + rc = alloc_huge_page_vmemmap(h, head); + if (!rc) { + /* + * Move PageHWPoison flag from head page to the raw + * error page, which makes any subpages rather than + * the error page reusable. + */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } + update_and_free_page(h, head, false); + } else { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_page(h, head, false); + h->max_huge_pages++; + spin_unlock_irq(&hugetlb_lock); + } + + return rc; } out: spin_unlock_irq(&hugetlb_lock); @@ -2351,14 +2539,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, /* * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Using alloc_buddy_huge_page() allows us to - * not having to deal with prep_new_huge_page() and avoids dealing of any - * counters. This simplifies and let us do the whole thing under the - * lock. + * pool to remain stable. Here, we allocate the page and 'prep' it + * by doing everything but actually updating counters and adding to + * the pool. This simplifies and let us do most of the processing + * under the lock. */ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; + __prep_new_huge_page(h, new_page); retry: spin_lock_irq(&hugetlb_lock); @@ -2397,14 +2586,9 @@ retry: remove_hugetlb_page(h, old_page, false); /* - * new_page needs to be initialized with the standard hugetlb - * state. This is normally done by prep_new_huge_page() but - * that takes hugetlb_lock which is already held so we need to - * open code it here. * Reference count trick is needed because allocator gives us * referenced page but the pool requires pages with 0 refcount. 
*/ - __prep_new_huge_page(new_page); __prep_account_new_huge_page(h, nid); page_ref_dec(new_page); enqueue_huge_page(h, new_page); @@ -2413,14 +2597,14 @@ retry: * Pages have been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page); + update_and_free_page(h, old_page, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); - __free_pages(new_page, huge_page_order(h)); + update_and_free_page(h, new_page, false); return ret; } @@ -2625,16 +2809,10 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, - unsigned int order) -{ - if (unlikely(order > (MAX_ORDER - 1))) - prep_compound_gigantic_page(page, order); - else - prep_compound_page(page, order); -} - -/* Put bootmem huge pages into the standard lists after mem_map is up */ +/* + * Put bootmem huge pages into the standard lists after mem_map is up. + * Note: This only applies to gigantic (order > MAX_ORDER) pages. + */ static void __init gather_bootmem_prealloc(void) { struct huge_bootmem_page *m; @@ -2643,20 +2821,23 @@ static void __init gather_bootmem_prealloc(void) struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(page_count(page) != 1); - prep_compound_huge_page(page, huge_page_order(h)); - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* free it into the hugepage allocator */ + if (prep_compound_gigantic_page(page, huge_page_order(h))) { + WARN_ON(PageReserved(page)); + prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* add to the hugepage allocator */ + } else { + free_gigantic_page(page, huge_page_order(h)); + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + } /* - * If we had gigantic hugepages allocated at boot time, we need - * to restore the 'stolen' pages to totalram_pages in order to - * fix confusing memory reports from free(1) and another 
- * side-effects, like CommitLimit going negative. + * We need to restore the 'stolen' pages to totalram_pages + * in order to fix confusing memory reports from free(1) and + * other side-effects, like CommitLimit going negative. */ - if (hstate_is_gigantic(h)) - adjust_managed_page_count(page, pages_per_huge_page(h)); + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } } @@ -2834,6 +3015,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); + flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* @@ -2943,6 +3125,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); + flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { @@ -3450,6 +3633,7 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); + hugetlb_vmemmap_init(h); parsed_hstate = h; } @@ -3924,6 +4108,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable) { pte_t entry; + unsigned int shift = huge_page_shift(hstate_vma(vma)); if (writable) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, @@ -3934,7 +4119,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); - entry = arch_make_huge_pte(entry, vma, page, writable); + entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } @@ -4057,12 +4242,13 @@ again: is_hugetlb_entry_hwpoisoned(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); - if (is_write_migration_entry(swp_entry) && cow) { + if (is_writable_migration_entry(swp_entry) && cow) { /* * COW mappings require 
pages in both * parent and child to be set to read. */ - make_migration_entry_read(&swp_entry); + swp_entry = make_readable_migration_entry( + swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); set_huge_swap_pte_at(src, addr, src_pte, entry, sz); @@ -4939,20 +5125,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); - struct address_space *mapping; - pgoff_t idx; + struct hstate *h = hstate_vma(dst_vma); + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size; int vm_shared = dst_vma->vm_flags & VM_SHARED; - struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; - int ret; + int ret = -ENOMEM; struct page *page; int writable; - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - if (is_continue) { ret = -EFAULT; page = find_lock_page(mapping, idx); @@ -4981,12 +5164,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; + /* Free the allocated page which may have + * consumed a reservation. + */ + restore_reserve_on_error(h, dst_vma, dst_addr, page); + put_page(page); + + /* Allocate a temporary page to hold the copied + * contents. + */ + page = alloc_huge_page_vma(h, dst_vma, dst_addr); + if (!page) { + ret = -ENOMEM; + goto out; + } *pagep = page; - /* don't free the page */ + /* Set the outparam pagep and return to the caller to + * copy the contents outside the lock. Don't free the + * page. 
+ */ goto out; } } else { - page = *pagep; + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + put_page(*pagep); + ret = -EEXIST; + *pagep = NULL; + goto out; + } + + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) { + ret = -ENOMEM; + *pagep = NULL; + goto out; + } + copy_huge_page(page, *pagep); + put_page(*pagep); *pagep = NULL; } @@ -5318,10 +5533,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { pte_t newpte; - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); set_huge_swap_pte_at(mm, address, ptep, newpte, huge_page_size(h)); @@ -5332,10 +5548,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } if (!huge_pte_none(pte)) { pte_t old_pte; + unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); - pte = arch_make_huge_pte(pte, vma, NULL, 0); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c new file mode 100644 index 000000000000..c540c21e26f5 --- /dev/null +++ b/mm/hugetlb_vmemmap.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Free some vmemmap pages of HugeTLB + * + * Copyright (c) 2020, Bytedance. All rights reserved. + * + * Author: Muchun Song <songmuchun@bytedance.com> + * + * The struct page structures (page structs) are used to describe a physical + * page frame. By default, there is a one-to-one mapping from a page frame to + * it's corresponding page struct. 
+ * + * HugeTLB pages consist of multiple base page size pages and are supported by + * many architectures. See hugetlbpage.rst in the Documentation directory for + * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB + * are currently supported. Since the base page size on x86 is 4KB, a 2MB + * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of + * 262144 base pages. For each base page, there is a corresponding page struct. + * + * Within the HugeTLB subsystem, only the first 4 page structs are used to + * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides + * this upper limit. The only 'useful' information in the remaining page structs + * is the compound_head field, and this field is the same for all tail pages. + * + * By removing redundant page structs for HugeTLB pages, memory can be returned + * to the buddy allocator for other uses. + * + * Different architectures support different HugeTLB pages. For example, the + * following table lists the HugeTLB page sizes supported by the x86 and arm64 + * architectures. Because arm64 supports 4k, 16k, and 64k base pages and + * supports contiguous entries, it supports many different HugeTLB page + * sizes. 
+ * + * +--------------+-----------+-----------------------------------------------+ + * | Architecture | Page Size | HugeTLB Page Size | + * +--------------+-----------+-----------+-----------+-----------+-----------+ + * | x86-64 | 4KB | 2MB | 1GB | | | + * +--------------+-----------+-----------+-----------+-----------+-----------+ + * | | 4KB | 64KB | 2MB | 32MB | 1GB | + * | +-----------+-----------+-----------+-----------+-----------+ + * | arm64 | 16KB | 2MB | 32MB | 1GB | | + * | +-----------+-----------+-----------+-----------+-----------+ + * | | 64KB | 2MB | 512MB | 16GB | | + * +--------------+-----------+-----------+-----------+-----------+-----------+ + * + * When the system boots up, every HugeTLB page has more than one struct page + * struct, and their total size is (unit: pages): + * + * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + * + * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size + * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following + * relationship. + * + * HugeTLB_Size = n * PAGE_SIZE + * + * Then, + * + * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + * = n * sizeof(struct page) / PAGE_SIZE + * + * We can use huge mapping at the pud/pmd level for the HugeTLB page. + * + * For a HugeTLB page mapped at the pmd level, + * + * struct_size = n * sizeof(struct page) / PAGE_SIZE + * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE + * = sizeof(struct page) / sizeof(pte_t) + * = 64 / 8 + * = 8 (pages) + * + * Where n is the number of pte entries that one page can contain. So the value of + * n is (PAGE_SIZE / sizeof(pte_t)). + * + * This optimization only supports 64-bit systems, so the value of sizeof(pte_t) + * is 8. This optimization is also applicable only when the size of struct page + * is a power of two. In most cases, the size of struct page is 64 bytes (e.g. + * x86-64 and arm64). 
So if we use pmd level mapping for a HugeTLB page, its + * struct page structs occupy 8 page frames, the size of which depends on the + * size of the base page. + * + * For a HugeTLB page mapped at the pud level, + * + * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) + * = PAGE_SIZE / 8 * 8 (pages) + * = PAGE_SIZE (pages) + * + * Where the struct_size(pmd) is the size of the struct page structs of a + * HugeTLB page of the pmd level mapping. + * + * E.g.: on x86_64 the struct pages of a 2MB HugeTLB page occupy 8 page frames, + * while those of a 1GB HugeTLB page occupy 4096. + * + * Next, we take the pmd level mapping of the HugeTLB page as an example to + * show the internal implementation of this optimization. There are 8 pages of + * struct page structs associated with a HugeTLB page which is pmd mapped. + * + * Here is how things look before optimization. + * + * HugeTLB struct pages(8 pages) page frame(8 pages) + * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + * | | | 0 | -------------> | 0 | + * | | +-----------+ +-----------+ + * | | | 1 | -------------> | 1 | + * | | +-----------+ +-----------+ + * | | | 2 | -------------> | 2 | + * | | +-----------+ +-----------+ + * | | | 3 | -------------> | 3 | + * | | +-----------+ +-----------+ + * | | | 4 | -------------> | 4 | + * | PMD | +-----------+ +-----------+ + * | level | | 5 | -------------> | 5 | + * | mapping | +-----------+ +-----------+ + * | | | 6 | -------------> | 6 | + * | | +-----------+ +-----------+ + * | | | 7 | -------------> | 7 | + * | | +-----------+ +-----------+ + * | | + * | | + * | | + * +-----------+ + * + * The value of page->compound_head is the same for all tail pages. The first + * page of page structs (page 0) associated with the HugeTLB page contains the 4 + * page structs necessary to describe the HugeTLB. The only use of the remaining + * pages of page structs (page 1 to page 7) is to point to page->compound_head. 
+ * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * will be used for each HugeTLB page. This will allow us to free the remaining + * 6 pages to the buddy allocator. + * + * Here is how things look after remapping. + * + * HugeTLB struct pages(8 pages) page frame(8 pages) + * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + * | | | 0 | -------------> | 0 | + * | | +-----------+ +-----------+ + * | | | 1 | -------------> | 1 | + * | | +-----------+ +-----------+ + * | | | 2 | ----------------^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | + * | | | 3 | ------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | --------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | ----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | ------------------------+ | + * | | +-----------+ | + * | | | 7 | --------------------------+ + * | | +-----------+ + * | | + * | | + * | | + * +-----------+ + * + * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * vmemmap pages and restore the previous mapping relationship. + * + * For the HugeTLB page of the pud level mapping. It is similar to the former. + * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * + * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures + * (e.g. aarch64) provides a contiguous bit in the translation table entries + * that hints to the MMU to indicate that it is one of a contiguous set of + * entries that can be cached in a single TLB entry. + * + * The contiguous bit is used to increase the mapping size at the pmd and pte + * (last) level. So this type of HugeTLB page can be optimized only when its + * size of the struct page structs is greater than 2 pages. + */ +#define pr_fmt(fmt) "HugeTLB: " fmt + +#include "hugetlb_vmemmap.h" + +/* + * There are a lot of struct page structures associated with each HugeTLB page. 
+ * For tail pages, the value of compound_head is the same. So we can reuse first + * page of tail page structures. We map the virtual addresses of the remaining + * pages of tail page structures to the first tail page struct, and then free + * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + */ +#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) + +bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static int __init early_hugetlb_free_vmemmap_param(char *buf) +{ + /* We cannot optimize if a "struct page" crosses page boundaries. */ + if ((!is_power_of_2(sizeof(struct page)))) { + pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); + return 0; + } + + if (!buf) + return -EINVAL; + + if (!strcmp(buf, "on")) + hugetlb_free_vmemmap_enabled = true; + else if (!strcmp(buf, "off")) + hugetlb_free_vmemmap_enabled = false; + else + return -EINVAL; + + return 0; +} +early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); + +static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) +{ + return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; +} + +/* + * Previously discarded vmemmap pages will be allocated and remapping + * after this function returns zero. + */ +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +{ + int ret; + unsigned long vmemmap_addr = (unsigned long)head; + unsigned long vmemmap_end, vmemmap_reuse; + + if (!HPageVmemmapOptimized(head)) + return 0; + + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* + * The pages which the vmemmap virtual address range [@vmemmap_addr, + * @vmemmap_end) are mapped to are freed to the buddy allocator, and + * the range is mapped to the page which @vmemmap_reuse is mapped to. 
+ * When a HugeTLB page is freed to the buddy allocator, previously + * discarded vmemmap pages must be allocated and remapped. + */ + ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); + + if (!ret) + ClearHPageVmemmapOptimized(head); + + return ret; +} + +void free_huge_page_vmemmap(struct hstate *h, struct page *head) +{ + unsigned long vmemmap_addr = (unsigned long)head; + unsigned long vmemmap_end, vmemmap_reuse; + + if (!free_vmemmap_pages_per_hpage(h)) + return; + + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + + /* + * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) + * to the page which @vmemmap_reuse is mapped to, then free the pages + * which the range [@vmemmap_addr, @vmemmap_end) is mapped to. + */ + if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + SetHPageVmemmapOptimized(head); +} + +void __init hugetlb_vmemmap_init(struct hstate *h) +{ + unsigned int nr_pages = pages_per_huge_page(h); + unsigned int vmemmap_pages; + + /* + * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct + * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. + */ + BUILD_BUG_ON(__NR_USED_SUBPAGE >= + RESERVE_VMEMMAP_SIZE / sizeof(struct page)); + + if (!hugetlb_free_vmemmap_enabled) + return; + + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; + /* + * The head page and the first tail page are not to be freed to buddy + * allocator, the other pages will map to the first tail page, so they + * can be freed. + * + * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true + * on some architectures (e.g. aarch64). See Documentation/arm64/ + * hugetlbpage.rst for more details. 
+ */ + if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) + h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + + pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, + h->name); +} diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h new file mode 100644 index 000000000000..cb2bef8f9e73 --- /dev/null +++ b/mm/hugetlb_vmemmap.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Free some vmemmap pages of HugeTLB + * + * Copyright (c) 2020, Bytedance. All rights reserved. + * + * Author: Muchun Song <songmuchun@bytedance.com> + */ +#ifndef _LINUX_HUGETLB_VMEMMAP_H +#define _LINUX_HUGETLB_VMEMMAP_H +#include <linux/hugetlb.h> + +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); +void free_huge_page_vmemmap(struct hstate *h, struct page *head); +void hugetlb_vmemmap_init(struct hstate *h); + +/* + * How many vmemmap pages associated with a HugeTLB page that can be freed + * to the buddy allocator. 
+ */ +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +{ + return h->nr_free_vmemmap_pages; +} +#else +static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +{ + return 0; +} + +static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +{ +} + +static inline void hugetlb_vmemmap_init(struct hstate *h) +{ +} + +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +{ + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/internal.h b/mm/internal.h index 6ec2cea9926b..2d7c9a2e0118 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -274,11 +274,10 @@ isolate_freepages_range(struct compact_control *cc, int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); +#endif int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); -#endif - /* * This function returns the order of a free page in the buddy system. 
In * general, page_zone(page)->lock must be held by the caller to prevent the @@ -344,7 +343,10 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking); + unsigned long start, unsigned long end, int *locked); +extern long faultin_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + bool write, int *locked); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) @@ -369,23 +371,6 @@ extern unsigned int munlock_vma_page(struct page *page); */ extern void clear_page_mlock(struct page *page); -/* - * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() - * (because that does not go through the full procedure of migration ptes): - * to migrate the Mlocked page flag; update statistics. 
- */ -static inline void mlock_migrate_page(struct page *newpage, struct page *page) -{ - if (TestClearPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - /* Holding pmd lock, no change in irq context: __mod is safe */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - SetPageMlocked(newpage); - __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); - } -} - extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* @@ -461,7 +446,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } -static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } @@ -672,4 +656,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, void vunmap_range_noflush(unsigned long start, unsigned long end); +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int page_nid, int *flags); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 4d21ac44d5d3..d7666ace9d2e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -636,7 +636,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Disable static key and reset timer. 
*/ static_branch_disable(&kfence_allocation_key); #endif - queue_delayed_work(system_power_efficient_wq, &kfence_timer, + queue_delayed_work(system_unbound_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); @@ -666,7 +666,7 @@ void __init kfence_init(void) } WRITE_ONCE(kfence_enabled, true); - queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0); + queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6c0185fdd815..b0412be08fa2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -442,9 +442,7 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - /* Explicitly disabled through madvise. */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!transhuge_vma_enabled(vma, vm_flags)) return false; /* Enabled via shmem mount options or sysfs settings. */ @@ -459,7 +457,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, /* Read-only file mappings need to be aligned for THP to work. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && - (vm_flags & VM_DENYWRITE)) { + !inode_is_open_for_write(vma->vm_file->f_inode) && + (vm_flags & VM_EXEC)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -1864,6 +1863,19 @@ out_unlock: else { __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); + /* + * Paired with smp_mb() in do_dentry_open() to ensure + * i_writecount is up to date and the update to nr_thps is + * visible. Ensures the page cache will be truncated if the + * file is opened writable. 
+ */ + smp_mb(); + if (inode_is_open_for_write(mapping->host)) { + result = SCAN_FAIL; + __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + goto xa_locked; + } } if (nr_none) { diff --git a/mm/madvise.c b/mm/madvise.c index 63e489e5bfdb..6d3d348b17f4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -53,6 +53,8 @@ static int madvise_need_mmap_write(int behavior) case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -822,6 +824,61 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; } +static long madvise_populate(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) +{ + const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = vma->vm_mm; + unsigned long tmp_end; + int locked = 1; + long pages; + + *prev = vma; + + while (start < end) { + /* + * We might have temporarily dropped the lock. For example, + * our VMA might have been split. + */ + if (!vma || start >= vma->vm_end) { + vma = find_vma(mm, start); + if (!vma || start < vma->vm_start) + return -ENOMEM; + } + + tmp_end = min_t(unsigned long, end, vma->vm_end); + /* Populate (prefault) page tables readable/writable. */ + pages = faultin_vma_page_range(vma, start, tmp_end, write, + &locked); + if (!locked) { + mmap_read_lock(mm); + locked = 1; + *prev = NULL; + vma = NULL; + } + if (pages < 0) { + switch (pages) { + case -EINTR: + return -EINTR; + case -EFAULT: /* Incompatible mappings / permissions. */ + return -EINVAL; + case -EHWPOISON: + return -EHWPOISON; + default: + pr_warn_once("%s: unhandled return value: %ld\n", + __func__, pages); + fallthrough; + case -ENOMEM: + return -ENOMEM; + } + } + start += pages * PAGE_SIZE; + } + return 0; +} + /* * Application wants to free up the pages and associated backing store. 
* This is effectively punching a hole into the middle of a file. @@ -935,6 +992,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); default: return madvise_behavior(vma, prev, start, end, behavior); } @@ -955,6 +1015,8 @@ madvise_behavior_valid(int behavior) case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: @@ -1042,6 +1104,10 @@ process_madvise_behavior_valid(int behavior) * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. + * MADV_POPULATE_READ - populate (prefault) page tables readable by + * triggering read faults if required + * MADV_POPULATE_WRITE - populate (prefault) page tables writable by + * triggering write faults if required * * return values: * zero - success diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index b890854ec761..ea734f248fce 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -317,7 +317,7 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range); * pfn_mkwrite(). And then after a TLB flush following the write-protection * pick up all dirty bits. * - * Note: This function currently skips transhuge page-table entries, since + * This function currently skips transhuge page-table entries, since * it's intended for dirty-tracking on the PTE level. It will warn on * encountering transhuge dirty entries, though, and can easily be extended * to handle them as well. 
diff --git a/mm/memblock.c b/mm/memblock.c index 123feef5259d..3e4acbf03ab7 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -906,6 +906,11 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * + * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the + * direct mapping of the physical memory. These regions will still be + * covered by the memory map. The struct page representing NOMAP memory + * frames in the memory map will be PageReserved() + * * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) @@ -2002,6 +2007,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return end_pfn - start_pfn; } +static void __init memmap_init_reserved_pages(void) +{ + struct memblock_region *region; + phys_addr_t start, end; + u64 i; + + /* initialize struct pages for the reserved regions */ + for_each_reserved_mem_range(i, &start, &end) + reserve_bootmem_region(start, end); + + /* and also treat struct pages for the NOMAP regions as PageReserved */ + for_each_mem_region(region) { + if (memblock_is_nomap(region)) { + start = region->base; + end = start + region->size; + reserve_bootmem_region(start, end); + } + } +} + static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; @@ -2010,8 +2035,7 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_reserved_mem_range(i, &start, &end) - reserve_bootmem_region(start, end); + memmap_init_reserved_pages(); /* * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b80aae448a49..ae1f5d0cb581 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5537,7 +5537,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * as special swap entry in the CPU page table. 
*/ if (is_device_private_entry(ent)) { - page = device_private_entry_to_page(ent); + page = pfn_swap_entry_to_page(ent); /* * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have * a refcount of 1 when free (unlike normal page) @@ -6644,7 +6644,7 @@ static unsigned long effective_protection(unsigned long usage, } /** - * mem_cgroup_protected - check if memory consumption is in the normal range + * mem_cgroup_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e5a1531f7f4e..eefd823deb67 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -66,6 +66,19 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); +static bool __page_handle_poison(struct page *page) +{ + bool ret; + + zone_pcp_disable(page_zone(page)); + ret = dissolve_free_huge_page(page); + if (!ret) + ret = take_page_off_buddy(page); + zone_pcp_enable(page_zone(page)); + + return ret; +} + static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) { if (hugepage_or_freepage) { @@ -73,7 +86,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo * Doing this check for free pages is also fine since dissolve_free_huge_page * returns 0 for non-hugetlb pages as well. 
*/ - if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) + if (!__page_handle_poison(page)) /* * We could fail to take off the target page from buddy * for example due to racy page allocation, but that's @@ -985,7 +998,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) */ if (PageAnon(hpage)) put_page(hpage); - if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + if (__page_handle_poison(p)) { page_ref_inc(p); res = MF_RECOVERED; } @@ -1253,10 +1266,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); - bool unmap_success = true; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); @@ -1319,7 +1332,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); if (!PageHuge(hpage)) { - unmap_success = try_to_unmap(hpage, ttu); + try_to_unmap(hpage, ttu); } else { if (!PageAnon(hpage)) { /* @@ -1327,21 +1340,20 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * could potentially call huge_pmd_unshare. Because of * this, take semaphore in write mode here and set * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higer level. + * at this higher level. 
*/ mapping = hugetlb_page_mapping_lock_write(hpage); if (mapping) { - unmap_success = try_to_unmap(hpage, - ttu|TTU_RMAP_LOCKED); + try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); - } else { + } else pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); - unmap_success = false; - } } else { - unmap_success = try_to_unmap(hpage, ttu); + try_to_unmap(hpage, ttu); } } + + unmap_success = !page_mapped(hpage); if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -1446,7 +1458,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) } unlock_page(head); res = MF_FAILED; - if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + if (__page_handle_poison(p)) { page_ref_inc(p); res = MF_RECOVERED; } diff --git a/mm/memory.c b/mm/memory.c index 48c4576df898..747a01d495f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,6 +699,68 @@ out: } #endif +static void restore_exclusive_pte(struct vm_area_struct *vma, + struct page *page, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + swp_entry_t entry; + + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); + + entry = pte_to_swp_entry(*ptep); + if (pte_swp_uffd_wp(*ptep)) + pte = pte_mkuffd_wp(pte); + else if (is_writable_device_exclusive_entry(entry)) + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + + set_pte_at(vma->vm_mm, address, ptep, pte); + + /* + * No need to take a page reference as one was already + * created when the swap entry was made. + */ + if (PageAnon(page)) + page_add_anon_rmap(page, vma, address, false); + else + /* + * Currently device exclusive access only supports anonymous + * memory so the entry shouldn't point to a filebacked page. + */ + WARN_ON_ONCE(!PageAnon(page)); + + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(page); + + /* + * No need to invalidate - it was non-present before. 
However + * secondary CPUs may have mappings that need invalidating. + */ + update_mmu_cache(vma, address, ptep); +} + +/* + * Tries to restore an exclusive pte if the page lock can be acquired without + * sleeping. + */ +static int +try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr) +{ + swp_entry_t entry = pte_to_swp_entry(*src_pte); + struct page *page = pfn_swap_entry_to_page(entry); + + if (trylock_page(page)) { + restore_exclusive_pte(vma, page, addr, src_pte); + unlock_page(page); + return 0; + } + + return -EBUSY; +} + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -707,17 +769,17 @@ out: static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = vma->vm_flags; + unsigned long vm_flags = dst_vma->vm_flags; pte_t pte = *src_pte; struct page *page; swp_entry_t entry = pte_to_swp_entry(pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) - return entry.val; + return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { @@ -729,17 +791,18 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); rss[mm_counter(page)]++; - if (is_write_migration_entry(entry) && + if (is_writable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both * parent and child to be set to read. 
*/ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); @@ -748,7 +811,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { - page = device_private_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); /* * Update rss count even for unaddressable pages, as @@ -770,15 +833,29 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_write_device_private_entry(entry) && + if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { - make_device_private_entry_read(&entry); + entry = make_readable_device_private_entry( + swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_exclusive_entry(entry)) { + /* + * Make device exclusive entries present by restoring the + * original entry then copying as for a present pte. Device + * exclusive entries currently only support private writable + * (ie. COW) mappings. 
+ */ + VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); + if (try_restore_exclusive_pte(src_pte, src_vma, addr)) + return -EBUSY; + return -ENOENT; } + if (!userfaultfd_wp(dst_vma)) + pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } @@ -844,6 +921,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(new_page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + if (userfaultfd_pte_wp(dst_vma, *src_pte)) + /* Uffd-wp needs to be delivered to dest pte as well */ + pte = pte_wrprotect(pte_mkuffd_wp(pte)); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -893,12 +973,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte = pte_mkclean(pte); pte = pte_mkold(pte); - /* - * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA - * does not have the VM_UFFD_WP, which means that the uffd - * fork event is not enabled. - */ - if (!(vm_flags & VM_UFFD_WP)) + if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -971,13 +1046,25 @@ again: continue; } if (unlikely(!pte_present(*src_pte))) { - entry.val = copy_nonpresent_pte(dst_mm, src_mm, - dst_pte, src_pte, - src_vma, addr, rss); - if (entry.val) + ret = copy_nonpresent_pte(dst_mm, src_mm, + dst_pte, src_pte, + dst_vma, src_vma, + addr, rss); + if (ret == -EIO) { + entry = pte_to_swp_entry(*src_pte); break; - progress += 8; - continue; + } else if (ret == -EBUSY) { + break; + } else if (!ret) { + progress += 8; + continue; + } + + /* + * Device exclusive entry restored, continue by copying + * the now present pte. 
+ */ + WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, @@ -1008,20 +1095,26 @@ again: pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); - if (entry.val) { + if (ret == -EIO) { + VM_WARN_ON_ONCE(!entry.val); if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; - } else if (ret) { - WARN_ON_ONCE(ret != -EAGAIN); + } else if (ret == -EBUSY) { + goto out; + } else if (ret == -EAGAIN) { prealloc = page_copy_prealloc(src_mm, src_vma, addr); if (!prealloc) return -ENOMEM; - /* We've captured and resolved the error. Reset, try again. */ - ret = 0; + } else if (ret) { + VM_WARN_ON_ONCE(1); } + + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; + if (addr != end) goto again; out: @@ -1050,8 +1143,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); - err = copy_huge_pmd(dst_mm, src_mm, - dst_pmd, src_pmd, addr, src_vma); + err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, + addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1278,8 +1371,9 @@ again: } entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry)) { - struct page *page = device_private_entry_to_page(entry); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); if (unlikely(details && details->check_mapping)) { /* @@ -1294,7 +1388,10 @@ again: pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - page_remove_rmap(page, false); + + if (is_device_private_entry(entry)) + page_remove_rmap(page, false); + put_page(page); continue; } @@ -1308,7 +1405,7 @@ again: else if (is_migration_entry(entry)) { struct page *page; - page = migration_entry_to_page(entry); + page = 
pfn_swap_entry_to_page(entry); rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) @@ -3343,6 +3440,34 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* + * Restore a potential device exclusive pte to a working pte entry + */ +static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct vm_area_struct *vma = vmf->vma; + struct mmu_notifier_range range; + + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return VM_FAULT_RETRY; + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + vma->vm_mm, vmf->address & PAGE_MASK, + (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); + mmu_notifier_invalidate_range_start(&range); + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + restore_exclusive_pte(vma, page, vmf->address, vmf->pte); + + pte_unmap_unlock(vmf->pte, vmf->ptl); + unlock_page(page); + + mmu_notifier_invalidate_range_end(&range); + return 0; +} + +/* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. @@ -3370,8 +3495,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_exclusive_entry(entry)) { + vmf->page = pfn_swap_entry_to_page(entry); + ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { - vmf->page = device_private_entry_to_page(entry); + vmf->page = pfn_swap_entry_to_page(entry); ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; @@ -4025,9 +4153,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). 
*/ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } } ret = __do_fault(vmf); @@ -4172,9 +4302,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int page_nid, - int *flags) +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int page_nid, int *flags) { get_page(page); @@ -4295,12 +4424,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) } /* `inline' is required to avoid gcc 4.1.2 build error */ -static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) { - if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd)) + if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) return handle_userfault(vmf, VM_UFFD_WP); - return do_huge_pmd_wp_page(vmf, orig_pmd); + return do_huge_pmd_wp_page(vmf); } if (vmf->vma->vm_ops->huge_fault) { vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); @@ -4527,26 +4656,26 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *vmf.pmd; + vmf.orig_pmd = *vmf.pmd; barrier(); - if (unlikely(is_swap_pmd(orig_pmd))) { + if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); - if (is_pmd_migration_entry(orig_pmd)) + !is_pmd_migration_entry(vmf.orig_pmd)); + if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } - if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf, orig_pmd); + if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if 
(pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); - if (dirty && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&vmf, orig_pmd); + if (dirty && !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&vmf, orig_pmd); + huge_pmd_set_accessed(&vmf); return 0; } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 974a565797d8..8cb75b26ea4f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -154,122 +154,6 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) -{ - page->freelist = (void *)type; - SetPagePrivate(page); - set_page_private(page, info); - page_ref_inc(page); -} - -void put_page_bootmem(struct page *page) -{ - unsigned long type; - - type = (unsigned long) page->freelist; - BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || - type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - - if (page_ref_dec_return(page) == 1) { - page->freelist = NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - INIT_LIST_HEAD(&page->lru); - free_reserved_page(page); - } -} - -#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - /* Get section's memmap address */ - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - - /* - * Get page for the memmap's phys address - * XXX: need more consideration for sparse_vmemmap... 
- */ - page = virt_to_page(memmap); - mapsize = sizeof(struct page) * PAGES_PER_SECTION; - mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; - - /* remember memmap's page */ - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, SECTION_INFO); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); - -} -#else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - - register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); -} -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ - -void __init register_page_bootmem_info_node(struct pglist_data *pgdat) -{ - unsigned long i, pfn, end_pfn, nr_pages; - int node = pgdat->node_id; - struct page *page; - - nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; - page = virt_to_page(pgdat); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - - pfn = pgdat->node_start_pfn; - end_pfn = pgdat_end_pfn(pgdat); - - /* register section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - /* - * Some platforms can assign the same pfn to multiple nodes - on - * node0 as well as nodeN. To avoid registering a pfn against - * multiple nodes we check that this pfn does not already - * reside in some other nodes. 
- */ - if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) - register_page_bootmem_info_section(pfn); - } -} -#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ - static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { @@ -445,7 +329,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long pfn; int nid = zone_to_nid(zone); - zone_span_writelock(zone); if (zone->zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need @@ -478,7 +361,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, zone->spanned_pages = 0; } } - zone_span_writeunlock(zone); } static void update_pgdat_span(struct pglist_data *pgdat) @@ -515,7 +397,7 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, { const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long pfn, cur_nr_pages, flags; + unsigned long pfn, cur_nr_pages; /* Poison struct pages because they are now uninitialized again. */ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { @@ -540,10 +422,8 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, clear_zone_contiguous(zone); - pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); - pgdat_resize_unlock(zone->zone_pgdat, &flags); set_zone_contiguous(zone); } @@ -750,19 +630,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; - unsigned long flags; clear_zone_contiguous(zone); - /* TODO Huh pgdat is irqsave while zone is not. 
It used to be like that before */ - pgdat_resize_lock(pgdat, &flags); - zone_span_writelock(zone); if (zone_is_empty(zone)) init_currently_empty_zone(zone, start_pfn, nr_pages); resize_zone_range(zone, start_pfn, nr_pages); - zone_span_writeunlock(zone); resize_pgdat_range(pgdat, start_pfn, nr_pages); - pgdat_resize_unlock(pgdat, &flags); /* * Subsection population requires care in pfn_to_online_page(). @@ -852,12 +726,8 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, */ void adjust_present_page_count(struct zone *zone, long nr_pages) { - unsigned long flags; - zone->present_pages += nr_pages; - pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, @@ -913,7 +783,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z /* * {on,off}lining is constrained to full memory sections (or more - * precisly to memory blocks from the user space POV). + * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned. @@ -1072,8 +942,8 @@ static void rollback_node_hotadd(int nid) } -/** - * try_online_node - online a node if offlined +/* + * __try_online_node - online a node if offlined * @nid: the node ID * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. @@ -1172,6 +1042,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. 
*/ return memmap_on_memory && + !hugetlb_free_vmemmap_enabled && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && @@ -1521,6 +1392,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) struct page *page, *head; int ret = 0; LIST_HEAD(source); + static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!pfn_valid(pfn)) @@ -1567,8 +1440,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_lru(page)); } else { - pr_warn("failed to isolate pfn %lx\n", pfn); - dump_page(page, "isolation failed"); + if (__ratelimit(&migrate_rs)) { + pr_warn("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); + } } put_page(page); } @@ -1597,9 +1472,11 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) { list_for_each_entry(page, &source, lru) { - pr_warn("migrating pfn %lx failed ret:%d ", - page_to_pfn(page), ret); - dump_page(page, "migration failure"); + if (__ratelimit(&migrate_rs)) { + pr_warn("migrating pfn %lx failed ret:%d\n", + page_to_pfn(page), ret); + dump_page(page, "migration failure"); + } } putback_movable_pages(&source); } @@ -1703,7 +1580,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* * {on,off}lining is constrained to full memory sections (or more - * precisly to memory blocks from the user space POV). + * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned. 
@@ -2031,7 +1908,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) } /** - * remove_memory + * __remove_memory - Remove memory if every memory block is offline * @nid: the node ID * @start: physical address of the region to remove * @size: size of the region to remove diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b5d95bf1025d..e32360e90274 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -121,8 +121,7 @@ enum zone_type policy_zone = 0; */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ - .mode = MPOL_PREFERRED, - .flags = MPOL_F_LOCAL, + .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; @@ -194,18 +193,17 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; - pol->v.nodes = *nodes; + pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - if (!nodes) - pol->flags |= MPOL_F_LOCAL; /* local allocation */ - else if (nodes_empty(*nodes)) - return -EINVAL; /* no allowed nodes */ - else - pol->v.preferred_node = first_node(*nodes); + if (nodes_empty(*nodes)) + return -EINVAL; + + nodes_clear(pol->nodes); + node_set(first_node(*nodes), pol->nodes); return 0; } @@ -213,15 +211,14 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; - pol->v.nodes = *nodes; + pol->nodes = *nodes; return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes - * parameter with respect to the policy mode and flags. But, we need to - * handle an empty nodemask with MPOL_PREFERRED here. + * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. 
@@ -231,33 +228,31 @@ static int mpol_set_nodemask(struct mempolicy *pol, { int ret; - /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ - if (pol == NULL) + /* + * Default (pol==NULL) resp. local memory policies are not a + * subject of any remapping. They also do not need any special + * constructor. + */ + if (!pol || pol->mode == MPOL_LOCAL) return 0; + /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); - if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) - nodes = NULL; /* explicit local allocation */ - else { - if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); - else - nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (mpol_store_user_nodemask(pol)) - pol->w.user_nodemask = *nodes; - else - pol->w.cpuset_mems_allowed = - cpuset_current_mems_allowed; - } + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); - if (nodes) - ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; else - ret = mpol_ops[pol->mode].create(pol, NULL); + pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; + + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } @@ -290,13 +285,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); + + mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); - mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -330,7 +326,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else if (pol->flags & 
MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, + nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } @@ -338,31 +334,13 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) if (nodes_empty(tmp)) tmp = *nodes; - pol->v.nodes = tmp; + pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { - nodemask_t tmp; - - if (pol->flags & MPOL_F_STATIC_NODES) { - int node = first_node(pol->w.user_nodemask); - - if (node_isset(node, *nodes)) { - pol->v.preferred_node = node; - pol->flags &= ~MPOL_F_LOCAL; - } else - pol->flags |= MPOL_F_LOCAL; - } else if (pol->flags & MPOL_F_RELATIVE_NODES) { - mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); - pol->v.preferred_node = first_node(tmp); - } else if (!(pol->flags & MPOL_F_LOCAL)) { - pol->v.preferred_node = node_remap(pol->v.preferred_node, - pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; - } + pol->w.cpuset_mems_allowed = *nodes; } /* @@ -376,7 +354,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -427,6 +405,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_bind, .rebind = mpol_rebind_nodemask, }, + [MPOL_LOCAL] = { + .rebind = mpol_rebind_default, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -458,7 +439,8 @@ static inline bool queue_pages_required(struct page *page, /* * queue_pages_pmd() has four possible return values: - * 0 - pages are placed on the right node or queued successfully. 
+ * 0 - pages are placed on the right node or queued successfully, or + * special page is met, i.e. huge zero page. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 2 - THP was split. @@ -482,8 +464,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - __split_huge_pmd(walk->vma, pmd, addr, false, NULL); - ret = 2; + walk->action = ACTION_CONTINUE; goto out; } if (!queue_pages_required(page, qp)) @@ -510,7 +491,8 @@ out: * and move them to the pagelist if they do. * * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully. + * 0 - pages are placed on the right node or queued successfully, or + * special page is met, i.e. zero page. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - only MPOL_MF_STRICT was specified and an existing page was already @@ -917,12 +899,11 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: - *nodes = p->v.nodes; - break; case MPOL_PREFERRED: - if (!(p->flags & MPOL_F_LOCAL)) - node_set(p->v.preferred_node, *nodes); - /* else return empty node mask for local allocation */ + *nodes = p->nodes; + break; + case MPOL_LOCAL: + /* return empty node mask for local allocation */ break; default: BUG(); @@ -1007,7 +988,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = next_node_in(current->il_prev, pol->v.nodes); + *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; @@ -1460,26 +1441,38 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } +/* Basic parameter sanity check used by both mbind() and set_mempolicy() */ +static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) +{ + *flags = *mode & MPOL_MODE_FLAGS; + *mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)(*mode) >= MPOL_MAX) + return -EINVAL; + if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + + return 0; +} + static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { + unsigned short mode_flags; nodemask_t nodes; + int lmode = mode; int err; - unsigned short mode_flags; start = untagged_addr(start); - mode_flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if (mode >= MPOL_MAX) - return -EINVAL; - if ((mode_flags & MPOL_F_STATIC_NODES) && - (mode_flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, mode_flags, &nodes, flags); + + return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, @@ -1493,20 +1486,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { - int err; + unsigned short mode_flags; nodemask_t nodes; - unsigned short flags; + int lmode = mode; + int err; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; - flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)mode >= MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) - return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, flags, &nodes); + + return do_set_mempolicy(lmode, mode_flags, 
&nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, @@ -1863,14 +1856,14 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* - * if policy->v.nodes has movable memory only, + * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * - * policy->v.nodes is intersect with node_states[N_MEMORY]. + * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies - * policy->v.nodes has movable memory only. + * policy->nodes has movable memory only. */ - if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) + if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; @@ -1885,8 +1878,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) - return &policy->v.nodes; + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; return NULL; } @@ -1894,9 +1887,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) /* Return the node id preferred by the given mempolicy, or the given id */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { - if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - else { + if (policy->mode == MPOL_PREFERRED) { + nd = first_node(policy->nodes); + } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the @@ -1914,7 +1907,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) unsigned next; struct task_struct *me = current; - next = next_node_in(me->il_prev, 
policy->v.nodes); + next = next_node_in(me->il_prev, policy->nodes); if (next < MAX_NUMNODES) me->il_prev = next; return next; @@ -1933,15 +1926,12 @@ unsigned int mempolicy_slab_node(void) return node; policy = current->mempolicy; - if (!policy || policy->flags & MPOL_F_LOCAL) + if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: - /* - * handled MPOL_F_LOCAL above - */ - return policy->v.preferred_node; + return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); @@ -1957,9 +1947,11 @@ unsigned int mempolicy_slab_node(void) enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes); + &policy->nodes); return z->zone ? zone_to_nid(z->zone) : node; } + case MPOL_LOCAL: + return node; default: BUG(); @@ -1968,12 +1960,12 @@ unsigned int mempolicy_slab_node(void) /* * Do static interleaving for a VMA with known offset @n. Returns the n'th - * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the + * node in pol->nodes (starting from n=0), wrapping around if n exceeds the * number of present nodes. 
*/ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - unsigned nnodes = nodes_weight(pol->v.nodes); + unsigned nnodes = nodes_weight(pol->nodes); unsigned target; int i; int nid; @@ -1981,9 +1973,9 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; - nid = first_node(pol->v.nodes); + nid = first_node(pol->nodes); for (i = 0; i < target; i++) - nid = next_node(nid, pol->v.nodes); + nid = next_node(nid, pol->nodes); return nid; } @@ -2039,7 +2031,7 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) - *nodemask = &(*mpol)->v.nodes; + *nodemask = &(*mpol)->nodes; } return nid; } @@ -2063,7 +2055,6 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; - int nid; if (!(mask && current->mempolicy)) return false; @@ -2072,16 +2063,13 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: - if (mempolicy->flags & MPOL_F_LOCAL) - nid = numa_node_id(); - else - nid = mempolicy->v.preferred_node; - init_nodemask_of_node(mask, nid); - break; - case MPOL_BIND: case MPOL_INTERLEAVE: - *mask = mempolicy->v.nodes; + *mask = mempolicy->nodes; + break; + + case MPOL_LOCAL: + init_nodemask_of_node(mask, numa_node_id()); break; default: @@ -2094,16 +2082,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) #endif /* - * mempolicy_nodemask_intersects + * mempolicy_in_oom_domain * - * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default - * policy. Otherwise, check for intersection between mask and the policy - * nodemask for 'bind' or 'interleave' policy. 
For 'preferred' or 'local' - * policy, always return true since it may allocate elsewhere on fallback. + * If tsk's mempolicy is "bind", check for intersection between mask and + * the policy nodemask. Otherwise, return true for all other policies + * including "interleave", as a tsk with "interleave" policy may have + * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ -bool mempolicy_nodemask_intersects(struct task_struct *tsk, +bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; @@ -2111,29 +2099,13 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk, if (!mask) return ret; + task_lock(tsk); mempolicy = tsk->mempolicy; - if (!mempolicy) - goto out; - - switch (mempolicy->mode) { - case MPOL_PREFERRED: - /* - * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to - * allocate from, they may fallback to other nodes when oom. - * Thus, it's possible for tsk to have allocated memory from - * nodes in mask. - */ - break; - case MPOL_BIND: - case MPOL_INTERLEAVE: - ret = nodes_intersects(mempolicy->v.nodes, *mask); - break; - default: - BUG(); - } -out: + if (mempolicy && mempolicy->mode == MPOL_BIND) + ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); + return ret; } @@ -2204,8 +2176,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. 
*/ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; + if (pol->mode == MPOL_PREFERRED) + hpage_node = first_node(pol->nodes); nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { @@ -2338,12 +2310,10 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: - return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - /* a's ->flags is the same as b's */ - if (a->flags & MPOL_F_LOCAL) - return true; - return a->v.preferred_node == b->v.preferred_node; + return !!nodes_equal(a->nodes, b->nodes); + case MPOL_LOCAL: + return true; default: BUG(); return false; @@ -2481,16 +2451,17 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; + polnid = first_node(pol->nodes); + break; + + case MPOL_LOCAL: + polnid = numa_node_id(); break; case MPOL_BIND: /* Optimize placement among multiple nodes via NUMA balancing */ if (pol->flags & MPOL_F_MORON) { - if (node_isset(thisnid, pol->v.nodes)) + if (node_isset(thisnid, pol->nodes)) break; goto out; } @@ -2501,12 +2472,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ - if (node_isset(curnid, pol->v.nodes)) + if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes); + &pol->nodes); polnid = zone_to_nid(z->zone); break; @@ -2709,7 +2680,7 @@ int mpol_set_shared_policy(struct shared_policy *info, vma->vm_pgoff, sz, npol ? npol->mode : -1, npol ? npol->flags : -1, - npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); + npol ? 
nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -2807,7 +2778,7 @@ void __init numa_policy_init(void) .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, - .v = { .preferred_node = nid, }, + .nodes = nodemask_of_node(nid), }; } @@ -2851,9 +2822,6 @@ void numa_default_policy(void) * Parse and format mempolicy from/to strings */ -/* - * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. - */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2931,7 +2899,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) */ if (nodelist) goto out; - mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* @@ -2970,12 +2937,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ - if (mode != MPOL_PREFERRED) - new->v.nodes = nodes; - else if (nodelist) - new->v.preferred_node = first_node(nodes); - else - new->flags |= MPOL_F_LOCAL; + if (mode != MPOL_PREFERRED) { + new->nodes = nodes; + } else if (nodelist) { + nodes_clear(new->nodes); + node_set(first_node(nodes), new->nodes); + } else { + new->mode = MPOL_LOCAL; + } /* * Save nodes for contextualization: this will be used to "clone" @@ -3021,16 +2990,12 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) switch (mode) { case MPOL_DEFAULT: + case MPOL_LOCAL: break; case MPOL_PREFERRED: - if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; - else - node_set(pol->v.preferred_node, nodes); - break; case MPOL_BIND: case MPOL_INTERLEAVE: - nodes = pol->v.nodes; + nodes = pol->nodes; break; default: WARN_ON_ONCE(1); diff --git a/mm/migrate.c b/mm/migrate.c index 380ca57b9031..23cbd9de030b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -210,13 +210,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, * Recheck 
VMA as permissions can change since migration started */ entry = pte_to_swp_entry(*pvmw.pte); - if (is_write_migration_entry(entry)) + if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); if (unlikely(is_device_private_page(new))) { - entry = make_device_private_entry(new, pte_write(pte)); + if (pte_write(pte)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_swp_mksoft_dirty(pte); @@ -226,8 +231,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { + unsigned int shift = huge_page_shift(hstate_vma(vma)); + pte = pte_mkhuge(pte); - pte = arch_make_huge_pte(pte, vma, new, 0); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); @@ -294,7 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); page = compound_head(page); /* @@ -335,7 +342,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); @@ -551,7 +558,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src, } } -static void copy_huge_page(struct page *dst, struct page *src) +void copy_huge_page(struct page *dst, struct page *src) { int i; int nr_pages; @@ -626,7 +633,10 @@ void migrate_page_states(struct page *newpage, struct page *page) if 
(PageSwapCache(page)) ClearPageSwapCache(page); ClearPagePrivate(page); - set_page_private(page, 0); + + /* page->private contains hugetlb specific flags */ + if (!PageHuge(page)) + set_page_private(page, 0); /* * If any waiters have accumulated on the new page then @@ -1099,7 +1109,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); + try_to_migrate(page, 0); page_was_mapped = 1; } @@ -1288,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * page_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (page_private(hpage) && !page_mapping(hpage)) { + if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) { rc = -EBUSY; goto out_unlock; } @@ -1301,7 +1311,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_mapped(hpage)) { bool mapping_locked = false; - enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; + enum ttu_flags ttu = 0; if (!PageAnon(hpage)) { /* @@ -1318,7 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, ttu |= TTU_RMAP_LOCKED; } - try_to_unmap(hpage, ttu); + try_to_migrate(hpage, ttu); page_was_mapped = 1; if (mapping_locked) @@ -1418,6 +1428,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + bool nosplit = (reason == MR_NUMA_MISPLACED); trace_mm_migrate_pages_start(mode, reason); @@ -1489,8 +1500,9 @@ retry: /* * When memory is low, don't bother to try to migrate * other pages, just exit. + * THP NUMA faulting doesn't split THP to retry. 
*/ - if (is_thp) { + if (is_thp && !nosplit) { if (!try_split_thp(page, &page2, from)) { nr_thp_split++; goto retry; @@ -2043,12 +2055,33 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } +static struct page *alloc_misplaced_dst_page_thp(struct page *page, + unsigned long data) +{ + int nid = (int) data; + struct page *newpage; + + newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), + HPAGE_PMD_ORDER); + if (!newpage) + goto out; + + prep_transhuge_page(newpage); + +out: + return newpage; +} + static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + /* Do not migrate THP mapped by multiple processes */ + if (PageTransHuge(page) && total_mapcount(page) > 1) + return 0; + /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) return 0; @@ -2056,18 +2089,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (isolate_lru_page(page)) return 0; - /* - * migrate_misplaced_transhuge_page() skips page migration's usual - * check on page_count(), so we must do it here, now that the page - * has been isolated: a GUP pin, or any other pin, prevents migration. - * The expected page count is 3: 1 for page's mapcount and 1 for the - * caller's pin and 1 for the reference taken by isolate_lru_page(). - */ - if (PageTransHuge(page) && page_count(page) != 3) { - putback_lru_page(page); - return 0; - } - page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, thp_nr_pages(page)); @@ -2081,12 +2102,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 1; } -bool pmd_trans_migrating(pmd_t pmd) -{ - struct page *page = pmd_page(pmd); - return PageLocked(page); -} - /* * Attempt to migrate a misplaced page to the specified destination * node. 
Caller is expected to have an elevated reference count on @@ -2099,6 +2114,21 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int isolated; int nr_remaining; LIST_HEAD(migratepages); + new_page_t *new; + bool compound; + unsigned int nr_pages = thp_nr_pages(page); + + /* + * PTE mapped THP or HugeTLB page can't reach here so the page could + * be either base page or THP. And it must be head page if it is + * THP. + */ + compound = PageTransHuge(page); + + if (compound) + new = alloc_misplaced_dst_page_thp; + else + new = alloc_misplaced_dst_page; /* * Don't migrate file pages that are mapped in multiple processes @@ -2120,19 +2150,18 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - NULL, node, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); + nr_remaining = migrate_pages(&migratepages, *new, NULL, node, + MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_lru(page), -nr_pages); putback_lru_page(page); } isolated = 0; } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); + count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages); BUG_ON(!list_empty(&migratepages)); return isolated; @@ -2141,141 +2170,6 @@ out: return 0; } #endif /* CONFIG_NUMA_BALANCING */ - -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -/* - * Migrates a THP to a given target node. page must be locked and is unlocked - * before returning. 
- */ -int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node) -{ - spinlock_t *ptl; - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - struct page *new_page = NULL; - int page_lru = page_is_file_lru(page); - unsigned long start = address & HPAGE_PMD_MASK; - - new_page = alloc_pages_node(node, - (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), - HPAGE_PMD_ORDER); - if (!new_page) - goto out_fail; - prep_transhuge_page(new_page); - - isolated = numamigrate_isolate_page(pgdat, page); - if (!isolated) { - put_page(new_page); - goto out_fail; - } - - /* Prepare a page as a migration target */ - __SetPageLocked(new_page); - if (PageSwapBacked(page)) - __SetPageSwapBacked(new_page); - - /* anon mapping, we can simply copy page->mapping to the new page: */ - new_page->mapping = page->mapping; - new_page->index = page->index; - /* flush the cache before copying using the kernel virtual address */ - flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); - migrate_page_copy(new_page, page); - WARN_ON(PageLRU(new_page)); - - /* Recheck the target PMD */ - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { - spin_unlock(ptl); - - /* Reverse changes made by migrate_page_copy() */ - if (TestClearPageActive(new_page)) - SetPageActive(page); - if (TestClearPageUnevictable(new_page)) - SetPageUnevictable(page); - - unlock_page(new_page); - put_page(new_page); /* Free it */ - - /* Retake the callers reference and putback on LRU */ - get_page(page); - putback_lru_page(page); - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); - - goto out_unlock; - } - - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - - /* - * Overwrite the old entry under pagetable lock and establish - * the new PTE. 
Any parallel GUP will either observe the old - * page blocking on the page lock, block on the page table - * lock or observe the new page. The SetPageUptodate on the - * new page and page_add_new_anon_rmap guarantee the copy is - * visible before the pagetable update. - */ - page_add_anon_rmap(new_page, vma, start, true); - /* - * At this point the pmd is numa/protnone (i.e. non present) and the TLB - * has already been flushed globally. So no TLB can be currently - * caching this non present pmd mapping. There's no need to clear the - * pmd before doing set_pmd_at(), nor to flush the TLB after - * set_pmd_at(). Clearing the pmd here would introduce a race - * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the - * mmap_lock for reading. If the pmd is set to NULL at any given time, - * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this - * pmd. - */ - set_pmd_at(mm, start, pmd, entry); - update_mmu_cache_pmd(vma, address, &entry); - - page_ref_unfreeze(page, 2); - mlock_migrate_page(new_page, page); - page_remove_rmap(page, true); - set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); - - spin_unlock(ptl); - - /* Take an "isolate" reference and put new page on the LRU. 
*/ - get_page(new_page); - putback_lru_page(new_page); - - unlock_page(new_page); - unlock_page(page); - put_page(page); /* Drop the rmap reference */ - put_page(page); /* Drop the LRU isolation reference */ - - count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); - count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_lru, - -HPAGE_PMD_NR); - return isolated; - -out_fail: - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - ptl = pmd_lock(mm, pmd); - if (pmd_same(*pmd, entry)) { - entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, start, pmd, entry); - update_mmu_cache_pmd(vma, address, &entry); - } - spin_unlock(ptl); - -out_unlock: - unlock_page(page); - put_page(page); - return 0; -} -#endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE @@ -2400,7 +2294,7 @@ again: if (!is_device_private_entry(entry)) goto next; - page = device_private_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || page->pgmap->owner != migrate->pgmap_owner) @@ -2408,7 +2302,7 @@ again: mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - if (is_write_device_private_entry(entry)) + if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) @@ -2454,8 +2348,12 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, mpfn & - MIGRATE_PFN_WRITE); + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); if (pte_present(pte)) { if (pte_soft_dirty(pte)) @@ -2518,8 +2416,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * that the registered device driver can skip invalidating device * private page 
mappings that won't be migrated. */ - mmu_notifier_range_init_migrate(&range, 0, migrate->vma, - migrate->vma->vm_mm, migrate->start, migrate->end, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); @@ -2704,7 +2602,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) */ static void migrate_vma_unmap(struct migrate_vma *migrate) { - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; unsigned long addr, i, restore = 0; @@ -2716,7 +2613,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) continue; if (page_mapped(page)) { - try_to_unmap(page, flags); + try_to_migrate(page, 0); if (page_mapped(page)) goto restore; } @@ -2928,7 +2825,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (is_device_private_page(page)) { swp_entry_t swp_entry; - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { /* @@ -3025,9 +2927,9 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (!notified) { notified = true; - mmu_notifier_range_init_migrate(&range, 0, - migrate->vma, migrate->vma->vm_mm, - addr, migrate->end, + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mlock.c b/mm/mlock.c index e338ebc4ad29..0d639bf48794 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -108,7 +108,7 @@ void mlock_vma_page(struct page *page) /* * Finish munlock after successful page isolation * - * Page must be locked. 
This is a wrapper for try_to_munlock() + * Page must be locked. This is a wrapper for page_mlock() * and putback_lru_page() with munlock accounting. */ static void __munlock_isolated_page(struct page *page) @@ -118,7 +118,7 @@ static void __munlock_isolated_page(struct page *page) * and we don't need to check all the other vmas. */ if (page_mapcount(page) > 1) - try_to_munlock(page); + page_mlock(page); /* Did try_to_unlock() succeed or punt? */ if (!PageMlocked(page)) @@ -158,7 +158,7 @@ static void __munlock_isolation_failed(struct page *page) * munlock()ed or munmap()ed, we want to check whether other vmas hold the * page locked so that we can leave it on the unevictable lru list and not * bother vmscan with it. However, to walk the page's rmap list in - * try_to_munlock() we must isolate the page from the LRU. If some other + * page_mlock() we must isolate the page from the LRU. If some other * task has removed the page from the LRU, we won't be able to do that. * So we clear the PageMlocked as we might not get another chance. If we * can't isolate the page, we leave it for putback_lru_page() and vmscan @@ -168,7 +168,7 @@ unsigned int munlock_vma_page(struct page *page) { int nr_pages; - /* For try_to_munlock() and to serialize with page migration */ + /* For page_mlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); VM_BUG_ON_PAGE(PageTail(page), page); @@ -205,7 +205,7 @@ static int __mlock_posix_error_return(long retval) * * The fast path is available only for evictable pages with single mapping. * Then we can bypass the per-cpu pvec and get better performance. - * when mapcount > 1 we need try_to_munlock() which can fail. + * when mapcount > 1 we need page_mlock() which can fail. * when !page_evictable(), we need the full redo logic of putback_lru_page to * avoid leaving evictable page in unevictable list. 
* @@ -414,7 +414,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, * * We don't save and restore VM_LOCKED here because pages are * still on lru. In unmap path, pages might be scanned by reclaim - * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * and re-mlocked by page_mlock/try_to_unmap before we unmap and * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 2ae3f33b85b1..f5852a058ce0 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -153,6 +153,37 @@ static inline void put_memcg_path_buf(void) rcu_read_unlock(); } +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_TRACING +#ifdef CONFIG_MEMCG /* * Write the given mm_struct's memcg path to a percpu buffer, and return a * pointer to it. If the path cannot be determined, or no buffer was available @@ -187,33 +218,6 @@ out: return buf; } -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - const char *memcg_path; \ - local_lock(&memcg_paths.lock); \ - memcg_path = get_mm_memcg_path(mm); \ - trace_mmap_lock_##type(mm, \ - memcg_path != NULL ? 
memcg_path : "", \ - ##__VA_ARGS__); \ - if (likely(memcg_path != NULL)) \ - put_memcg_path_buf(); \ - local_unlock(&memcg_paths.lock); \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -int trace_mmap_lock_reg(void) -{ - return 0; -} - -void trace_mmap_lock_unreg(void) -{ -} - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - #endif /* CONFIG_MEMCG */ /* @@ -239,3 +243,4 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) TRACE_MMAP_LOCK_EVENT(released, mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); +#endif /* CONFIG_TRACING */ diff --git a/mm/mprotect.c b/mm/mprotect.c index e7a443157988..883e2cc85cad 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -143,26 +143,36 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so * just be safe and disable write */ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_write_device_private_entry(entry)) { + } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. 
*/ - make_device_private_entry_read(&entry); + entry = make_readable_device_private_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_writable_device_exclusive_entry(entry)) { + entry = make_readable_device_exclusive_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); } else { newpte = oldpte; } diff --git a/mm/nommu.c b/mm/nommu.c index affda71641ca..3a93d4054810 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -223,7 +223,7 @@ long vread(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -241,7 +241,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1501,7 +1501,6 @@ erase_whole_vma: delete_vma(mm, vma); return 0; } -EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long addr, size_t len) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eefd3f5fde46..fcc29e9a3064 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -104,7 +104,7 @@ static bool oom_cpuset_eligible(struct task_struct *start, * mempolicy intersects current, otherwise it may be * needlessly killed. 
*/ - ret = mempolicy_nodemask_intersects(tsk, mask); + ret = mempolicy_in_oom_domain(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0817d88383d5..d6e94cc8066c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -749,7 +749,6 @@ void prep_compound_page(struct page *page, unsigned int order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - set_page_count(p, 0); p->mapping = TAIL_MAPPING; set_compound_head(p, page); } @@ -3193,7 +3192,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) int cpu; /* - * Allocate in the BSS so we wont require allocation in + * Allocate in the BSS so we won't require allocation in * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y */ static cpumask_t cpus_with_pcps; @@ -3832,7 +3831,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ -noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4435311754b..f7b331081791 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -41,7 +41,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) /* Handle un-addressable ZONE_DEVICE memory */ entry = pte_to_swp_entry(*pvmw->pte); - if (!is_device_private_entry(entry)) + if (!is_device_private_entry(entry) && + !is_device_exclusive_entry(entry)) return false; } else if (!pte_present(*pvmw->pte)) return false; @@ -93,19 +94,21 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) return false; entry = pte_to_swp_entry(*pvmw->pte); - if (!is_migration_entry(entry)) + if (!is_migration_entry(entry) && + !is_device_exclusive_entry(entry)) return false; - pfn = migration_entry_to_pfn(entry); + pfn = swp_offset(entry); } 
else if (is_swap_pte(*pvmw->pte)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ entry = pte_to_swp_entry(*pvmw->pte); - if (!is_device_private_entry(entry)) + if (!is_device_private_entry(entry) && + !is_device_exclusive_entry(entry)) return false; - pfn = device_private_entry_to_pfn(entry); + pfn = swp_offset(entry); } else { if (!pte_present(*pvmw->pte)) return false; @@ -233,7 +236,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - migration_entry_to_page(entry) != page) + pfn_swap_entry_to_page(entry) != page) return not_found(pvmw); return true; } diff --git a/mm/rmap.c b/mm/rmap.c index e05c300048e6..37c24672125c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1405,24 +1405,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and page_remove_rmap(), - * try_to_unmap() may return false when it is about to become true, + * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; - /* munlock has nothing to gain from examining un-locked vmas */ - if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return true; - - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_zone_device_page(page) && !is_device_private_page(page)) - return true; - - if (flags & TTU_SPLIT_HUGE_PMD) { - split_huge_pmd_address(vma, address, - flags & TTU_SPLIT_FREEZE, page); - } + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address, false, page); /* * For THP, we have to assume the worse case ie pmd for invalidation. 
@@ -1447,16 +1437,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - /* PMD-mapped THP migration entry */ - if (!pvmw.pte && (flags & TTU_MIGRATION)) { - VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); - - set_pmd_migration_entry(&pvmw, page); - continue; - } -#endif - /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced @@ -1476,8 +1456,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } - if (flags & TTU_MUNLOCK) - continue; } /* Unexpected PMD-mapped THP? */ @@ -1520,46 +1498,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } } - if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION) && - is_zone_device_page(page)) { - swp_entry_t entry; - pte_t swp_pte; - - pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); - - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - entry = make_migration_entry(page, 0); - swp_pte = swp_entry_to_pte(entry); - - /* - * pteval maps a zone device page and is therefore - * a swap pte. - */ - if (pte_swp_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_swp_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); - /* - * No need to invalidate here it will synchronize on - * against the special swap migration pte. - * - * The assignment to subpage above was computed from a - * swap PTE which results in an invalid pointer. - * Since only PAGE_SIZE pages can currently be - * migrated, just set it to page. This will need to be - * changed when hugepage migrations to device private - * memory are supported. 
- */ - subpage = page; - goto discard; - } - /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags)) { @@ -1612,35 +1550,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); - } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { - swp_entry_t entry; - pte_t swp_pte; - - if (arch_unmap_one(mm, vma, address, pteval) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } - - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - entry = make_migration_entry(subpage, - pte_write(pteval)); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); - /* - * No need to invalidate here it will synchronize on - * against the special swap migration pte. - */ } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1756,9 +1665,10 @@ static int page_not_mapped(struct page *page) * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. * - * If unmap is successful, return true. Otherwise, false. + * It is the caller's responsibility to check if the page is still + * mapped when needed (use TTU_SYNC to prevent accounting races). 
*/ -bool try_to_unmap(struct page *page, enum ttu_flags flags) +void try_to_unmap(struct page *page, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, @@ -1767,6 +1677,277 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) .anon_lock = page_lock_anon_vma_read, }; + if (flags & TTU_RMAP_LOCKED) + rmap_walk_locked(page, &rwc); + else + rmap_walk(page, &rwc); +} + +/* + * @arg: enum ttu_flags will be passed to this argument. + * + * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs + * containing migration entries. This and TTU_RMAP_LOCKED are the only supported + * flags. + */ +static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; + pte_t pteval; + struct page *subpage; + bool ret = true; + struct mmu_notifier_range range; + enum ttu_flags flags = (enum ttu_flags)(long)arg; + + if (is_zone_device_page(page) && !is_device_private_page(page)) + return true; + + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_migrate() may return before page_mapped() has become false, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + + /* + * unmap_page() in mm/huge_memory.c is the only user of migration with + * TTU_SPLIT_HUGE_PMD and it wants to freeze. + */ + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address, true, page); + + /* + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. 
+ */ + range.end = PageKsm(page) ? + address + PAGE_SIZE : vma_address_end(page, vma); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, range.end); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); + } + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || + !PageTransCompound(page), page); + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; + + if (PageHuge(page) && !PageAnon(page)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, range.start, range.end); + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* Nuke the page table entry. 
*/ + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + pteval = ptep_clear_flush(vma, address, pvmw.pte); + + /* Move the dirty bit to the page. Now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + + if (is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_readable_migration_entry( + page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + + /* + * pteval maps a zone device page and is therefore + * a swap pte. + */ + if (pte_swp_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + * + * The assignment to subpage above was computed from a + * swap PTE which results in an invalid pointer. + * Since only PAGE_SIZE pages can currently be + * migrated, just set it to page. This will need to be + * changed when hugepage migrations to device private + * memory are supported. + */ + subpage = page; + } else if (PageHWPoison(page)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); + if (PageHuge(page)) { + hugetlb_count_sub(compound_nr(page), mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); + } else { + dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); + } + + } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + * A future reference will then fault in a new zero + * page. 
When userfaultfd is active, we must not drop + * this page though, as its main user (postcopy + * migration) will not expect userfaults on already + * copied pages. + */ + dec_mm_counter(mm, mm_counter(page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + swp_entry_t entry; + pte_t swp_pte; + + if (arch_unmap_one(mm, vma, address, pteval) < 0) { + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + if (pte_write(pteval)) + entry = make_writable_migration_entry( + page_to_pfn(subpage)); + else + entry = make_readable_migration_entry( + page_to_pfn(subpage)); + + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ + } + + /* + * No need to call mmu_notifier_invalidate_range() it has be + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/vm/mmu_notifier.rst + */ + page_remove_rmap(subpage, PageHuge(page)); + put_page(page); + } + + mmu_notifier_invalidate_range_end(&range); + + return ret; +} + +/** + * try_to_migrate - try to replace all page table mappings with swap entries + * @page: the page to replace page table entries for + * @flags: action and flags + * + * Tries to remove all the page table entries which are mapping this page and + * replace them with special swap entries. Caller must hold the page lock. + * + * If is successful, return true. Otherwise, false. 
+ */ +void try_to_migrate(struct page *page, enum ttu_flags flags) +{ + struct rmap_walk_control rwc = { + .rmap_one = try_to_migrate_one, + .arg = (void *)flags, + .done = page_not_mapped, + .anon_lock = page_lock_anon_vma_read, + }; + + /* + * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and + * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags. + */ + if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | + TTU_SYNC))) + return; + /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1775,38 +1956,67 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) - && !PageKsm(page) && PageAnon(page)) + if (!PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(page, &rwc); else rmap_walk(page, &rwc); +} - /* - * When racing against e.g. zap_pte_range() on another cpu, - * in between its ptep_get_and_clear_full() and page_remove_rmap(), - * try_to_unmap() may return false when it is about to become true, - * if page table locking is skipped: use TTU_SYNC to wait for that. - */ - return !page_mapcount(page); +/* + * Walks the vma's mapping a page and mlocks the page if any locked vma's are + * found. Once one is found the page is locked and the scan can be terminated. 
+ */ +static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, void *unused) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; + + /* An un-locked vma doesn't have any pages to lock, continue the scan */ + if (!(vma->vm_flags & VM_LOCKED)) + return true; + + while (page_vma_mapped_walk(&pvmw)) { + /* + * Need to recheck under the ptl to serialise with + * __munlock_pagevec_fill() after VM_LOCKED is cleared in + * munlock_vma_pages_range(). + */ + if (vma->vm_flags & VM_LOCKED) { + /* PTE-mapped THP are never mlocked */ + if (!PageTransCompound(page)) + mlock_vma_page(page); + page_vma_mapped_walk_done(&pvmw); + } + + /* + * no need to continue scanning other vma's if the page has + * been locked. + */ + return false; + } + + return true; } /** - * try_to_munlock - try to munlock a page - * @page: the page to be munlocked + * page_mlock - try to mlock a page + * @page: the page to be mlocked * - * Called from munlock code. Checks all of the VMAs mapping the page - * to make sure nobody else has this page mlocked. The page will be - * returned with PG_mlocked cleared if no other vmas have it mlocked. + * Called from munlock code. Checks all of the VMAs mapping the page and mlocks + * the page if any are found. The page will be returned with PG_mlocked cleared + * if it is not mapped by any locked vmas. 
*/ - -void try_to_munlock(struct page *page) +void page_mlock(struct page *page) { struct rmap_walk_control rwc = { - .rmap_one = try_to_unmap_one, - .arg = (void *)TTU_MUNLOCK, + .rmap_one = page_mlock_one, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, @@ -1818,6 +2028,192 @@ void try_to_munlock(struct page *page) rmap_walk(page, &rwc); } +#ifdef CONFIG_DEVICE_PRIVATE +struct make_exclusive_args { + struct mm_struct *mm; + unsigned long address; + void *owner; + bool valid; +}; + +static bool page_make_device_exclusive_one(struct page *page, + struct vm_area_struct *vma, unsigned long address, void *priv) +{ + struct mm_struct *mm = vma->vm_mm; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; + struct make_exclusive_args *args = priv; + pte_t pteval; + struct page *subpage; + bool ret = true; + struct mmu_notifier_range range; + swp_entry_t entry; + pte_t swp_pte; + + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + vma->vm_mm, address, min(vma->vm_end, + address + page_size(page)), args->owner); + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + + if (!pte_present(*pvmw.pte)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + pteval = ptep_clear_flush(vma, address, pvmw.pte); + + /* Move the dirty bit to the page. Now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + /* + * Check that our target page is still mapped at the expected + * address. + */ + if (args->mm == mm && args->address == address && + pte_write(pteval)) + args->valid = true; + + /* + * Store the pfn of the page in a special migration + * pte. 
do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + if (pte_write(pteval)) + entry = make_writable_device_exclusive_entry( + page_to_pfn(subpage)); + else + entry = make_readable_device_exclusive_entry( + page_to_pfn(subpage)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + + set_pte_at(mm, address, pvmw.pte, swp_pte); + + /* + * There is a reference on the page for the swap entry which has + * been removed, so shouldn't take another. + */ + page_remove_rmap(subpage, false); + } + + mmu_notifier_invalidate_range_end(&range); + + return ret; +} + +/** + * page_make_device_exclusive - mark the page exclusively owned by a device + * @page: the page to replace page table entries for + * @mm: the mm_struct where the page is expected to be mapped + * @address: address where the page is expected to be mapped + * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks + * + * Tries to remove all the page table entries which are mapping this page and + * replace them with special device exclusive swap entries to grant a device + * exclusive access to the page. Caller must hold the page lock. + * + * Returns false if the page is still mapped, or if it could not be unmapped + * from the expected address. Otherwise returns true (success). + */ +static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm, + unsigned long address, void *owner) +{ + struct make_exclusive_args args = { + .mm = mm, + .address = address, + .owner = owner, + .valid = false, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_make_device_exclusive_one, + .done = page_not_mapped, + .anon_lock = page_lock_anon_vma_read, + .arg = &args, + }; + + /* + * Restrict to anonymous pages for now to avoid potential writeback + * issues. 
Also tail pages shouldn't be passed to rmap_walk so skip + * those. + */ + if (!PageAnon(page) || PageTail(page)) + return false; + + rmap_walk(page, &rwc); + + return args.valid && !page_mapcount(page); +} + +/** + * make_device_exclusive_range() - Mark a range for exclusive use by a device + * @mm: mm_struct of associated target process + * @start: start of the region to mark for exclusive device access + * @end: end address of region + * @pages: returns the pages which were successfully marked for exclusive access + * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering + * + * Returns: number of pages found in the range by GUP. A page is marked for + * exclusive access only if the page pointer is non-NULL. + * + * This function finds ptes mapping page(s) to the given address range, locks + * them and replaces mappings with special swap entries preventing userspace CPU + * access. On fault these entries are replaced with the original mapping after + * calling MMU notifiers. + * + * A driver using this to program access from a device must use a mmu notifier + * critical section to hold a device specific lock during programming. Once + * programming is complete it should drop the page lock and reference after + * which point CPU access to the page will revoke the exclusive access. 
+ */ +int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct page **pages, + void *owner) +{ + long npages = (end - start) >> PAGE_SHIFT; + long i; + + npages = get_user_pages_remote(mm, start, npages, + FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, + pages, NULL, NULL); + if (npages < 0) + return npages; + + for (i = 0; i < npages; i++, start += PAGE_SIZE) { + if (!trylock_page(pages[i])) { + put_page(pages[i]); + pages[i] = NULL; + continue; + } + + if (!page_make_device_exclusive(pages[i], mm, start, owner)) { + unlock_page(pages[i]); + put_page(pages[i]); + pages[i] = NULL; + } + } + + return npages; +} +EXPORT_SYMBOL_GPL(make_device_exclusive_range); +#endif + void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; @@ -1858,7 +2254,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. * - * When called from try_to_munlock(), the mmap_lock of the mm containing the vma + * When called from page_mlock(), the mmap_lock of the mm containing the vma * where the page was found will be held for write. So, we won't recheck * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. @@ -1911,7 +2307,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * When called from try_to_munlock(), the mmap_lock of the mm containing the vma + * When called from page_mlock(), the mmap_lock of the mm containing the vma * where the page was found will be held for write. So, we won't recheck * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
diff --git a/mm/shmem.c b/mm/shmem.c index 6268b9b4e41a..70d9ce294bb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1797,7 +1797,7 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: + * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, @@ -1832,6 +1832,16 @@ repeat: page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); + + if (page && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(page)) { + unlock_page(page); + put_page(page); + } + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (xa_is_value(page)) { error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); @@ -2352,27 +2362,25 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +#ifdef CONFIG_USERFAULTFD +int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - spinlock_t *ptl; void *page_kaddr; struct page *page; - pte_t _dst_pte, *dst_pte; int ret; - pgoff_t offset, max_off; + pgoff_t max_off; - ret = -ENOMEM; if (!shmem_inode_acct_block(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, @@ -2383,15 +2391,16 @@ static int 
shmem_mfill_atomic_pte(struct mm_struct *dst_mm, put_page(*pagep); *pagep = NULL; } - goto out; + return -ENOMEM; } if (!*pagep) { + ret = -ENOMEM; page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks; - if (!zeropage) { /* mcopy_atomic */ + if (!zeropage) { /* COPY */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, @@ -2401,11 +2410,11 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *pagep = page; - shmem_inode_unacct_blocks(inode, 1); + ret = -ENOENT; /* don't free the page */ - return -ENOENT; + goto out_unacct_blocks; } - } else { /* mfill_zeropage_atomic */ + } else { /* ZEROPAGE */ clear_highpage(page); } } else { @@ -2413,15 +2422,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); __SetPageLocked(page); __SetPageSwapBacked(page); __SetPageUptodate(page); ret = -EFAULT; - offset = linear_page_index(dst_vma, dst_addr); max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + if (unlikely(pgoff >= max_off)) goto out_release; ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, @@ -2429,32 +2438,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - else { - /* - * We don't set the pte dirty if the vma has no - * VM_WRITE permission, so mark the page dirty or it - * could be freed from under us. We could do it - * unconditionally before unlock_page(), but doing it - * only if VM_WRITE is not set is faster. 
- */ - set_page_dirty(page); - } - - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) - goto out_release_unlock; - - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_unlock; - - lru_cache_add(page); + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + page, true, false); + if (ret) + goto out_delete_from_cache; spin_lock_irq(&info->lock); info->alloced++; @@ -2462,50 +2449,19 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - inc_mm_counter(dst_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(dst_vma, dst_addr, dst_pte); - pte_unmap_unlock(dst_pte, ptl); + SetPageDirty(page); unlock_page(page); - ret = 0; -out: - return ret; -out_release_unlock: - pte_unmap_unlock(dst_pte, ptl); - ClearPageDirty(page); + return 0; +out_delete_from_cache: delete_from_page_cache(page); out_release: unlock_page(page); put_page(page); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); - goto out; -} - -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); + return ret; } +#endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; @@ -4040,8 +3996,7 @@ bool shmem_huge_enabled(struct 
vm_area_struct *vma) loff_t i_size; pgoff_t off; - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 16183d85a7d5..bdce883f9286 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -27,8 +27,362 @@ #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/pgtable.h> +#include <linux/bootmem_info.h> + #include <asm/dma.h> #include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. + */ +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; +}; + +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, + struct vmemmap_remap_walk *walk) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *page = pmd_page(*pmd); + pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(page + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } + + /* Make pte visible before pmd. See comment in __pte_alloc(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + + flush_tlb_kernel_range(start, start + PMD_SIZE); + + return 0; +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. + */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } +} + +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + if (pmd_leaf(*pmd)) { + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); + if (ret) + return ret; + } + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + 
return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); + VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. + */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); + + return 0; +} + +/* + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). + */ +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. 
+ */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. + * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. 
+ */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_write_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + mmap_write_downgrade(&init_mm); + + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. + */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/mm/sparse.c b/mm/sparse.c index 7272f7a1449d..6326cdf36c4f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/bootmem_info.h> #include "internal.h" #include <asm/dma.h> diff --git a/mm/swap.c b/mm/swap.c index 6c11db780467..19600430e536 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -554,7 +554,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) } else { /* * The page's writeback ends up during pagevec - * We moves tha page into tail of inactive. + * We move that page into tail of inactive. */ add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, nr_pages); diff --git a/mm/swapfile.c b/mm/swapfile.c index e898c879a434..1e07d1c776f2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2967,7 +2967,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; } - /* swap partition endianess hack... */ + /* swap partition endianness hack... 
*/ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 63a73e164d55..0e2132834bc7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -48,6 +48,78 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, return dst_vma; } +/* + * Install PTEs, to map dst_addr (within dst_vma) to page. + * + * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem + * and anon, and for both shared and private VMAs. + */ +int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + bool newly_allocated, bool wp_copy) +{ + int ret; + pte_t _dst_pte, *dst_pte; + bool writable = dst_vma->vm_flags & VM_WRITE; + bool vm_shared = dst_vma->vm_flags & VM_SHARED; + bool page_in_cache = page->mapping; + spinlock_t *ptl; + struct inode *inode; + pgoff_t offset, max_off; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (page_in_cache && !vm_shared) + writable = false; + if (writable || !page_in_cache) + _dst_pte = pte_mkdirty(_dst_pte); + if (writable) { + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else + _dst_pte = pte_mkwrite(_dst_pte); + } + + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + if (vma_is_shmem(dst_vma)) { + /* serialize against truncate with the page table lock */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_unlock; + } + + ret = -EEXIST; + if (!pte_none(*dst_pte)) + goto out_unlock; + + if (page_in_cache) + page_add_file_rmap(page, false); + else + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + + /* + * Must happen after rmap, as mm_counter() checks mapping (via + * PageAnon()), which is set by __page_set_anon_rmap(). 
+ */ + inc_mm_counter(dst_mm, mm_counter(page)); + + if (newly_allocated) + lru_cache_add_inactive_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + static int mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -56,13 +128,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - pte_t _dst_pte, *dst_pte; - spinlock_t *ptl; void *page_kaddr; int ret; struct page *page; - pgoff_t offset, max_off; - struct inode *inode; if (!*pagep) { ret = -ENOMEM; @@ -99,43 +167,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; - _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); - if (dst_vma->vm_flags & VM_WRITE) { - if (wp_copy) - _dst_pte = pte_mkuffd_wp(_dst_pte); - else - _dst_pte = pte_mkwrite(_dst_pte); - } - - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); - if (dst_vma->vm_file) { - /* the shmem MAP_PRIVATE case requires checking the i_size */ - inode = dst_vma->vm_file->f_inode; - offset = linear_page_index(dst_vma, dst_addr); - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - ret = -EFAULT; - if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; - } - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; - - inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - lru_cache_add_inactive_or_unevictable(page, dst_vma); - - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(dst_vma, dst_addr, dst_pte); - - pte_unmap_unlock(dst_pte, ptl); - ret = 0; + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, 
dst_addr, + page, true, wp_copy); + if (ret) + goto out_release; out: return ret; -out_release_uncharge_unlock: - pte_unmap_unlock(dst_pte, ptl); out_release: put_page(page); goto out; @@ -176,6 +213,41 @@ out_unlock: return ret; } +/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ +static int mcontinue_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + bool wp_copy) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct page *page; + int ret; + + ret = shmem_getpage(inode, pgoff, &page, SGP_READ); + if (ret) + goto out; + if (!page) { + ret = -EFAULT; + goto out; + } + + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + page, false, wp_copy); + if (ret) + goto out_release; + + unlock_page(page); + ret = 0; +out: + return ret; +out_release: + unlock_page(page); + put_page(page); + goto out; +} + static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; @@ -209,7 +281,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long len, enum mcopy_atomic_mode mode) { - int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; @@ -308,7 +379,6 @@ retry: mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); - vm_alloc_shared = vm_shared; cond_resched(); @@ -346,54 +416,8 @@ retry: out_unlock: mmap_read_unlock(dst_mm); out: - if (page) { - /* - * We encountered an error and are about to free a newly - * allocated huge page. - * - * Reservation handling is very subtle, and is different for - * private and shared mappings. See the routine - * restore_reserve_on_error for details. Unfortunately, we - * can not call restore_reserve_on_error now as it would - * require holding mmap_lock. 
- * - * If a reservation for the page existed in the reservation - * map of a private mapping, the map was modified to indicate - * the reservation was consumed when the page was allocated. - * We clear the HPageRestoreReserve flag now so that the global - * reserve count will not be incremented in free_huge_page. - * The reservation map will still indicate the reservation - * was consumed and possibly prevent later page allocation. - * This is better than leaking a global reservation. If no - * reservation existed, it is still safe to clear - * HPageRestoreReserve as no adjustments to reservation counts - * were made during allocation. - * - * The reservation map for shared mappings indicates which - * pages have reservations. When a huge page is allocated - * for an address with a reservation, no change is made to - * the reserve map. In this case HPageRestoreReserve will be - * set to indicate that the global reservation count should be - * incremented when the page is freed. This is the desired - * behavior. However, when a huge page is allocated for an - * address without a reservation a reservation entry is added - * to the reservation map, and HPageRestoreReserve will not be - * set. When the page is freed, the global reserve count will - * NOT be incremented and it will appear as though we have - * leaked reserved page. In this case, set HPageRestoreReserve - * so that the global reserve count will be incremented to - * match the reservation map entry which was created. - * - * Note that vm_alloc_shared is based on the flags of the vma - * for which the page was originally allocated. dst_vma could - * be different or NULL on error. 
- */ - if (vm_alloc_shared) - SetHPageRestoreReserve(page); - else - ClearHPageRestoreReserve(page); + if (page) put_page(page); - } BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); @@ -415,11 +439,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage, + enum mcopy_atomic_mode mode, bool wp_copy) { ssize_t err; + if (mode == MCOPY_ATOMIC_CONTINUE) { + return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + wp_copy); + } + /* * The normal page fault path for a shmem will invoke the * fault, fill the hole in the file and COW it right away. The @@ -431,7 +460,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (!zeropage) + if (mode == MCOPY_ATOMIC_NORMAL) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); @@ -440,13 +469,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, dst_vma, dst_addr); } else { VM_WARN_ON_ONCE(wp_copy); - if (!zeropage) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, page); - else - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); + err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, + mode != MCOPY_ATOMIC_NORMAL, + page); } return err; @@ -467,7 +493,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -530,7 +555,7 @@ retry: if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock; /* @@ -578,7 +603,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = 
mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); + src_addr, &page, mcopy_mode, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { diff --git a/mm/util.c b/mm/util.c index a8bf17f18a81..a034525e7ba2 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1010,3 +1010,43 @@ void mem_dump_obj(void *object) } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif + +/* + * A driver might set a page logically offline -- PageOffline() -- and + * turn the page inaccessible in the hypervisor; after that, access to page + * content can be fatal. + * + * Some special PFN walkers -- i.e., /proc/kcore -- read content of random + * pages after checking PageOffline(); however, these PFN walkers can race + * with drivers that set PageOffline(). + * + * page_offline_freeze()/page_offline_thaw() allows for a subsystem to + * synchronize with such drivers, achieving that a page cannot be set + * PageOffline() while frozen. + * + * page_offline_begin()/page_offline_end() is used by drivers that care about + * such races when setting a page PageOffline(). 
+ */ +static DECLARE_RWSEM(page_offline_rwsem); + +void page_offline_freeze(void) +{ + down_read(&page_offline_rwsem); +} + +void page_offline_thaw(void) +{ + up_read(&page_offline_rwsem); +} + +void page_offline_begin(void) +{ + down_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_begin); + +void page_offline_end(void) +{ + up_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_end); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b2ec7f751bd0..d5cd52805149 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -25,6 +25,7 @@ #include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/xarray.h> +#include <linux/io.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> @@ -36,6 +37,7 @@ #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/uaccess.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -83,10 +85,11 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) + unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; + unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); @@ -94,9 +97,22 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return -ENOMEM; do { BUG_ON(!pte_none(*pte)); + +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); + if (size != PAGE_SIZE) { + pte_t entry = pfn_pte(pfn, prot); + + entry = pte_mkhuge(entry); + entry = arch_make_huge_pte(entry, ilog2(size), 0); + set_huge_pte_at(&init_mm, addr, pte, entry); + pfn += PFN_DOWN(size); + continue; + } +#endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += PFN_DOWN(size), addr += size, addr != end); 
*mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -145,7 +161,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, continue; } - if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -1592,6 +1608,7 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); +#ifdef CONFIG_X86_64 /* * called before a call to iounmap() if the caller wants vm_area_struct's * immediately freed. @@ -1600,6 +1617,7 @@ void set_iounmap_nonlazy(void) { atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } +#endif /* CONFIG_X86_64 */ /* * Purges all lazily-freed vmap areas. @@ -2912,8 +2930,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return NULL; } - if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && - arch_vmap_pmd_supported(prot)) { + if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) { unsigned long size_per_node; /* @@ -2926,11 +2943,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes(); - if (size_per_node >= PMD_SIZE) { + if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) shift = PMD_SHIFT; - align = max(real_align, 1UL << shift); - size = ALIGN(real_size, 1UL << shift); - } + else + shift = arch_vmap_pte_supported_shift(size_per_node); + + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); } again: diff --git a/mm/vmscan.c b/mm/vmscan.c index d7c3cb8688dd..4620df62f0ff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1499,7 +1499,8 @@ static unsigned int shrink_page_list(struct list_head *page_list, if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; - if (!try_to_unmap(page, flags)) { + try_to_unmap(page, flags); + if 
(page_mapped(page)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && PageSwapBacked(page)) stat->nr_lazyfree_fail += nr_pages; @@ -1701,6 +1702,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); + unsigned int noreclaim_flag; list_for_each_entry_safe(page, next, page_list, lru) { if (!PageHuge(page) && page_is_file_lru(page) && @@ -1711,8 +1713,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, } } + /* + * We should be safe here since we are only dealing with file pages and + * we are not kswapd and therefore cannot write dirty file pages. But + * call memalloc_noreclaim_save() anyway, just in case these conditions + * change in the future. + */ + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, &stat, true); + memalloc_noreclaim_restore(noreclaim_flag); + list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); @@ -1810,7 +1821,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -/** +/* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. 
Some of the functions that @@ -2306,6 +2317,7 @@ unsigned long reclaim_pages(struct list_head *page_list) LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; + unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -2314,6 +2326,8 @@ unsigned long reclaim_pages(struct list_head *page_list) .may_swap = 1, }; + noreclaim_flag = memalloc_noreclaim_save(); + while (!list_empty(page_list)) { page = lru_to_page(page_list); if (nid == NUMA_NO_NODE) { @@ -2350,6 +2364,8 @@ unsigned long reclaim_pages(struct list_head *page_list) } } + memalloc_noreclaim_restore(noreclaim_flag); + return nr_reclaimed; } diff --git a/mm/workingset.c b/mm/workingset.c index 4f7a306ce75a..5ba3e42446fa 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -168,8 +168,10 @@ * refault distance will immediately activate the refaulting page. */ +#define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ - 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) + WORKINGSET_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -189,7 +191,7 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << 1) | workingset; + eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } @@ -201,8 +203,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, int memcgid, nid; bool workingset; - workingset = entry & 1; - entry >>= 1; + workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); + entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); diff --git a/mm/z3fold.c b/mm/z3fold.c index 7fe7adaaad01..b3c0577b8095 100644 --- a/mm/z3fold.c +++ 
b/mm/z3fold.c @@ -62,7 +62,7 @@ #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) #define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) #define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) -#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) +#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) #define BUDDY_MASK (0x3) #define BUDDY_SHIFT 2 @@ -144,6 +144,8 @@ struct z3fold_header { * @c_handle: cache for z3fold_buddy_slots allocation * @ops: pointer to a structure of user defined operations specified at * pool creation time. + * @zpool: zpool driver + * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release @@ -253,9 +255,8 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) spin_unlock(&zhdr->page_lock); } - -static inline struct z3fold_header *__get_z3fold_header(unsigned long handle, - bool lock) +/* return locked z3fold page if it's not headless */ +static inline struct z3fold_header *get_z3fold_header(unsigned long handle) { struct z3fold_buddy_slots *slots; struct z3fold_header *zhdr; @@ -269,13 +270,12 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle, read_lock(&slots->lock); addr = *(unsigned long *)handle; zhdr = (struct z3fold_header *)(addr & PAGE_MASK); - if (lock) - locked = z3fold_page_trylock(zhdr); + locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); if (locked) break; cpu_relax(); - } while (lock); + } while (true); } else { zhdr = (struct z3fold_header *)(handle & PAGE_MASK); } @@ -283,18 +283,6 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle, return zhdr; } -/* Returns the z3fold page where a given handle is stored */ -static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) -{ - return __get_z3fold_header(h, false); -} - -/* return locked z3fold 
page if it's not headless */ -static inline struct z3fold_header *get_z3fold_header(unsigned long h) -{ - return __get_z3fold_header(h, true); -} - static inline void put_z3fold_header(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); @@ -998,7 +986,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); - pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS, + __alignof__(struct list_head)); if (!pool->unbuddied) goto out_pool; for_each_possible_cpu(cpu) { @@ -1059,6 +1048,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); z3fold_unregister_migration(pool); + free_percpu(pool->unbuddied); kfree(pool); } @@ -1382,7 +1372,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (zhdr->foreign_handles || test_and_set_bit(PAGE_CLAIMED, &page->private)) { if (kref_put(&zhdr->refcount, - release_z3fold_page)) + release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); else z3fold_page_unlock(zhdr); @@ -1803,8 +1793,11 @@ static int __init init_z3fold(void) { int ret; - /* Make sure the z3fold header is not larger than the page size */ - BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); + /* + * Make sure the z3fold header is not larger than the page size and + * there has remaining spaces for its buddy. 
+ */ + BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); ret = z3fold_mount(); if (ret) return ret; diff --git a/mm/zbud.c b/mm/zbud.c index 7ec5f27a68b0..6348932430b8 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -51,7 +51,6 @@ #include <linux/preempt.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/zbud.h> #include <linux/zpool.h> /***************** @@ -73,6 +72,12 @@ #define ZHDR_SIZE_ALIGNED CHUNK_SIZE #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) +struct zbud_pool; + +struct zbud_ops { + int (*evict)(struct zbud_pool *pool, unsigned long handle); +}; + /** * struct zbud_pool - stores metadata for each zbud pool * @lock: protects all pool fields and first|last_chunk fields of any @@ -87,21 +92,27 @@ * @pages_nr: number of zbud pages in the pool. * @ops: pointer to a structure of user defined operations specified at * pool creation time. + * @zpool: zpool driver + * @zpool_ops: zpool operations structure with an evict callback * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular zbud pool. */ struct zbud_pool { spinlock_t lock; - struct list_head unbuddied[NCHUNKS]; - struct list_head buddied; + union { + /* + * Reuse unbuddied[0] as buddied on the ground that + * unbuddied[0] is unused. 
+ */ + struct list_head buddied; + struct list_head unbuddied[NCHUNKS]; + }; struct list_head lru; u64 pages_nr; const struct zbud_ops *ops; -#ifdef CONFIG_ZPOOL struct zpool *zpool; const struct zpool_ops *zpool_ops; -#endif }; /* @@ -121,104 +132,6 @@ struct zbud_header { }; /***************** - * zpool - ****************/ - -#ifdef CONFIG_ZPOOL - -static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct zbud_ops zbud_zpool_ops = { - .evict = zbud_zpool_evict -}; - -static void *zbud_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) -{ - struct zbud_pool *pool; - - pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); - if (pool) { - pool->zpool = zpool; - pool->zpool_ops = zpool_ops; - } - return pool; -} - -static void zbud_zpool_destroy(void *pool) -{ - zbud_destroy_pool(pool); -} - -static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) -{ - return zbud_alloc(pool, size, gfp, handle); -} -static void zbud_zpool_free(void *pool, unsigned long handle) -{ - zbud_free(pool, handle); -} - -static int zbud_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - unsigned int total = 0; - int ret = -EINVAL; - - while (total < pages) { - ret = zbud_reclaim_page(pool, 8); - if (ret < 0) - break; - total++; - } - - if (reclaimed) - *reclaimed = total; - - return ret; -} - -static void *zbud_zpool_map(void *pool, unsigned long handle, - enum zpool_mapmode mm) -{ - return zbud_map(pool, handle); -} -static void zbud_zpool_unmap(void *pool, unsigned long handle) -{ - zbud_unmap(pool, handle); -} - -static u64 zbud_zpool_total_size(void *pool) -{ - return zbud_get_pool_size(pool) * PAGE_SIZE; -} - -static struct zpool_driver zbud_zpool_driver = { - .type = "zbud", - 
.sleep_mapped = true, - .owner = THIS_MODULE, - .create = zbud_zpool_create, - .destroy = zbud_zpool_destroy, - .malloc = zbud_zpool_malloc, - .free = zbud_zpool_free, - .shrink = zbud_zpool_shrink, - .map = zbud_zpool_map, - .unmap = zbud_zpool_unmap, - .total_size = zbud_zpool_total_size, -}; - -MODULE_ALIAS("zpool-zbud"); -#endif /* CONFIG_ZPOOL */ - -/***************** * Helpers *****************/ /* Just to make the code easier to read */ @@ -304,7 +217,7 @@ static int num_free_chunks(struct zbud_header *zhdr) * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. */ -struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) { struct zbud_pool *pool; int i; @@ -328,7 +241,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) * * The pool should be emptied before this function is called. */ -void zbud_destroy_pool(struct zbud_pool *pool) +static void zbud_destroy_pool(struct zbud_pool *pool) { kfree(pool); } @@ -352,7 +265,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, +static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -427,7 +340,7 @@ found: * only sets the first|last_chunks to 0. The page is actually freed * once both buddies are evicted (see zbud_reclaim_page() below). */ -void zbud_free(struct zbud_pool *pool, unsigned long handle) +static void zbud_free(struct zbud_pool *pool, unsigned long handle) { struct zbud_header *zhdr; int freechunks; @@ -499,7 +412,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) * no pages to evict or an eviction handler is not registered, -EAGAIN if * the retry limit was hit. 
*/ -int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) { int i, ret, freechunks; struct zbud_header *zhdr; @@ -581,7 +494,7 @@ next: * * Returns: a pointer to the mapped allocation */ -void *zbud_map(struct zbud_pool *pool, unsigned long handle) +static void *zbud_map(struct zbud_pool *pool, unsigned long handle) { return (void *)(handle); } @@ -591,7 +504,7 @@ void *zbud_map(struct zbud_pool *pool, unsigned long handle) * @pool: pool in which the allocation resides * @handle: handle associated with the allocation to be unmapped */ -void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) { } @@ -602,30 +515,120 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle) * Returns: size in pages of the given pool. The pool lock need not be * taken to access pages_nr. */ -u64 zbud_get_pool_size(struct zbud_pool *pool) +static u64 zbud_get_pool_size(struct zbud_pool *pool) { return pool->pages_nr; } +/***************** + * zpool + ****************/ + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .sleep_mapped = true, + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); + static int __init init_zbud(void) { /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); -#ifdef CONFIG_ZPOOL zpool_register_driver(&zbud_zpool_driver); -#endif return 0; } static void __exit exit_zbud(void) { -#ifdef CONFIG_ZPOOL zpool_unregister_driver(&zbud_zpool_driver); -#endif - pr_info("unloaded\n"); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 19b563bc6c48..68e8831068f4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1471,7 
+1471,6 @@ static void obj_free(struct size_class *class, unsigned long obj) unsigned int f_objidx; void *vaddr; - obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); f_offset = (class->size * f_objidx) & ~PAGE_MASK; zspage = get_zspage(f_page); @@ -2163,7 +2162,7 @@ static void async_free_zspage(struct work_struct *work) VM_BUG_ON(fullness != ZS_EMPTY); class = pool->size_class[class_idx]; spin_lock(&class->lock); - __free_zspage(pool, pool->size_class[class_idx], zspage); + __free_zspage(pool, class, zspage); spin_unlock(&class->lock); } }; diff --git a/mm/zswap.c b/mm/zswap.c index 20763267a219..7944e3e57e78 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -967,6 +967,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); + src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, handle); + } + /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -982,17 +989,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - dlen = PAGE_SIZE; - src = (u8 *)zhdr + sizeof(struct zswap_header); - - if (!zpool_can_sleep_mapped(pool)) { - - memcpy(tmp, src, entry->length); - src = tmp; - - zpool_unmap_handle(pool, handle); - } mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); @@ -1203,7 +1200,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO); memcpy(buf, &zhdr, hlen); memcpy(buf + hlen, dst, dlen); 
zpool_unmap_handle(entry->pool->zpool, handle); @@ -1427,18 +1424,11 @@ static int __init zswap_debugfs_init(void) return 0; } - -static void __exit zswap_debugfs_exit(void) -{ - debugfs_remove_recursive(zswap_debugfs_root); -} #else static int __init zswap_debugfs_init(void) { return 0; } - -static void __exit zswap_debugfs_exit(void) { } #endif /********************************* |