diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-15 12:53:37 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-15 12:53:37 -0800 |
commit | ac73e3dc8acd0a3be292755db30388c3580f5674 (patch) | |
tree | 5abef6cb82b205b5dbbb69dca950b8a5aae716de /mm/mremap.c | |
parent | 148842c98a24e508aecb929718818fbf4c2a6ff3 (diff) | |
parent | dfefd226b0bf7c435a58d75a0ce2f9273b9825f6 (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
- a few random little subsystems
- almost all of the MM patches which are staged ahead of linux-next
material. I'll trickle the post-linux-next work in as the dependents
get merged up.
Subsystems affected by this patch series: kthread, kbuild, ide, ntfs,
ocfs2, arch, and mm (slab-generic, slab, slub, dax, debug, pagecache,
gup, swap, shmem, memcg, pagemap, mremap, hmm, vmalloc, documentation,
kasan, pagealloc, memory-failure, hugetlb, vmscan, z3fold, compaction,
oom-kill, migration, cma, page-poison, userfaultfd, zswap, zsmalloc,
uaccess, zram, and cleanups).
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (200 commits)
mm: cleanup kstrto*() usage
mm: fix fall-through warnings for Clang
mm: slub: convert sysfs sprintf family to sysfs_emit/sysfs_emit_at
mm: shmem: convert shmem_enabled_show to use sysfs_emit_at
mm:backing-dev: use sysfs_emit in macro defining functions
mm: huge_memory: convert remaining use of sprintf to sysfs_emit and neatening
mm: use sysfs_emit for struct kobject * uses
mm: fix kernel-doc markups
zram: break the strict dependency from lzo
zram: add stat to gather incompressible pages since zram set up
zram: support page writeback
mm/process_vm_access: remove redundant initialization of iov_r
mm/zsmalloc.c: rework the list_add code in insert_zspage()
mm/zswap: move to use crypto_acomp API for hardware acceleration
mm/zswap: fix passing zero to 'PTR_ERR' warning
mm/zswap: make struct kernel_param_ops definitions const
userfaultfd/selftests: hint the test runner on required privilege
userfaultfd/selftests: fix retval check for userfaultfd_open()
userfaultfd/selftests: always dump something in modes
userfaultfd: selftests: make __{s,u}64 format specifiers portable
...
Diffstat (limited to 'mm/mremap.c')
-rw-r--r-- | mm/mremap.c | 280 |
1 files changed, 215 insertions, 65 deletions
diff --git a/mm/mremap.c b/mm/mremap.c index 138abbae4f75..c5590afe7165 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -30,12 +30,11 @@ #include "internal.h" -static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) @@ -49,6 +48,18 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pud_none_or_clear_bad(pud)) return NULL; + return pud; +} + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return NULL; + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; @@ -56,19 +67,27 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, +static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; - pud = pud_alloc(mm, p4d, addr); + + return pud_alloc(mm, p4d, addr); +} + +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = alloc_new_pud(mm, vma, addr); if (!pud) return NULL; @@ -249,14 +268,148 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, return true; } +#else +static inline bool move_normal_pmd(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, + pmd_t *new_pmd) +{ + return false; +} #endif +#ifdef CONFIG_HAVE_MOVE_PUD +static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm 
= vma->vm_mm; + pud_t pud; + + /* + * The destination pud shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON_ONCE(!pud_none(*new_pud))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. + */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#else +static inline bool move_normal_pud(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, + pud_t *new_pud) +{ + return false; +} +#endif + +enum pgt_entry { + NORMAL_PMD, + HPAGE_PMD, + NORMAL_PUD, +}; + +/* + * Returns an extent of the corresponding size for the pgt_entry specified if + * valid. Else returns a smaller extent bounded by the end of the source and + * destination pgt_entry. + */ +static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, + unsigned long old_end, unsigned long new_addr) +{ + unsigned long next, extent, mask, size; + + switch (entry) { + case HPAGE_PMD: + case NORMAL_PMD: + mask = PMD_MASK; + size = PMD_SIZE; + break; + case NORMAL_PUD: + mask = PUD_MASK; + size = PUD_SIZE; + break; + default: + BUILD_BUG(); + break; + } + + next = (old_addr + size) & mask; + /* even if next overflowed, extent below will be ok */ + extent = (next > old_end) ? 
old_end - old_addr : next - old_addr; + next = (new_addr + size) & mask; + if (extent > next - new_addr) + extent = next - new_addr; + return extent; +} + +/* + * Attempts to speedup the move by moving entry at the level corresponding to + * pgt_entry. Returns true if the move was successful, else false. + */ +static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, + void *old_entry, void *new_entry, bool need_rmap_locks) +{ + bool moved = false; + + /* See comment in move_ptes() */ + if (need_rmap_locks) + take_rmap_locks(vma); + + switch (entry) { + case NORMAL_PMD: + moved = move_normal_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case NORMAL_PUD: + moved = move_normal_pud(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case HPAGE_PMD: + moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + move_huge_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (need_rmap_locks) + drop_rmap_locks(vma); + + return moved; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks) { - unsigned long extent, next, old_end; + unsigned long extent, old_end; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; @@ -269,53 +422,50 @@ unsigned long move_page_tables(struct vm_area_struct *vma, for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); - next = (old_addr + PMD_SIZE) & PMD_MASK; - /* even if next overflowed, extent below will be ok */ - extent = next - old_addr; - if (extent > old_end - old_addr) - extent = old_end - old_addr; - next = (new_addr + PMD_SIZE) & PMD_MASK; - if (extent > next - new_addr) - extent = next - new_addr; + /* + * If extent is PUD-sized try to speed up the move by moving at the + * PUD level if possible. 
+ */ + extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); + if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { + pud_t *old_pud, *new_pud; + + old_pud = get_old_pud(vma->vm_mm, old_addr); + if (!old_pud) + continue; + new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + if (!new_pud) + break; + if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks)) + continue; + } + + extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr); old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { - if (extent == HPAGE_PMD_SIZE) { - bool moved; - /* See comment in move_ptes() */ - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_huge_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) - continue; - } + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || + pmd_devmap(*old_pmd)) { + if (extent == HPAGE_PMD_SIZE && + move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) + continue; split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; - } else if (extent == PMD_SIZE) { -#ifdef CONFIG_HAVE_MOVE_PMD + } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && + extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. 
*/ - bool moved; - - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_normal_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) + if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) continue; -#endif } if (pte_alloc(new_vma->vm_mm, new_pmd)) @@ -343,7 +493,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; - int err; + int err = 0; bool need_rmap_locks; /* @@ -353,6 +503,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (vma->vm_ops && vma->vm_ops->may_split) { + if (vma->vm_start != old_addr) + err = vma->vm_ops->may_split(vma, old_addr); + if (!err && vma->vm_end != old_addr + old_len) + err = vma->vm_ops->may_split(vma, old_addr + old_len); + if (err) + return err; + } + /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new @@ -365,18 +524,26 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { + if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + return -ENOMEM; + } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); - if (!new_vma) + if (!new_vma) { + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) + vm_unacct_memory(new_len >> PAGE_SHIFT); return -ENOMEM; + } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { @@ -398,7 +565,7 @@ static unsigned long move_vma(struct 
vm_area_struct *vma, } /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; excess = vma->vm_end - vma->vm_start - old_len; if (old_addr > vma->vm_start && @@ -423,34 +590,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, untrack_pfn_moved(vma); if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - if (vm_flags & VM_ACCOUNT) { - /* Always put back VM_ACCOUNT since we won't unmap */ - vma->vm_flags |= VM_ACCOUNT; - - vm_acct_memory(new_len >> PAGE_SHIFT); - } - - /* - * VMAs can actually be merged back together in copy_vma - * calling merge_vma. This can happen with anonymous vmas - * which have not yet been faulted, so if we were to consider - * this VMA split we'll end up adding VM_ACCOUNT on the - * next VMA, which is completely unrelated if this VMA - * was re-merged. - */ - if (split && new_vma == vma) - split = 0; - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; /* Because we won't unmap we don't need to touch locked_vm */ - goto out; + return new_addr; } if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - vm_unacct_memory(excess >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) + vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } @@ -458,7 +608,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; } -out: + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ |