diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-29 19:12:44 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-29 19:12:44 -0800 |
commit | a0908a1b7d68706ee52ed4a039756e70c8e956e9 (patch) | |
tree | ce23d955c37b04d816fc7d551db71d4ff86630f7 /mm | |
parent | b9151761021e25c024a6670df4e7c43ffbab0e1d (diff) | |
parent | 72639e6df4128dfde8fee7638a35be7db8114000 (diff) |
Merge branch 'akpm' (patches from Andrew)
Mergr misc fixes from Andrew Morton:
"28 fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits)
fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate()
mm/hugetlb: fix NULL-pointer dereference on 5-level paging machine
autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored"
autofs: revert "autofs: take more care to not update last_used on path walk"
fs/fat/inode.c: fix sb_rdonly() change
mm, memcg: fix mem_cgroup_swapout() for THPs
mm: migrate: fix an incorrect call of prep_transhuge_page()
kmemleak: add scheduling point to kmemleak_scan()
scripts/bloat-o-meter: don't fail with division by 0
fs/mbcache.c: make count_objects() more robust
Revert "mm/page-writeback.c: print a warning if the vm dirtiness settings are illogical"
mm/madvise.c: fix madvise() infinite loop under special circumstances
exec: avoid RLIMIT_STACK races with prlimit()
IB/core: disable memory registration of filesystem-dax vmas
v4l2: disable filesystem-dax mapping support
mm: fail get_vaddr_frames() for filesystem-dax mappings
mm: introduce get_user_pages_longterm
device-dax: implement ->split() to catch invalid munmap attempts
mm, hugetlbfs: introduce ->split() to vm_operations_struct
scripts/faddr2line: extend usage on generic arch
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/frame_vector.c | 12 | ||||
-rw-r--r-- | mm/gup.c | 66 | ||||
-rw-r--r-- | mm/hmm.c | 8 | ||||
-rw-r--r-- | mm/huge_memory.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 12 | ||||
-rw-r--r-- | mm/kmemleak.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 8 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 13 |
13 files changed, 121 insertions, 32 deletions
diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 2f98df0d460e..297c7238f7d4 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, ret = -EFAULT; goto out; } + + /* + * While get_vaddr_frames() could be used for transient (kernel + * controlled lifetime) pinning of memory pages all current + * users establish long term (userspace controlled lifetime) + * page pinning. Treat get_vaddr_frames() like + * get_user_pages_longterm() and disallow it for filesystem-dax + * mappings. + */ + if (vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || + return pte_access_permitted(pte, WRITE) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } @@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#ifdef CONFIG_FS_DAX +/* + * This is the same as get_user_pages() in that it assumes we are + * operating on the current task's mm, but it goes further to validate + * that the vmas associated with the address range are suitable for + * longterm elevated page reference counts. For example, filesystem-dax + * mappings are subject to the lifetime enforced by the filesystem and + * we need guarantees that longterm users like RDMA and V4L2 only + * establish mappings that have a kernel enforced revocation mechanism. + * + * "longterm" == userspace controlled elevated page count lifetime. + * Contrast this to iov_iter_get_pages() usages which are transient. + */ +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) +{ + struct vm_area_struct **vmas = vmas_arg; + struct vm_area_struct *vma_prev = NULL; + long rc, i; + + if (!pages) + return -EINVAL; + + if (!vmas) { + vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + return -ENOMEM; + } + + rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + + for (i = 0; i < rc; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + break; + } + + /* + * Either get_user_pages() failed, or the vma validation + * succeeded, in either case we don't need to put_page() before + * returning. + */ + if (i >= rc) + goto out; + + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; +out: + if (vmas != vmas_arg) + kfree(vmas); + return rc; +} +EXPORT_SYMBOL(get_user_pages_longterm); +#endif /* CONFIG_FS_DAX */ + /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) + if (!pmd_access_permitted(pmd, write_fault)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -456,11 +456,11 @@ again: continue; } - if (write_fault && !pte_write(pte)) + if (!pte_access_permitted(pte, write_fault)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..2f2f5e774902 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (flags & FOLL_WRITE && !pmd_write(*pmd)) + if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) @@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pud_lockptr(mm, pud)); - if (flags & FOLL_WRITE && !pud_write(*pud)) + if (!pud_access_permitted(*pud, flags & FOLL_WRITE)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1386,7 +1386,7 @@ out_unlock: */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || + return pmd_access_permitted(pmd, WRITE) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 681b300185c0..9a334f5fb730 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) } } +static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) +{ + if (addr & ~(huge_page_mask(hstate_vma(vma)))) + return -EINVAL; + return 0; +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, + .split = hugetlb_vm_op_split, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -4627,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - p4d = p4d_offset(pgd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e4738d5e9b8c..3d4781756d50 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1523,6 +1523,8 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); + if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page)))) + cond_resched(); } } put_online_mems(); diff --git a/mm/madvise.c b/mm/madvise.c index 375cf32087e4..751e97aa2210 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma, { struct file *file = vma->vm_file; + *prev = vma; #ifdef CONFIG_SWAP if (!file) { - *prev = vma; force_swapin_readahead(vma, start, end); return 0; } if (shmem_mapping(file->f_mapping)) { - *prev = vma; force_shm_swapin_readahead(vma, start, end, file->f_mapping); return 0; @@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } - *prev = vma; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 50e6906314f8..ac2ffd5e02b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) - css_put(&memcg->css); + css_put_many(&memcg->css, nr_entries); } /** diff --git a/mm/memory.c b/mm/memory.c index 85e7a87da79f..5eb3d2524bdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3948,7 +3948,7 @@ static int handle_pte_fault(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { - if (!pte_write(entry)) + if (!pte_access_permitted(entry, WRITE)) return do_wp_page(vmf); entry = pte_mkdirty(entry); } @@ -4013,7 +4013,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* NUMA case for anonymous PUDs would go here */ - if (dirty && !pud_write(orig_pud)) { + if (dirty && !pud_access_permitted(orig_pud, WRITE)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4046,7 +4046,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if (dirty && !pmd_write(orig_pmd)) { + if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4336,7 +4336,7 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = *ptep; - if ((flags & FOLL_WRITE) && !pte_write(pte)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); diff --git a/mm/mmap.c b/mm/mmap.c index 924839fac0e6..a4d546821214 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2555,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (is_vm_hugetlb_page(vma) && (addr & - ~(huge_page_mask(hstate_vma(vma))))) - return -EINVAL; + if (vma->vm_ops && vma->vm_ops->split) { + err = vma->vm_ops->split(vma, addr); + if (err) + return err; + } new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c86fbd1b590e..c957be32b27a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -550,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (!can_madv_dontneed_vma(vma)) continue; @@ -565,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * we do not want to block exit_mmap by keeping mm ref * count elevated without a good reason. */ - if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, NULL); + tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + } } - tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e7095030aa1f..586f31261c83 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -433,11 +433,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (unlikely(bg_thresh >= thresh)) { - pr_warn("vm direct limit must be set greater than background limit.\n"); + if (bg_thresh >= thresh) bg_thresh = thresh / 2; - } - tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4096f4a5c1f..73f5d4556b3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2507,10 +2507,6 @@ void drain_all_pages(struct zone *zone) if (WARN_ON_ONCE(!mm_percpu_wq)) return; - /* Workqueues cannot recurse */ - if (current->flags & PF_WQ_WORKER) - return; - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need @@ -7656,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* * In case of -EBUSY, we'd like to know which page causes problem. - * So, just fall through. We will check it in test_pages_isolated(). + * So, just fall through. test_pages_isolated() has a tracepoint + * which will report the busy page. + * + * It is possible that busy pages could become available before + * the call to test_pages_isolated, and the range will actually be + * allocated. So, if we fall through be sure to clear ret so that + * -EBUSY is not accidentally used or returned to caller. */ ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; + ret =0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES |