diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 8 | ||||
-rw-r--r-- | mm/huge_memory.c | 8 | ||||
-rw-r--r-- | mm/hugetlb.c | 83 | ||||
-rw-r--r-- | mm/kasan/common.c | 65 | ||||
-rw-r--r-- | mm/kasan/init.c | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 16 | ||||
-rw-r--r-- | mm/memory.c | 43 | ||||
-rw-r--r-- | mm/migrate.c | 15 | ||||
-rw-r--r-- | mm/mincore.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 66 | ||||
-rw-r--r-- | mm/page_alloc.c | 8 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 8 | ||||
-rw-r--r-- | mm/slab.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 2 | ||||
-rw-r--r-- | mm/usercopy.c | 9 | ||||
-rw-r--r-- | mm/userfaultfd.c | 13 | ||||
-rw-r--r-- | mm/util.c | 2 |
20 files changed, 208 insertions, 158 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 29655fb47a2c..9f5e323e883e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1125,7 +1125,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, break; } - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; break; } @@ -727,7 +727,7 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; goto out; } @@ -1813,8 +1813,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return 0; /* @@ -1868,8 +1867,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (nr_pages <= 0) return 0; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; if (gup_fast_permitted(start, nr_pages, write)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cbd977b1d60d..faf357eaf0ce 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -568,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *zero_page; bool set; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!vma_is_anonymous(vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e37efd5d8318..df2e7dd5ff17 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; - struct address_space *mapping = vma->vm_file->f_mapping; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; @@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_range_init(&range, src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. - */ - i_mmap_lock_read(mapping); } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) mmu_notifier_invalidate_range_end(&range); - else - i_mmap_unlock_read(mapping); return ret; } @@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * Check once here for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + /* * Check for page in userfault range */ @@ -3784,18 +3771,14 @@ retry: }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3854,6 +3837,9 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with file truncation. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4066,7 +4034,6 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -4231,7 +4198,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) { + if (fatal_signal_pending(current)) { remainder = 0; break; } @@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode. - * For hugetlbfs, this prevents removal of any page table entries associated - * with the address space. This is important as we are setting up sharing - * based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_write(mapping); return pte; } @@ -4732,7 +4699,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 03d5d1374ca7..73c9cbfdedf4 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, return; } - cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE); - *flags |= SLAB_KASAN; } @@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) } /* - * Since it's desirable to only call object contructors once during slab - * allocation, we preassign tags to all such objects. Also preassign tags for - * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports. - * For SLAB allocator we can't preassign tags randomly since the freelist is - * stored as an array of indexes instead of a linked list. Assign tags based - * on objects indexes, so that objects that are next to each other get - * different tags. - * After a tag is assigned, the object always gets allocated with the same tag. - * The reason is that we can't change tags for objects with constructors on - * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor - * code can save the pointer to the object somewhere (e.g. in the object - * itself). Then if we retag it, the old saved pointer will become invalid. + * This function assigns a tag to an object considering the following: + * 1. A cache might have a constructor, which might save a pointer to a slab + * object somewhere (e.g. in the object itself). We preassign a tag for + * each object in caches with constructors during slab creation and reuse + * the same tag each time a particular object is allocated. + * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be + * accessed after being freed. We preassign tags for objects in these + * caches as well. + * 3. For SLAB allocator we can't preassign tags randomly since the freelist + * is stored as an array of indexes instead of a linked list. Assign tags + * based on objects indexes, so that objects that are next to each other + * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new) +static u8 assign_tag(struct kmem_cache *cache, const void *object, + bool init, bool krealloc) { + /* Reuse the same tag for krealloc'ed objects. */ + if (krealloc) + return get_tag(object); + + /* + * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU + * set, assign a tag when the object is being allocated (init == false). + */ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return new ? KASAN_TAG_KERNEL : random_tag(); + return init ? KASAN_TAG_KERNEL : random_tag(); + /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB + /* For SLAB assign tags based on the object index in the freelist. */ return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); #else - return new ? random_tag() : get_tag(object); + /* + * For SLUB assign a random tag during slab creation, otherwise reuse + * the already assigned tag. + */ + return init ? random_tag() : get_tag(object); #endif } @@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, __memset(alloc_info, 0, sizeof(*alloc_info)); if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - object = set_tag(object, assign_tag(cache, object, true)); + object = set_tag(object, + assign_tag(cache, object, true, false)); return (void *)object; } @@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return __kasan_slab_free(cache, object, ip, true); } -void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags, bool krealloc) { unsigned long redzone_start; unsigned long redzone_end; @@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_SHADOW_SCALE_SIZE); if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - tag = assign_tag(cache, object, false); + tag = assign_tag(cache, object, false, krealloc); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ kasan_unpoison_shadow(set_tag(object, tag), size); @@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, return set_tag(object, tag); } + +void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) +{ + return __kasan_kmalloc(cache, object, size, flags, false); +} EXPORT_SYMBOL(kasan_kmalloc); void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, @@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) if (unlikely(!PageSlab(page))) return kasan_kmalloc_large(object, size, flags); else - return kasan_kmalloc(page->slab_cache, object, size, flags); + return __kasan_kmalloc(page->slab_cache, object, size, + flags, true); } void kasan_poison_kfree(void *ptr, unsigned long ip) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 34afad56497b..45a1b5e38e1e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -123,7 +123,7 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, pte_t *p; if (slab_is_available()) - p = pte_alloc_one_kernel(&init_mm, addr); + p = pte_alloc_one_kernel(&init_mm); else p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); if (!p) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6379fff1a5ff..7c72f2a95785 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - bool unmap_success = true; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); @@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - if (!PageHuge(hpage)) { - unmap_success = try_to_unmap(hpage, ttu); - } else if (mapping) { - /* - * For hugetlb pages, try_to_unmap could potentially call - * huge_pmd_unshare. Because of this, take semaphore in - * write mode here and set TTU_RMAP_LOCKED to indicate we - * have taken the lock at this higer level. - */ - i_mmap_lock_write(mapping); - unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } + unmap_success = try_to_unmap(hpage, ttu); if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); diff --git a/mm/memory.c b/mm/memory.c index 2dd2f9ab57f4..e11ca9dd823f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - pgtable_t new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -434,9 +434,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) return 0; } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); + pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; @@ -2896,7 +2896,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ @@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + /* + * Preallocate pte before we take page_lock because this might lead to + * deadlocks for memcg reclaim which waits for pages under writeback: + * lock_page(A) + * SetPageWriteback(A) + * unlock_page(A) + * lock_page(B) + * lock_page(B) + * pte_alloc_pne + * shrink_page_list + * wait_on_page_writeback(A) + * SetPageWriteback(B) + * unlock_page(B) + * # flush A, B to clear the writeback + */ + if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) @@ -3043,7 +3065,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: @@ -3122,7 +3144,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3360,8 +3382,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, - vmf->address); + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ @@ -4078,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - range->start = address & PAGE_MASK; - range->end = range->start + PAGE_SIZE; + mmu_notifier_range_init(range, mm, address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/migrate.c b/mm/migrate.c index 5d1839a9148d..a16b15090df3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - struct address_space *mapping = page_mapping(hpage); - - /* - * try_to_unmap could potentially call huge_pmd_unshare. - * Because of this, take semaphore in write mode here and - * set TTU_RMAP_LOCKED to let lower levels know we have - * taken the lock. - */ - i_mmap_lock_write(mapping); try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| - TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } @@ -2636,7 +2625,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ diff --git a/mm/mincore.c b/mm/mincore.c index 4985965aa20a..218099b5ed31 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -233,14 +233,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, return -EINVAL; /* ..and we need to be passed a valid user-space range */ - if (!access_ok(VERIFY_READ, (void __user *) start, len)) + if (!access_ok((void __user *) start, len)) return -ENOMEM; /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; - if (!access_ok(VERIFY_WRITE, vec, pages)) + if (!access_ok(vec, pages)) return -EFAULT; tmp = (void *) __get_free_page(GFP_USER); diff --git a/mm/mremap.c b/mm/mremap.c index def01d86e36f..3320616ed93f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,6 +191,52 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifdef CONFIG_HAVE_MOVE_PMD +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pmd_t pmd; + + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) + || old_end - old_addr < PMD_SIZE) + return false; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + old_ptl = pmd_lock(vma->vm_mm, old_pmd); + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pmd */ + pmd = *old_pmd; + pmd_clear(old_pmd); + + VM_BUG_ON(!pmd_none(*new_pmd)); + + /* Set the new pmd */ + set_pmd_at(mm, new_addr, new_pmd, pmd); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#endif + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -235,8 +281,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma, split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; + } else if (extent == PMD_SIZE) { +#ifdef CONFIG_HAVE_MOVE_PMD + /* + * If the extent is PMD-sized, try to speed the move by + * moving at the PMD level if possible. + */ + bool moved; + + if (need_rmap_locks) + take_rmap_locks(vma); + moved = move_normal_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); + if (need_rmap_locks) + drop_rmap_locks(vma); + if (moved) + continue; +#endif } - if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) + + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cde5dac6229a..d295c9bc01a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, */ boost_watermark(zone); if (alloc_flags & ALLOC_KSWAPD) - wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ if (!whole_block) @@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone, local_irq_restore(flags); out: + /* Separate test+clear to avoid unnecessary atomics */ + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + } + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; diff --git a/mm/page_io.c b/mm/page_io.c index d975fa3f02aa..2e8019d0e048 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -401,6 +401,8 @@ int swap_readpage(struct page *page, bool synchronous) get_task_struct(current); bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (synchronous) + bio->bi_opf |= REQ_HIPRI; count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); @@ -410,7 +412,7 @@ int swap_readpage(struct page *page, bool synchronous) break; if (!blk_poll(disk->queue, qc, true)) - break; + io_schedule(); } __set_current_state(TASK_RUNNING); bio_put(bio); diff --git a/mm/rmap.c b/mm/rmap.c index 21a26cf51114..0454ecc29537 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -25,7 +25,6 @@ * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone_lru_lock (in mark_page_accessed, isolate_lru_page) @@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start, - min(vma->vm_end, vma->vm_start + + mmu_notifier_range_init(&range, vma->vm_mm, address, + min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted * accordingly. - * - * If called for a huge page, caller must hold i_mmap_rwsem - * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); diff --git a/mm/slab.c b/mm/slab.c index 73fe23e649c9..78eb8c5bf4e4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, struct alien_cache *alc = NULL; alc = kmalloc_node(memsize, gfp, node); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); + if (alc) { + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + } return alc; } diff --git a/mm/slub.c b/mm/slub.c index 36c0befeebd8..1e3d0ec4e200 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int offset; size_t object_size; + ptr = kasan_reset_tag(ptr); + /* Find object and usable object size. */ s = page->slab_cache; diff --git a/mm/swap.c b/mm/swap.c index 4d8a1f1afaab..4929bc1be60e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -126,7 +126,7 @@ void put_pages_list(struct list_head *pages) while (!list_empty(pages)) { struct page *victim; - victim = list_entry(pages->prev, struct page, lru); + victim = lru_to_page(pages); list_del(&victim->lru); put_page(victim); } diff --git a/mm/usercopy.c b/mm/usercopy.c index 852eb4e53f06..14faadcedd06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address - * - known-safe heap or stack object + * - fully contained by stack (or stack frame, when available) + * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) @@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); - /* Check for bad heap object. */ - check_heap_object(ptr, n, to_user); - /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: @@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) usercopy_abort("process stack", NULL, to_user, 0, n); } + /* Check for bad heap object. */ + check_heap_object(ptr, n, to_user); + /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48368589f519..d59b5a73dfb3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -267,14 +267,10 @@ retry: VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even - * in the case of shared pmds. fault mutex prevents - * races with other faulting threads. + * Serialize via hugetlb_fault_mutex */ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -283,7 +279,6 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -291,7 +286,6 @@ retry: dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -299,7 +293,6 @@ retry: dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); vm_alloc_shared = vm_shared; cond_resched(); @@ -550,7 +543,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } diff --git a/mm/util.c b/mm/util.c index 4df23d64aac7..1ea055138043 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } |