Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c        |   5
-rw-r--r--  mm/filemap.c            | 181
-rw-r--r--  mm/gup.c                |   7
-rw-r--r--  mm/hugetlb.c            |  19
-rw-r--r--  mm/kasan/hw_tags.c      |   8
-rw-r--r--  mm/kasan/kasan.h        |  22
-rw-r--r--  mm/kfence/core.c        |  19
-rw-r--r--  mm/kfence/kfence_test.c |  15
-rw-r--r--  mm/kmemleak.c           |   6
-rw-r--r--  mm/madvise.c            |   6
-rw-r--r--  mm/memblock.c           |  13
-rw-r--r--  mm/memcontrol.c         |  35
-rw-r--r--  mm/memory-failure.c     |  14
-rw-r--r--  mm/memory.c             |  11
-rw-r--r--  mm/memory_hotplug.c     |   1
-rw-r--r--  mm/migrate.c            |   2
-rw-r--r--  mm/mmap.c               |   6
-rw-r--r--  mm/mmap_lock.c          |   4
-rw-r--r--  mm/nommu.c              |   3
-rw-r--r--  mm/page-writeback.c     |   2
-rw-r--r--  mm/page_alloc.c         |  54
-rw-r--r--  mm/readahead.c          |   2
-rw-r--r--  mm/rmap.c               |  41
-rw-r--r--  mm/secretmem.c          |   1
-rw-r--r--  mm/shmem.c              |  34
-rw-r--r--  mm/slab.h               |   2
-rw-r--r--  mm/slub.c               |  39
-rw-r--r--  mm/swap_slots.c         |   4
-rw-r--r--  mm/swap_state.c         |   7
-rw-r--r--  mm/truncate.c           |   9
-rw-r--r--  mm/util.c               |  19
-rw-r--r--  mm/vmscan.c             |  30
-rw-r--r--  mm/vmstat.c             |  12
33 files changed, 403 insertions, 230 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 271f2ca862c8..cd06dca232c3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -398,12 +398,12 @@ static void cgwb_release_workfn(struct work_struct *work) blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); - percpu_ref_exit(&wb->refcnt); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); + percpu_ref_exit(&wb->refcnt); wb_exit(wb); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); @@ -807,6 +807,7 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -928,6 +929,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + del_timer_sync(&bdi->laptop_mode_wb_timer); + /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/filemap.c b/mm/filemap.c index d1458ecf2f51..920e8dc03251 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -76,8 +76,9 @@ * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * - * ->i_mutex - * ->i_mmap_rwsem (truncate->unmap_mapping_range) + * ->i_rwsem + * ->invalidate_lock (acquired by fs in truncate path) + * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem @@ -85,9 +86,10 @@ * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock - * ->lock_page (access_process_vm) + * ->invalidate_lock (filemap_fault) + * ->lock_page (filemap_fault, access_process_vm) * - * ->i_mutex (generic_perform_write) + * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -378,6 +380,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) } /** + * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @wbc: the writeback_control controlling the writeout + * + * Call writepages on the mapping using the provided wbc to control the + * writeout. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +int filemap_fdatawrite_wbc(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret; + + if (!mapping_can_writeback(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + wbc_attach_fdatawrite_inode(wbc, mapping->host); + ret = do_writepages(mapping, wbc); + wbc_detach_inode(wbc); + return ret; +} +EXPORT_SYMBOL(filemap_fdatawrite_wbc); + +/** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts @@ -397,7 +425,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { - int ret; struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, @@ -405,14 +432,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_can_writeback(mapping) || - !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - wbc_attach_fdatawrite_inode(&wbc, mapping->host); - ret = do_writepages(mapping, &wbc); - wbc_detach_inode(&wbc); - return ret; + return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, @@ -1008,6 +1028,44 @@ EXPORT_SYMBOL(__page_cache_alloc); #endif /* + * filemap_invalidate_lock_two - lock invalidate_lock for two mappings + * + * Lock exclusively invalidate_lock of any passed mapping that is not NULL. + * + * @mapping1: the first mapping to lock + * @mapping2: the second mapping to lock + */ +void filemap_invalidate_lock_two(struct address_space *mapping1, + struct address_space *mapping2) +{ + if (mapping1 > mapping2) + swap(mapping1, mapping2); + if (mapping1) + down_write(&mapping1->invalidate_lock); + if (mapping2 && mapping1 != mapping2) + down_write_nested(&mapping2->invalidate_lock, 1); +} +EXPORT_SYMBOL(filemap_invalidate_lock_two); + +/* + * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings + * + * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. + * + * @mapping1: the first mapping to unlock + * @mapping2: the second mapping to unlock + */ +void filemap_invalidate_unlock_two(struct address_space *mapping1, + struct address_space *mapping2) +{ + if (mapping1) + up_write(&mapping1->invalidate_lock); + if (mapping2 && mapping1 != mapping2) + up_write(&mapping2->invalidate_lock); +} +EXPORT_SYMBOL(filemap_invalidate_unlock_two); + +/* * In order to wait for pages to become available there must be * waitqueues associated with pages. 
By using a hash table of * waitqueues where the bucket discipline is to maintain all @@ -2368,20 +2426,30 @@ static int filemap_update_page(struct kiocb *iocb, { int error; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!filemap_invalidate_trylock_shared(mapping)) + return -EAGAIN; + } else { + filemap_invalidate_lock_shared(mapping); + } + if (!trylock_page(page)) { + error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) - return -EAGAIN; + goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { + filemap_invalidate_unlock_shared(mapping); put_and_wait_on_page_locked(page, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __lock_page_async(page, iocb->ki_waitq); if (error) - return error; + goto unlock_mapping; } + error = AOP_TRUNCATED_PAGE; if (!page->mapping) - goto truncated; + goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) @@ -2392,15 +2460,13 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = filemap_read_page(iocb->ki_filp, mapping, page); - if (error == AOP_TRUNCATED_PAGE) - put_page(page); - return error; -truncated: - unlock_page(page); - put_page(page); - return AOP_TRUNCATED_PAGE; + goto unlock_mapping; unlock: unlock_page(page); +unlock_mapping: + filemap_invalidate_unlock_shared(mapping); + if (error == AOP_TRUNCATED_PAGE) + put_page(page); return error; } @@ -2415,6 +2481,19 @@ static int filemap_create_page(struct file *file, if (!page) return -ENOMEM; + /* + * Protect against truncate / hole punch. Grabbing invalidate_lock here + * assures we cannot instantiate and bring uptodate new pagecache pages + * after evicting page cache during truncate and before actually + * freeing blocks. Note that we could release invalidate_lock after + * inserting the page into page cache as the locked page would then be + * enough to synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate pages or + * ->readpages() that need to hold invalidate_lock while mapping blocks + * for IO so let's hold the lock here as well to keep locking rules + * simple. + */ + filemap_invalidate_lock_shared(mapping); error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2426,9 +2505,11 @@ static int filemap_create_page(struct file *file, if (error) goto error; + filemap_invalidate_unlock_shared(mapping); pagevec_add(pvec, page); return 0; error: + filemap_invalidate_unlock_shared(mapping); put_page(page); return error; } @@ -2967,6 +3048,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) pgoff_t max_off; struct page *page; vm_fault_t ret = 0; + bool mapping_locked = false; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2976,25 +3058,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { + if (likely(page)) { /* - * We found the page, so try async readahead before - * waiting for the lock. + * We found the page, so try async readahead before waiting for + * the lock. 
*/ - fpin = do_async_mmap_readahead(vmf, page); - } else if (!page) { + if (!(vmf->flags & FAULT_FLAG_TRIED)) + fpin = do_async_mmap_readahead(vmf, page); + if (unlikely(!PageUptodate(page))) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } + } else { /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: + /* + * See comment in filemap_create_page() why we need + * invalidate_lock + */ + if (!mapping_locked) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (!page) { if (fpin) goto out_retry; + filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } @@ -3014,8 +3110,20 @@ retry_find: * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + /* + * The page was in cache and uptodate and now it is not. + * Strange but possible since we didn't hold the page lock all + * the time. Let's drop everything get the invalidate lock and + * try again. + */ + if (!mapping_locked) { + unlock_page(page); + put_page(page); + goto retry_find; + } goto page_not_uptodate; + } /* * We've made it this far and we had to drop our mmap_lock, now is the @@ -3026,6 +3134,8 @@ retry_find: unlock_page(page); goto out_retry; } + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. @@ -3056,6 +3166,7 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; + filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; @@ -3067,6 +3178,8 @@ out_retry: */ if (page) put_page(page); + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; @@ -3437,6 +3550,8 @@ out: * * If the page does not get brought uptodate, return -EIO. * + * The function expects mapping->invalidate_lock to be already held. + * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page(struct address_space *mapping, @@ -3460,6 +3575,8 @@ EXPORT_SYMBOL(read_cache_page); * * If the page does not get brought uptodate, return -EIO. * + * The function expects mapping->invalidate_lock to be already held. + * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, @@ -3704,12 +3821,12 @@ EXPORT_SYMBOL(generic_perform_write); * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * - * It expects i_mutex to be grabbed unless we work on a block device or similar + * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to - * avoid syncing under i_mutex. + * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes @@ -3797,7 +3914,7 @@ EXPORT_SYMBOL(__generic_file_write_iter); * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file - * and acquires i_mutex as needed. 
+ * and acquires i_rwsem as needed. * Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write @@ -1558,9 +1558,12 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, gup_flags |= FOLL_WRITE; /* - * See check_vma_flags(): Will return -EFAULT on incompatible mappings - * or with insufficient permissions. + * We want to report -EINVAL instead of -EFAULT for any permission + * problems or incompatible mappings. */ + if (check_vma_flags(vma, gup_flags)) + return -EINVAL; + return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfc940d5221d..8ea35ba6699f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2476,7 +2476,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * added by alloc_huge_page. We know it was added + * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise * HPageRestoreReserve would be set on the page. * Remove the entry so that a subsequent allocation @@ -4660,7 +4660,9 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: - restore_reserve_on_error(h, vma, haddr, new_page); + /* No restore in case of successful pagetable update (Break COW) */ + if (new_page != old_page) + restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -4776,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page = false; + bool new_page, new_pagecache_page = false; /* * Currently, we are forced to kill the process in the event the @@ -4799,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; retry: + new_page = false; page = find_lock_page(mapping, idx); if (!page) { /* Check for page in userfault range */ @@ -4842,6 +4845,7 @@ retry: goto retry; goto out; } + new_pagecache_page = true; } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { @@ -4926,7 +4930,9 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, haddr, page); + /* restore reserve for newly allocated pages not in page cache */ + if (new_page && !new_pagecache_page) + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -5135,6 +5141,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret = -ENOMEM; struct page *page; int writable; + bool new_pagecache_page = false; if (is_continue) { ret = -EFAULT; @@ -5228,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = huge_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; + new_pagecache_page = true; } ptl = huge_pte_lockptr(h, dst_mm, dst_pte); @@ -5291,7 +5299,8 @@ out_release_unlock: if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: - restore_reserve_on_error(h, dst_vma, dst_addr, page); + if (!new_pagecache_page) + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4ea8c368b5b8..e4c16f6b6680 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -142,8 +142,6 @@ void kasan_init_hw_tags_cpu(void) if (kasan_arg == KASAN_ARG_OFF) return; - hw_init_tags(KASAN_TAG_MAX); - /* * Enable async mode only when 
explicitly requested through * the command line. @@ -250,12 +248,6 @@ void kasan_free_pages(struct page *page, unsigned int order) #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_set_tagging_report_once(bool state) -{ - hw_set_tagging_report_once(state); -} -EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); - void kasan_enable_tagging_sync(void) { hw_enable_tagging_sync(); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d739cdd1621a..fff93b0bcb08 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -3,6 +3,7 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/kasan-tags.h> #include <linux/kfence.h> #include <linux/stackdepot.h> @@ -51,16 +52,6 @@ extern bool kasan_flag_async __ro_after_init; #define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) -#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ -#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ -#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ - -#ifdef CONFIG_KASAN_HW_TAGS -#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ -#else -#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ -#endif - #ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ @@ -299,12 +290,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_enable_tagging_async #define arch_enable_tagging_async() #endif -#ifndef arch_init_tags -#define arch_init_tags(max_tag) -#endif -#ifndef arch_set_tagging_report_once -#define arch_set_tagging_report_once(state) -#endif #ifndef arch_force_async_tag_fault #define arch_force_async_tag_fault() #endif @@ -320,8 +305,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging_sync() arch_enable_tagging_sync() #define hw_enable_tagging_async() arch_enable_tagging_async() -#define hw_init_tags(max_tag) arch_init_tags(max_tag) -#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) #define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) @@ -332,19 +315,16 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging_sync() #define hw_enable_tagging_async() -#define hw_set_tagging_report_once(state) #endif /* CONFIG_KASAN_HW_TAGS */ #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_set_tagging_report_once(bool state); void kasan_enable_tagging_sync(void); void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_set_tagging_report_once(bool state) { } static inline void kasan_enable_tagging_sync(void) { } static inline void kasan_force_async_fault(void) { } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d7666ace9d2e..575c685aa642 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -734,6 +734,22 @@ void kfence_shutdown_cache(struct kmem_cache *s) void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { /* + * Perform size check before switching kfence_allocation_gate, so that + * we don't disable KFENCE without making an allocation. + */ + if (size > PAGE_SIZE) + return NULL; + + /* + * Skip allocations from non-default zones, including DMA. We cannot + * guarantee that pages in the KFENCE pool will have the requested + * properties (e.g. 
reside in DMAable memory). + */ + if ((flags & GFP_ZONEMASK) || + (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) + return NULL; + + /* * allocation_gate only needs to become non-zero, so it doesn't make * sense to continue writing to it and pay the associated contention * cost, in case we have a large number of concurrent allocations. @@ -757,9 +773,6 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (!READ_ONCE(kfence_enabled)) return NULL; - if (size > PAGE_SIZE) - return NULL; - return kfence_guarded_alloc(s, size, flags); } diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 7f24b9bcb2ec..eb6307c199ea 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -23,8 +23,15 @@ #include <linux/tracepoint.h> #include <trace/events/printk.h> +#include <asm/kfence.h> + #include "kfence.h" +/* May be overridden by <asm/kfence.h>. */ +#ifndef arch_kfence_test_address +#define arch_kfence_test_address(addr) (addr) +#endif + /* Report as observed from console. */ static struct { spinlock_t lock; @@ -82,6 +89,7 @@ static const char *get_access_type(const struct expect_report *r) /* Check observed report matches information in @r. */ static bool report_matches(const struct expect_report *r) { + unsigned long addr = (unsigned long)r->addr; bool ret = false; unsigned long flags; typeof(observed.lines) expect; @@ -131,22 +139,25 @@ static bool report_matches(const struct expect_report *r) switch (r->type) { case KFENCE_ERROR_OOB: cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_UAF: cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_CORRUPTION: cur += scnprintf(cur, end - cur, "Corrupted memory at"); break; case KFENCE_ERROR_INVALID: cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); break; case KFENCE_ERROR_INVALID_FREE: cur += scnprintf(cur, end - cur, "Invalid free of"); break; } - cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); + cur += scnprintf(cur, end - cur, " 0x%p", (void *)addr); spin_lock_irqsave(&observed.lock, flags); if (!report_available()) @@ -852,7 +863,7 @@ static void kfence_test_exit(void) tracepoint_synchronize_unregister(); } -late_initcall(kfence_test_init); +late_initcall_sync(kfence_test_init); module_exit(kfence_test_exit); MODULE_LICENSE("GPL v2"); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 228a2fbe0657..73d46d16d575 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -290,7 +290,7 @@ static void hex_dump_object(struct seq_file *seq, warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); kasan_enable_current(); } @@ -1171,7 +1171,7 @@ static bool update_checksum(struct kmemleak_object *object) kasan_disable_current(); kcsan_disable_current(); - object->checksum = crc32(0, (void *)object->pointer, object->size); + object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); kasan_enable_current(); kcsan_enable_current(); @@ -1246,7 +1246,7 @@ static void scan_block(void *_start, void *_end, break; kasan_disable_current(); - pointer = *ptr; + pointer = *(unsigned long *)kasan_reset_tag((void *)ptr); kasan_enable_current(); untagged_ptr = 
(unsigned long)kasan_reset_tag((void *)pointer); diff --git a/mm/madvise.c b/mm/madvise.c index 6d3d348b17f4..56324a3dbc4e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -862,10 +862,12 @@ static long madvise_populate(struct vm_area_struct *vma, switch (pages) { case -EINTR: return -EINTR; - case -EFAULT: /* Incompatible mappings / permissions. */ + case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; + case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ + return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); @@ -910,7 +912,7 @@ static long madvise_remove(struct vm_area_struct *vma, + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* - * Filesystem's fallocate may need to take i_mutex. We need to + * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. diff --git a/mm/memblock.c b/mm/memblock.c index 0041ff62c584..e2ca8ddc8ebe 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -665,6 +665,11 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__, + &base, &end, nid, (void *)_RET_IP_); + return memblock_add_range(&memblock.memory, base, size, nid, 0); } @@ -947,7 +952,8 @@ static bool should_skip_region(struct memblock_type *type, return true; /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && + !(flags & MEMBLOCK_HOTPLUG)) return true; /* if we want mirror memory skip non-mirror memory regions */ @@ -1679,6 +1685,11 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) if (!size) return; + if (memblock.memory.cnt <= 1) { + pr_warn("%s: No memory registered yet\n", __func__); + return; + } + ret = memblock_isolate_range(&memblock.memory, base, size, &start_rgn, &end_rgn); if (ret) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae1f5d0cb581..389b5766e74f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -968,7 +968,7 @@ static __always_inline bool memcg_kmem_bypass(void) return false; /* Memcg to charge can't be determined. 
*/ - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) return true; return false; @@ -3106,13 +3106,15 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ + struct pglist_data *oldpg = stock->cached_pgdat; + if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -3574,7 +3576,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - cgroup_rstat_flush(memcg->css.cgroup); + /* mem_cgroup_threshold() calls here from irqsafe context */ + cgroup_rstat_flush_irqsafe(memcg->css.cgroup); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -7047,14 +7050,14 @@ void mem_cgroup_sk_free(struct sock *sk) * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge * @nr_pages: number of pages to charge + * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within - * @memcg's configured limit, %false if the charge had to be forced. + * @memcg's configured limit, %false if it doesn't. */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) { - gfp_t gfp_mask = GFP_KERNEL; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *fail; @@ -7062,21 +7065,19 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) memcg->tcpmem_pressure = 0; return true; } - page_counter_charge(&memcg->tcpmem, nr_pages); memcg->tcpmem_pressure = 1; + if (gfp_mask & __GFP_NOFAIL) { + page_counter_charge(&memcg->tcpmem, nr_pages); + return true; + } return false; } - /* Don't block in the packet receive path */ - if (in_softirq()) - gfp_mask = GFP_NOWAIT; - - mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); - - if (try_charge(memcg, gfp_mask, nr_pages) == 0) + if (try_charge(memcg, gfp_mask, nr_pages) == 0) { + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); return true; + } - try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); return false; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eefd823deb67..e1f87cf13235 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -866,7 +866,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Truncation is a bit tricky. Enable it per file system for now. * - * Open: to take i_mutex or not for this? Right now we don't. + * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, pfn, mapping); out: @@ -1146,7 +1146,7 @@ static int __get_hwpoison_page(struct page *page) * unexpected races caused by taking a page refcount. */ if (!HWPoisonHandlable(head)) - return 0; + return -EBUSY; if (PageTransHuge(head)) { /* @@ -1199,9 +1199,15 @@ try_again: } goto out; } else if (ret == -EBUSY) { - /* We raced with freeing huge page to buddy, retry. 
*/ - if (pass++ < 3) + /* + * We raced with (possibly temporary) unhandlable + * page, retry. + */ + if (pass++ < 3) { + shake_page(p, 1); goto try_again; + } + ret = -EIO; goto out; } } diff --git a/mm/memory.c b/mm/memory.c index 747a01d495f2..25fc46e87214 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4026,8 +4026,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(vma->vm_mm); + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; + } } /* See comment in handle_pte_fault() */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8cb75b26ea4f..86c3af79e874 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1731,6 +1731,7 @@ failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: + lru_cache_enable(); zone_pcp_enable(zone); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", diff --git a/mm/migrate.c b/mm/migrate.c index 34a9ad3e0a4f..7e240437e7d9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2068,7 +2068,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, LIST_HEAD(migratepages); new_page_t *new; bool compound; - unsigned int nr_pages = thp_nr_pages(page); + int nr_pages = thp_nr_pages(page); /* * PTE mapped THP or HugeTLB page can't reach here so the page could diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203a..181a113b545d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1517,12 +1517,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; - /* - * Make sure there are no mandatory locks on the file. - */ - if (locks_verify_locked(file)) - return -EAGAIN; - vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f5852a058ce0..1854850b4b89 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -156,14 +156,14 @@ static inline void put_memcg_path_buf(void) #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? 
memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0) #else /* !CONFIG_MEMCG */ diff --git a/mm/nommu.c b/mm/nommu.c index 3a93d4054810..9d0ad98f838c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -826,9 +826,6 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file)) - return -EAGAIN; - if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..c12f67cbfa19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2010,7 +2010,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = @@ -2045,7 +2044,6 @@ void laptop_sync_completion(void) rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e97e68aef7a..eeb3a9cb36bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -840,21 +840,24 @@ void init_mem_debugging_and_hardening(void) } #endif - if (_init_on_alloc_enabled_early) { - if (page_poisoning_requested) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc\n"); - else - static_branch_enable(&init_on_alloc); - } - if (_init_on_free_enabled_early) { - if (page_poisoning_requested) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_free\n"); - else - static_branch_enable(&init_on_free); + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && + page_poisoning_requested) { + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc and init_on_free\n"); + _init_on_alloc_enabled_early = false; + _init_on_free_enabled_early = false; } + if (_init_on_alloc_enabled_early) + static_branch_enable(&init_on_alloc); + else + static_branch_disable(&init_on_alloc); + + if (_init_on_free_enabled_early) + static_branch_enable(&init_on_free); + else + static_branch_disable(&init_on_free); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -3450,19 +3453,10 @@ void free_unref_page_list(struct list_head *list) * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(page); - if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { - if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, - migratetype, FPI_NONE); - continue; - } - - /* - * Non-isolated types over MIGRATE_PCPTYPES get added - * to the MIGRATE_MOVABLE pcp list. - */ - set_pcppage_migratetype(page, MIGRATE_MOVABLE); + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + continue; } set_page_private(page, pfn); @@ -3472,7 +3466,15 @@ void free_unref_page_list(struct list_head *list) list_for_each_entry_safe(page, next, list, lru) { pfn = page_private(page); set_page_private(page, 0); + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. 
+ */ migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) + migratetype = MIGRATE_MOVABLE; + trace_mm_page_free_batched(page); free_unref_page_commit(page, pfn, migratetype, 0); diff --git a/mm/readahead.c b/mm/readahead.c index d589f147f4c2..41b75d76d36e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -192,6 +192,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ unsigned int nofs = memalloc_nofs_save(); + filemap_invalidate_lock_shared(mapping); /* * Preallocate as many pages as we will need. */ @@ -236,6 +237,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * will then handle the error. */ read_pages(ractl, &page_pool, false); + filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); diff --git a/mm/rmap.c b/mm/rmap.c index b9eb5c12f3fe..2d29a57d29e8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -20,28 +20,29 @@ /* * Lock ordering in mm: * - * inode->i_mutex (while writing or truncating, not reading or faulting) + * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock - * page->flags PG_locked (lock_page) * (see huegtlbfs below) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * anon_vma->rwsem - * mm->page_table_lock or pte_lock - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * lock_page_memcg move_lock (in __set_page_dirty_buffers) - * i_pages lock (widely used) - * lruvec->lru_lock (in lock_page_lruvec_irq) - * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * i_pages lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within bdi.wb->list_lock in __sync_single_inode) + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) * (see hugetlbfs below) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * mapping->i_mmap_rwsem + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * lock_page_memcg move_lock (in __set_page_dirty_buffers) + * i_pages lock (widely used) + * lruvec->lru_lock (in lock_page_lruvec_irq) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * i_pages lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * diff --git a/mm/secretmem.c b/mm/secretmem.c index f77d25467a14..030f02ddc7c1 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -152,6 +152,7 @@ static void secretmem_freepage(struct page *page) } const struct address_space_operations secretmem_aops = { + .set_page_dirty = __set_page_dirty_no_writeback, .freepage = secretmem_freepage, .migratepage = secretmem_migratepage, .isolate_page = 
secretmem_isolate_page, diff --git a/mm/shmem.c b/mm/shmem.c index 70d9ce294bb4..3107acee4f71 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -96,7 +96,7 @@ static struct vfsmount *shm_mnt; /* * shmem_fallocate communicates with shmem_fault or shmem_writepage via - * inode->i_private (with i_mutex making sure that it has only one user at + * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { @@ -774,7 +774,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -806,7 +806,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1069,7 +1069,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns, loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; @@ -1696,8 +1696,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct swap_info_struct *si; - struct page *page = NULL; + struct page *page; swp_entry_t swap; int error; @@ -1705,12 +1704,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*pagep); *pagep = NULL; - /* Prevent swapoff from happening to us. */ - si = get_swap_device(swap); - if (!si) { - error = EINVAL; - goto failed; - } /* Look it up and read it in.. */ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1772,8 +1765,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap_free(swap); *pagep = page; - if (si) - put_swap_device(si); return 0; failed: if (!shmem_confirm_swap(mapping, index, swap)) @@ -1784,9 +1775,6 @@ unlock: put_page(page); } - if (si) - put_swap_device(si); - return error; } @@ -2071,7 +2059,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_mutex. So refrain from + * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, @@ -2082,7 +2070,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) * we just need to make racing faults a rare case. 
* * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_mutex in fault, + * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ if (unlikely(inode->i_private)) { @@ -2482,7 +2470,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; - /* i_mutex is held by caller */ + /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) @@ -2582,7 +2570,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* * We must evaluate after, since reads (unlike writes) - * are called without i_mutex protection against truncate + * are called without i_rwsem protection against truncate */ nr = PAGE_SIZE; i_size = i_size_read(inode); @@ -2652,7 +2640,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return -ENXIO; inode_lock(inode); - /* We're holding i_mutex so we can access i_size directly */ + /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); @@ -2681,7 +2669,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; diff --git a/mm/slab.h b/mm/slab.h index f997fd5e42c8..58c01a34e5b8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -346,7 +346,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, continue; page = virt_to_head_page(p[i]); - objcgs = page_objcgs(page); + objcgs = page_objcgs_check(page); if (!objcgs) continue; diff --git a/mm/slub.c b/mm/slub.c index 090fa14628f9..f77d8cd79ef7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -576,8 +576,8 @@ static void print_section(char *level, char *text, u8 *addr, unsigned int length) { metadata_access_enable(); - print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS, - 16, 1, addr, length, 1); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); metadata_access_disable(); } @@ -1400,12 +1400,13 @@ check_slabs: static int __init setup_slub_debug(char *str) { slab_flags_t flags; + slab_flags_t global_flags; char *saved_str; char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - slub_debug = DEBUG_DEFAULT_FLAGS; + global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) /* * No options specified. Switch on full debugging. @@ -1417,7 +1418,7 @@ static int __init setup_slub_debug(char *str) str = parse_slub_debug_flags(str, &flags, &slab_list, true); if (!slab_list) { - slub_debug = flags; + global_flags = flags; global_slub_debug_changed = true; } else { slab_list_specified = true; @@ -1426,16 +1427,18 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of - * slabs means debugging is only enabled for those slabs, so the global - * slub_debug should be 0. 
We can extended that to multiple lists as + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ if (slab_list_specified) { if (!global_slub_debug_changed) - slub_debug = 0; + global_flags = slub_debug; slub_debug_string = saved_str; } out: + slub_debug = global_flags; if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else @@ -3236,6 +3239,16 @@ struct detached_freelist { struct kmem_cache *s; }; +static inline void free_nonslab_page(struct page *page, void *object) +{ + unsigned int order = compound_order(page); + + VM_BUG_ON_PAGE(!PageCompound(page), page); + kfree_hook(object); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __free_pages(page, order); +} + /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3272,9 +3285,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!s) { /* Handle kalloc'ed objects */ if (unlikely(!PageSlab(page))) { - BUG_ON(!PageCompound(page)); - kfree_hook(object); - __free_pages(page, compound_order(page)); + free_nonslab_page(page, object); p[size] = NULL; /* mark object processed */ return size; } @@ -4250,13 +4261,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { - unsigned int order = compound_order(page); - - BUG_ON(!PageCompound(page)); - kfree_hook(object); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(page, order); + free_nonslab_page(page, object); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index a66f3e0ec973..16f706c55d92 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -70,9 +70,9 @@ void disable_swap_slots_cache_lock(void) swap_slot_cache_enabled = false; if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ - get_online_cpus(); + cpus_read_lock(); __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); - put_online_cpus(); + cpus_read_unlock(); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index c56aa9ac050d..bc7cee6b2ec5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -628,13 +628,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; - /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { - struct inode *inode = si->swap_file->f_mapping->host; - if (inode_read_congested(inode)) - goto skip; - } - do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; diff --git a/mm/truncate.c b/mm/truncate.c index 234ddd879caa..44ad5e515140 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -412,7 +412,8 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @mapping: mapping to truncate * @lstart: offset from which to truncate * - * Called under (and serialised by) inode->i_mutex. + * Called under (and serialised by) inode->i_rwsem and + * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __delete_from_page_cache()) in the specified range. 
Thus @@ -429,7 +430,7 @@ EXPORT_SYMBOL(truncate_inode_pages); * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * - * Called under (and serialized by) inode->i_mutex. + * Called under (and serialized by) inode->i_rwsem. * * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. @@ -748,7 +749,7 @@ EXPORT_SYMBOL(truncate_pagecache); * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally - * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) @@ -777,7 +778,7 @@ EXPORT_SYMBOL(truncate_setsize); * * The function must be called after i_size is updated so that page fault * coming after we unlock the page will already see the new i_size. - * The function must be called while we still hold i_mutex - this not only + * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ diff --git a/mm/util.c b/mm/util.c index 9043d03750a7..499b6b5767ed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -593,6 +593,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; + /* Don't even allow crazy sizes */ + if (WARN_ON_ONCE(size > INT_MAX)) + return NULL; + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } @@ -635,6 +639,21 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +{ + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + memcpy(newp, p, oldsize); + kvfree(p); + return newp; +} +EXPORT_SYMBOL(kvrealloc); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4620df62f0ff..eeae2f6bc532 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -100,9 +100,12 @@ struct scan_control { unsigned int may_swap:1; /* - * Cgroups are not reclaimed below their configured memory.low, - * unless we threaten to OOM. If any cgroups are skipped due to - * memory.low and nothing was reclaimed, go back for memory.low. + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. 
*/ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; @@ -2537,15 +2540,14 @@ out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; + unsigned long low, min; unsigned long scan; - unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(sc->target_mem_cgroup, - memcg, - sc->memcg_low_reclaim); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, + &min, &low); - if (protection) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2576,6 +2578,15 @@ out: * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); @@ -4413,11 +4424,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; + unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -4442,6 +4455,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); diff --git a/mm/vmstat.c b/mm/vmstat.c index b0534e068166..a7ed56ac4c0b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -129,9 +129,9 @@ static void sum_vm_events(unsigned long *ret) */ void all_vm_events(unsigned long *ret) { - get_online_cpus(); + cpus_read_lock(); sum_vm_events(ret); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -1948,7 +1948,7 @@ static void vmstat_shepherd(struct work_struct *w) { int cpu; - get_online_cpus(); + cpus_read_lock(); /* Check processors whose vmstat worker threads have been disabled */ for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); @@ -1958,7 +1958,7 @@ static void vmstat_shepherd(struct work_struct *w) cond_resched(); } - put_online_cpus(); + cpus_read_unlock(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); @@ -2037,9 +2037,9 @@ void __init init_mm_internals(void) if (ret < 0) pr_err("vmstat: failed to register 'online' hotplug state\n"); - get_online_cpus(); + cpus_read_lock(); init_cpu_node_state(); - put_online_cpus(); + cpus_read_unlock(); start_shepherd_timer(); #endif |
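For orientation (not part of the commit itself): the filemap.c, readahead.c and truncate.c hunks above introduce mapping->invalidate_lock, taken shared by page faults and readahead before instantiating page cache pages, and taken exclusively by paths that evict pages and free the underlying blocks. A minimal sketch of the exclusive side, mirroring for a single mapping what filemap_invalidate_lock_two() does for two, is shown below; the helper name example_punch_hole() and the block-freeing step are illustrative assumptions, not code from this series.

/*
 * Illustrative sketch only -- how a filesystem hole-punch path is expected
 * to pair with the shared lockers added in filemap_fault(), filemap_create_page()
 * and page_cache_ra_unbounded() above.  example_punch_hole() and the
 * block-freeing step are hypothetical.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static int example_punch_hole(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * Exclusive invalidate_lock keeps out page faults and readahead,
	 * which now take the lock shared before adding pages to the
	 * page cache.
	 */
	down_write(&mapping->invalidate_lock);

	/* Evict cached pages in the range (lend is inclusive). */
	truncate_pagecache_range(inode, lstart, lend);

	/* ...free the underlying filesystem blocks here (fs specific)... */

	up_write(&mapping->invalidate_lock);
	return 0;
}

With this ordering, a fault racing with the hole punch either finds the page before eviction or blocks on invalidate_lock until the blocks are freed, which is the guarantee described in the filemap_create_page() comment above.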