Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/backing-dev.c | 13
-rw-r--r-- | mm/cma.c | 6
-rw-r--r-- | mm/compaction.c | 9
-rw-r--r-- | mm/failslab.c | 2
-rw-r--r-- | mm/gup.c | 4
-rw-r--r-- | mm/huge_memory.c | 36
-rw-r--r-- | mm/hugetlb.c | 27
-rw-r--r-- | mm/kasan/kasan.c | 15
-rw-r--r-- | mm/kmemleak.c | 12
-rw-r--r-- | mm/ksm.c | 36
-rw-r--r-- | mm/list_lru.c | 67
-rw-r--r-- | mm/memblock.c | 43
-rw-r--r-- | mm/memory-failure.c | 16
-rw-r--r-- | mm/memory.c | 33
-rw-r--r-- | mm/memory_hotplug.c | 50
-rw-r--r-- | mm/mmap.c | 14
-rw-r--r-- | mm/nommu.c | 12
-rw-r--r-- | mm/oom_kill.c | 12
-rw-r--r-- | mm/page_alloc.c | 386
-rw-r--r-- | mm/page_idle.c | 12
-rw-r--r-- | mm/page_isolation.c | 18
-rw-r--r-- | mm/page_owner.c | 2
-rw-r--r-- | mm/page_poison.c | 2
-rw-r--r-- | mm/pagewalk.c | 3
-rw-r--r-- | mm/percpu-stats.c | 13
-rw-r--r-- | mm/rmap.c | 1
-rw-r--r-- | mm/shmem.c | 5
-rw-r--r-- | mm/slab.c | 18
-rw-r--r-- | mm/slab.h | 27
-rw-r--r-- | mm/slab_common.c | 96
-rw-r--r-- | mm/slub.c | 150
-rw-r--r-- | mm/sparse.c | 8
-rw-r--r-- | mm/swap.c | 1
-rw-r--r-- | mm/swap_slots.c | 4
-rw-r--r-- | mm/swap_state.c | 144
-rw-r--r-- | mm/util.c | 10
-rw-r--r-- | mm/vmscan.c | 50
-rw-r--r-- | mm/z3fold.c | 35
-rw-r--r-- | mm/zsmalloc.c | 58
40 files changed, 874 insertions, 580 deletions
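
Note: several hunks below (mm/backing-dev.c, mm/memblock.c, mm/percpu-stats.c) drop hand-rolled single_open() boilerplate in favour of DEFINE_SHOW_ATTRIBUTE(). As a rough sketch of what that macro generates for a debugfs "foo" file (the authoritative definition lives in include/linux/seq_file.h; the expansion below is an approximation for illustration, not copied from this diff):

    /* Given a show routine named foo_show(), DEFINE_SHOW_ATTRIBUTE(foo)
     * expands to roughly the open helper and file_operations that the
     * removed code in these files used to spell out by hand.
     */
    static int foo_open(struct inode *inode, struct file *file)
    {
            return single_open(file, foo_show, inode->i_private);
    }

    static const struct file_operations foo_fops = {
            .owner   = THIS_MODULE,
            .open    = foo_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

With that in place, the callers only keep the *_show() function and pass the generated *_fops to debugfs_create_file(), which is why the conversions below are pure deletions plus a one-line macro invocation.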
diff --git a/mm/Makefile b/mm/Makefile index e669f02c5a54..b4e54a9ae9c5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o swap_slots.o \ + compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) @@ -55,7 +55,7 @@ ifdef CONFIG_MMU endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d2984e9fcf08..08b9aab631ab 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -100,18 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) return 0; } - -static int bdi_debug_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bdi_debug_stats_show, inode->i_private); -} - -static const struct file_operations bdi_debug_stats_fops = { - .open = bdi_debug_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) { @@ -35,6 +35,7 @@ #include <linux/cma.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/kmemleak.h> #include <trace/events/cma.h> #include "cma.h" @@ -165,6 +166,9 @@ core_initcall(cma_init_reserved_areas); * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. + * @name: The name of the area. If this parameter is NULL, the name of + * the area will be set to "cmaN", where N is a running counter of + * used areas. * @res_cma: Pointer to store the created cma region. * * This function creates custom contiguous area from already reserved memory. @@ -227,6 +231,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. * @fixed: hint about where to place the reserved area + * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * * This function reserves memory from early allocator. It should be @@ -390,6 +395,7 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP mask to use during compaction * * This function allocates part of contiguous memory on specific * contiguous memory area. diff --git a/mm/compaction.c b/mm/compaction.c index 2c8999d027ab..88d01a50a015 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -576,6 +576,7 @@ isolate_fail: /** * isolate_freepages_range() - isolate free pages. + * @cc: Compaction control structure. * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. 
* @@ -1988,6 +1989,14 @@ static void kcompactd_do_work(pg_data_t *pgdat) compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* + * Buddy pages may become stranded on pcps that could + * otherwise coalesce on the zone's free area for + * order >= cc.order. This is ratelimited by the + * upcoming deferral. + */ + drain_all_pages(zone); + + /* * We use sync migration mode here, so we defer like * sync direct compaction does. */ diff --git a/mm/failslab.c b/mm/failslab.c index 8087d976a809..1f2f248e3601 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -14,7 +14,7 @@ static struct { .cache_filter = false, }; -bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) +bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) @@ -531,7 +531,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; + *flags |= FOLL_COW; return 0; } @@ -1638,7 +1638,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, PMD_SHIFT, next, write, pages, nr)) return 0; } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; + return 0; } while (pmdp++, addr = next, addr != end); return 1; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5a68730eebd6..f0ae8d1d4329 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2356,26 +2356,13 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_refcount is zero and not changing from under us. But - * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_inc() or - * atomic_add(), we would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is implemented in C not - * using locked ops. spin_unlock on x86 sometime uses locked ops - * because of PPro errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_inc()/atomic_add(). + * Clone page flags before unfreezing refcount. + * + * After successful get_page_unless_zero() might follow flags change, + * for exmaple lock_page() which set PG_waiters. */ - if (PageAnon(head) && !PageSwapCache(head)) { - page_ref_inc(page_tail); - } else { - /* Additional pin to radix tree */ - page_ref_add(page_tail, 2); - } - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & ((1L << PG_referenced) | @@ -2388,14 +2375,21 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | (1L << PG_dirty))); - /* - * After clearing PageTail the gup refcount can be released. - * Page flags also must be visible before we make the page non-compound. - */ + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); + /* + * Clear PageTail before unfreezing page refcount. + * + * After successful get_page_unless_zero() might follow put_page() + * which needs correct compound_head(). + */ clear_compound_head(page_tail); + /* Finally unfreeze refcount. Additional reference from page cache. 
*/ + page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || + PageSwapCache(head))); + if (page_is_young(head)) set_page_young(page_tail); if (page_is_idle(head)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 976bbc5646fe..218679138255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,29 +637,22 @@ EXPORT_SYMBOL_GPL(linear_hugepage_index); */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { - struct hstate *hstate; - - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - hstate = hstate_vma(vma); - - return 1UL << huge_page_shift(hstate); + if (vma->vm_ops && vma->vm_ops->pagesize) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific version of this - * function is required. + * architectures where it differs, an architecture-specific 'strong' + * version of this symbol is required. */ -#ifndef vma_mmu_pagesize -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } -#endif /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom @@ -3153,6 +3146,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) return 0; } +static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate = hstate_vma(vma); + + return 1UL << huge_page_shift(hstate); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3170,6 +3170,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e13d911251e7..bc0e68f7dc75 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -323,9 +323,9 @@ void kasan_free_pages(struct page *page, unsigned int order) * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. */ -static size_t optimal_redzone(size_t object_size) +static unsigned int optimal_redzone(unsigned int object_size) { - int rz = + return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : object_size <= 512 - 64 ? 64 : @@ -333,14 +333,13 @@ static size_t optimal_redzone(size_t object_size) object_size <= (1 << 14) - 256 ? 256 : object_size <= (1 << 15) - 512 ? 512 : object_size <= (1 << 16) - 1024 ? 1024 : 2048; - return rz; } -void kasan_cache_create(struct kmem_cache *cache, size_t *size, +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { + unsigned int orig_size = *size; int redzone_adjust; - int orig_size = *size; /* Add alloc meta. 
*/ cache->kasan_info.alloc_meta_offset = *size; @@ -358,7 +357,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + *size = min_t(unsigned int, KMALLOC_MAX_SIZE, + max(*size, cache->object_size + optimal_redzone(cache->object_size))); /* @@ -382,7 +382,8 @@ void kasan_cache_shrink(struct kmem_cache *cache) void kasan_cache_shutdown(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + if (!__kmem_cache_empty(cache)) + quarantine_remove_cache(cache); } size_t kasan_metadata_size(struct kmem_cache *cache) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 46c2290a08f1..9a085d525bbc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1187,6 +1187,11 @@ EXPORT_SYMBOL(kmemleak_no_scan); /** * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical * address argument + * @phys: physical address of the object + * @size: size of the object + * @min_count: minimum number of references to this object. + * See kmemleak_alloc() + * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) @@ -1199,6 +1204,9 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); /** * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a * physical address argument + * @phys: physical address if the beginning or inside an object. This + * also represents the start of the range to be freed + * @size: size to be unregistered */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { @@ -1210,6 +1218,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); /** * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { @@ -1221,6 +1230,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { @@ -1963,7 +1973,7 @@ static void kmemleak_disable(void) /* * Allow boot-time kmemleak disabling (enabled by default). */ -static int kmemleak_boot_config(char *str) +static int __init kmemleak_boot_config(char *str) { if (!str) return -EINVAL; @@ -1318,10 +1318,10 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; @@ -2082,8 +2082,22 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { + bool split; + kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); + /* + * If both pages we tried to merge belong to the same compound + * page, then we actually ended up increasing the reference + * count of the same compound page twice, and split_huge_page + * failed. 
+ * Here we set a flag if that happened, and we use it later to + * try split_huge_page again. Since we call put_page right + * afterwards, the reference count will be correct and + * split_huge_page should succeed. + */ + split = PageTransCompound(page) + && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kpage) { /* @@ -2110,6 +2124,20 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) break_cow(tree_rmap_item); break_cow(rmap_item); } + } else if (split) { + /* + * We are here if we tried to merge two pages and + * failed because they both belonged to the same + * compound page. We will split the page now, but no + * merging will take place. + * We do not want to add the cost of a full lock; if + * the page is locked, it is better to skip it and + * perhaps try again later. + */ + if (!trylock_page(page)) + return; + split_huge_page(page); + unlock_page(page); } } } diff --git a/mm/list_lru.c b/mm/list_lru.c index fd41e969ede5..fcfb6c89ed47 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { + struct list_lru_memcg *memcg_lrus; /* - * The lock protects the array of per cgroup lists from relocation - * (see memcg_update_list_lru_node). + * Either lock or RCU protects the array of per cgroup lists + * from relocation (see memcg_update_list_lru_node). */ - lockdep_assert_held(&nlru->lock); - if (nlru->memcg_lrus && idx >= 0) - return nlru->memcg_lrus->lru[idx]; - + memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, + lockdep_is_held(&nlru->lock)); + if (memcg_lrus && idx >= 0) + return memcg_lrus->lru[idx]; return &nlru->lru; } @@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, struct list_lru_one *l; unsigned long count; - spin_lock(&nlru->lock); + rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_idx); count = l->nr_items; - spin_unlock(&nlru->lock); + rcu_read_unlock(); return count; } @@ -324,24 +325,41 @@ fail: static int memcg_init_list_lru_node(struct list_lru_node *nlru) { + struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); - if (!nlru->memcg_lrus) + memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + + size * sizeof(void *), GFP_KERNEL); + if (!memcg_lrus) return -ENOMEM; - if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kvfree(nlru->memcg_lrus); + if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { + kvfree(memcg_lrus); return -ENOMEM; } + RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); return 0; } static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { - __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kvfree(nlru->memcg_lrus); + struct list_lru_memcg *memcg_lrus; + /* + * This is called when shrinker has already been unregistered, + * and nobody can use it. So, there is no need to use kvfree_rcu(). 
+ */ + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); + __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); + kvfree(memcg_lrus); +} + +static void kvfree_rcu(struct rcu_head *head) +{ + struct list_lru_memcg *mlru; + + mlru = container_of(head, struct list_lru_memcg, rcu); + kvfree(mlru); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,8 +369,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); - old = nlru->memcg_lrus; - new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); + old = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); + new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -361,29 +380,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(new, old, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); /* - * The lock guarantees that we won't race with a reader - * (see list_lru_from_memcg_idx). + * The locking below allows readers that hold nlru->lock avoid taking + * rcu_read_lock (see list_lru_from_memcg_idx). * * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); - nlru->memcg_lrus = new; + rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - kvfree(old); + call_rcu(&old->rcu, kvfree_rcu); return 0; } static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { + struct list_lru_memcg *memcg_lrus; + + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); /* do not bother shrinking the array back to the old size, because we * cannot handle allocation failures here */ - __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); + __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); } static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) diff --git a/mm/memblock.c b/mm/memblock.c index 48376bd33274..9b04568ad42a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include <linux/poison.h> #include <linux/pfn.h> #include <linux/debugfs.h> +#include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> @@ -924,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, @@ -1040,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? 
- r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1345,7 +1346,7 @@ void * __init memblock_virt_alloc_try_nid_raw( min_addr, max_addr, nid); #ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, 0xff, size); + memset(ptr, PAGE_POISON_PATTERN, size); #endif return ptr; } @@ -1750,29 +1751,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -extern unsigned long __init_memblock -memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) -{ - struct memblock_region *rgn; - unsigned long size = 0; - int idx; - - for_each_memblock_type(idx, (&memblock.reserved), rgn) { - phys_addr_t start, end; - - if (rgn->base + rgn->size < start_addr) - continue; - if (rgn->base > end_addr) - continue; - - start = rgn->base; - end = start + rgn->size; - size += end - start; - } - - return size; -} - void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); @@ -1818,18 +1796,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) } return 0; } - -static int memblock_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, memblock_debug_show, inode->i_private); -} - -static const struct file_operations memblock_debug_fops = { - .open = memblock_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8291b75f42c8..2d4bf647cf01 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -502,6 +502,7 @@ static const char * const action_page_types[] = { [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) return 0; } + /* + * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so + * simply disable it. In order to make it work properly, we need + * make sure that: + * - conversion of a pud that maps an error hugetlb into hwpoison + * entry properly works, and + * - other mm code walking over page table is aware of pud-aligned + * hwpoison entries. 
+ */ + if (huge_page_size(page_hstate(head)) > PMD_SIZE) { + action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED); + res = -EBUSY; + goto out; + } + if (!hwpoison_user_mappings(p, pfn, flags, &head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; diff --git a/mm/memory.c b/mm/memory.c index aed37325d94e..01f5464e0fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2883,26 +2883,16 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache = NULL; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; - struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; - bool vma_readahead = swap_use_vma_readahead(); - if (vma_readahead) { - page = swap_readahead_detect(vmf, &swap_ra); - swapcache = page; - } - - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { - if (page) - put_page(page); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2928,11 +2918,8 @@ int do_swap_page(struct vm_fault *vmf) delayacct_set_flag(DELAYACCT_PF_SWAPIN); - if (!page) { - page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, - vmf->address); - swapcache = page; - } + page = lookup_swap_cache(entry, vma, vmf->address); + swapcache = page; if (!page) { struct swap_info_struct *si = swp_swap_info(entry); @@ -2940,7 +2927,8 @@ int do_swap_page(struct vm_fault *vmf) if (si->flags & SWP_SYNCHRONOUS_IO && __swap_count(si, entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -2949,12 +2937,8 @@ int do_swap_page(struct vm_fault *vmf) swap_readpage(page, true); } } else { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + vmf); swapcache = page; } @@ -2982,7 +2966,6 @@ int do_swap_page(struct vm_fault *vmf) */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - swapcache = page; goto out_release; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2bd52ff7605..cc6dfa5832ca 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, struct vmem_altmap *altmap, bool want_memblock) { int ret; - int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; @@ -259,27 +258,10 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, if (ret < 0) return ret; - /* - * Make all the pages reserved so that nobody will stumble over half - * initialized state. - * FIXME: We also have to associate it with a node because page_to_nid - * relies on having page with the proper node. 
- */ - for (i = 0; i < PAGES_PER_SECTION; i++) { - unsigned long pfn = phys_start_pfn + i; - struct page *page; - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - set_page_node(page, nid); - SetPageReserved(page); - } - if (!want_memblock) return 0; - return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); + return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn)); } /* @@ -559,6 +541,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * @zone: zone from which pages need to be removed * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) + * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make @@ -908,8 +891,15 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; + struct memory_block *mem; + + /* + * We can't use pfn_to_nid() because nid might be stored in struct page + * which is not yet initialized. Instead, we find nid from memory block. + */ + mem = find_memory_block(__pfn_to_section(pfn)); + nid = mem->nid; - nid = pfn_to_nid(pfn); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -1055,6 +1045,7 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined + * @nid: the node ID * * called by cpu_up() to online a node without onlined memory. */ @@ -1083,15 +1074,16 @@ out: static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = PFN_DOWN(start); + unsigned long block_sz = memory_block_size_bytes(); + u64 block_nr_pages = block_sz >> PAGE_SHIFT; u64 nr_pages = size >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); - /* Memory range must be aligned with section */ - if ((start_pfn & ~PAGE_SECTION_MASK) || - (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { - pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", - (unsigned long long)start, - (unsigned long long)size); + /* memory range must be block size aligned */ + if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || + !IS_ALIGNED(nr_pages, block_nr_pages)) { + pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", + block_sz, start, size); return -EINVAL; } @@ -1814,6 +1806,7 @@ static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) /** * try_offline_node + * @nid: the node ID * * Offline a node if all memory sections and cpus of the node are removed. * @@ -1857,6 +1850,9 @@ EXPORT_SYMBOL(try_offline_node); /** * remove_memory + * @nid: the node ID + * @start: physical address of the region to remove + * @size: size of the region to remove * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by diff --git a/mm/mmap.c b/mm/mmap.c index aa0dc8231c0d..f2154fc2548b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3191,13 +3191,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; - if (!ignore_rlimit_data) { - pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. 
Update limits or use boot option ignore_rlimit_data.\n", - current->comm, current->pid, - (mm->data_vm + npages) << PAGE_SHIFT, - rlimit(RLIMIT_DATA)); + + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA), + ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); + + if (!ignore_rlimit_data) return false; - } } return true; diff --git a/mm/nommu.c b/mm/nommu.c index 4f8720243ae7..13723736d38f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -457,18 +457,6 @@ void __weak vmalloc_sync_all(void) { } -/** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * - * Returns: NULL on failure, vm_struct on success - * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. - */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f2e7dfb81eee..ff992fa8760a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -185,6 +185,8 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation + * @memcg: task's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the @@ -224,13 +226,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); - /* - * Root processes get 3% bonus, just like the __vm_enough_memory() - * implementation used by LSMs. 
- */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= (points * 3) / 100; - /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; @@ -595,7 +590,8 @@ static void oom_reap_task(struct task_struct *tsk) while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts <= MAX_OOM_REAP_RETRIES) + if (attempts <= MAX_OOM_REAP_RETRIES || + test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea018263210..0b97b8ece4a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -265,17 +265,19 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; -static unsigned long __meminitdata nr_kernel_pages; -static unsigned long __meminitdata nr_all_pages; -static unsigned long __meminitdata dma_reserve; +static unsigned long nr_kernel_pages __meminitdata; +static unsigned long nr_all_pages __meminitdata; +static unsigned long dma_reserve __meminitdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; -static unsigned long __initdata required_movablecore; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static bool mirrored_kernelcore; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long required_kernelcore __initdata; +static unsigned long required_kernelcore_percent __initdata; +static unsigned long required_movablecore __initdata; +static unsigned long required_movablecore_percent __initdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - -/* - * Determine how many pages need to be initialized during early boot - * (non-deferred initialization). - * The value of first_deferred_pfn will be set later, once non-deferred pages - * are initialized, but for now set it ULONG_MAX. - */ -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ - phys_addr_t start_addr, end_addr; - unsigned long max_pgcnt; - unsigned long reserved; - - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_pgcnt = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - - /* - * Compensate the all the memblock reservations (e.g. crash kernel) - * from the initial estimation to make sure we will initialize enough - * memory to boot. 
- */ - start_addr = PFN_PHYS(pgdat->node_start_pfn); - end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); - reserved = memblock_reserved_memory_within(start_addr, end_addr); - max_pgcnt += PHYS_PFN(reserved); - - pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -} - /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -361,10 +329,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, return true; } #else -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ -} - static inline bool early_page_uninitialised(unsigned long pfn) { return false; @@ -1099,6 +1063,15 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ +static inline void prefetch_buddy(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + struct page *buddy = page + (buddy_pfn - pfn); + + prefetch(buddy); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1115,13 +1088,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int prefetch_nr = 0; bool isolated_pageblocks; - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); + struct page *page, *tmp; + LIST_HEAD(head); while (count) { - struct page *page; struct list_head *list; /* @@ -1143,26 +1115,48 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = count; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - - mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); + pcp->count--; if (bulkfree_pcp_prepare(page)) continue; - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add_tail(&page->lru, &head); + + /* + * We are going to put the page back to the global + * pool, prefetch its buddy to speed up later access + * under zone->lock. It is believed the overhead of + * an additional test and calculating buddy_pfn here + * can be offset by reduced memory latency later. To + * avoid excessive prefetching due to large count, only + * prefetch buddy for the first pcp->batch nr of pages. + */ + if (prefetch_nr++ < pcp->batch) + prefetch_buddy(page); } while (--count && --batch_free && !list_empty(list)); } + + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. 
+ */ + list_for_each_entry_safe(page, tmp, &head, lru) { + int mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + } spin_unlock(&zone->lock); } @@ -1181,10 +1175,9 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid, bool zero) + unsigned long zone, int nid) { - if (zero) - mm_zero_struct_page(page); + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1198,12 +1191,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } -static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid, bool zero) -{ - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); -} - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -1222,7 +1209,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid, true); + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1506,7 +1493,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - cond_resched(); + touch_nmi_watchdog(); } else { nr_free++; } @@ -1535,11 +1522,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - cond_resched(); + touch_nmi_watchdog(); } else { page++; } - __init_single_page(page, pfn, zid, nid, true); + __init_single_page(page, pfn, zid, nid); nr_pages++; } return (nr_pages); @@ -1552,23 +1539,25 @@ static int __init deferred_init_memmap(void *data) int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long spfn, epfn; + unsigned long spfn, epfn, first_init_pfn, flags; phys_addr_t spa, epa; int zid; struct zone *zone; - unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); u64 i; + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + pgdat_resize_lock(pgdat, &flags); + first_init_pfn = pgdat->first_deferred_pfn; if (first_init_pfn == ULONG_MAX) { + pgdat_resize_unlock(pgdat, &flags); pgdat_init_report_one_done(); return 0; } - /* Bind memory initialisation thread to a local node if possible */ - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(current, cpumask); - /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); @@ -1598,6 +1587,7 @@ static int __init deferred_init_memmap(void *data) epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); deferred_free_pages(nid, zid, spfn, epfn); } + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ 
-1608,6 +1598,117 @@ static int __init deferred_init_memmap(void *data) pgdat_init_report_one_done(); return 0; } + +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * If this zone has deferred pages, try to grow it by initializing enough + * deferred pages to satisfy the allocation specified by order, rounded up to + * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments + * of SECTION_SIZE bytes by initializing struct pages in increments of + * PAGES_PER_SECTION * sizeof(struct page) bytes. + * + * Return true when zone was grown, otherwise return false. We return true even + * when we grow less than requested, to let the caller decide if there are + * enough pages to satisfy the allocation. + * + * Note: We use noinline because this function is needed only during boot, and + * it is called from a __ref function _deferred_grow_zone. This way we are + * making sure that it is not inlined into permanent text section. + */ +static noinline bool __init +deferred_grow_zone(struct zone *zone, unsigned int order) +{ + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages = 0; + unsigned long first_init_pfn, spfn, epfn, t, flags; + unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + phys_addr_t spa, epa; + u64 i; + + /* Only the last zone may have deferred pages */ + if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) + return false; + + pgdat_resize_lock(pgdat, &flags); + + /* + * If deferred pages have been initialized while we were waiting for + * the lock, return true, as the zone was grown. The caller will retry + * this zone. We won't return to this function since the caller also + * has this static branch. + */ + if (!static_branch_unlikely(&deferred_pages)) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + /* + * If someone grew this zone while we were waiting for spinlock, return + * true, as there might be enough pages already. 
+ */ + if (first_deferred_pfn != pgdat->first_deferred_pfn) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); + + if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + pgdat_resize_unlock(pgdat, &flags); + return false; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + + while (spfn < epfn && nr_pages < nr_pages_needed) { + t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); + first_deferred_pfn = min(t, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, + first_deferred_pfn); + spfn = first_deferred_pfn; + } + + if (nr_pages >= nr_pages_needed) + break; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); + + if (first_deferred_pfn == epfn) + break; + } + pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat_resize_unlock(pgdat, &flags); + + return nr_pages > 0; +} + +/* + * deferred_grow_zone() is __init, but it is called from + * get_page_from_freelist() during early boot until deferred_pages permanently + * disables this call. This is why we have refdata wrapper to avoid warning, + * and to ensure that the function body gets unloaded. + */ +static bool __ref +_deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return deferred_grow_zone(zone, order); +} + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) @@ -1626,6 +1727,12 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ + static_branch_disable(&deferred_pages); + /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif @@ -2418,10 +2525,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) local_irq_save(flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) { + if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; - } local_irq_restore(flags); } #endif @@ -2443,10 +2548,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { + if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } local_irq_restore(flags); } @@ -2670,7 +2773,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); - pcp->count -= batch; } } @@ -3205,6 +3307,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, ac_classzone_idx(ac), alloc_flags)) { int ret; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. 
+ */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) @@ -3246,6 +3358,14 @@ try_this_zone: reserve_highatomic_pageblock(page, zone, order); return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif } } @@ -3685,16 +3805,18 @@ retry: return page; } -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; + enum zone_type high_zoneidx = ac->high_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, order, ac->high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -3973,7 +4095,7 @@ retry_cpuset: goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try @@ -4031,7 +4153,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) @@ -5334,6 +5456,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; + struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; #endif @@ -5386,6 +5509,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #endif not_early: + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMMAP_HOTPLUG) + SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -5402,15 +5530,8 @@ not_early: * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone, nid, - context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); - } else { - __init_single_pfn(pfn, zone, nid, - context != MEMMAP_HOTPLUG); } } } @@ -6241,7 +6362,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, alloc_node_mem_map(pgdat); - reset_deferred_meminit(pgdat); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. 
+ */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +#endif free_area_init_core(pgdat); } @@ -6471,7 +6600,18 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* - * If movablecore=nn[KMG] was specified, calculate what size of + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. + */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -6711,18 +6851,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zero_resv_unavail(); } -static int __init cmdline_parse_core(char *p, unsigned long *core) +static int __init cmdline_parse_core(char *p, unsigned long *core, + unsigned long *percent) { unsigned long long coremem; + char *endptr; + if (!p) return -EINVAL; - coremem = memparse(p, &p); - *core = coremem >> PAGE_SHIFT; + /* Value may be a percentage of total memory, otherwise bytes */ + coremem = simple_strtoull(p, &endptr, 0); + if (*endptr == '%') { + /* Paranoid check for percent values greater than 100 */ + WARN_ON(coremem > 100); - /* Paranoid check that UL is enough for the coremem value */ - WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *percent = coremem; + } else { + coremem = memparse(p, &p); + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *core = coremem >> PAGE_SHIFT; + *percent = 0UL; + } return 0; } @@ -6738,7 +6890,8 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore); + return cmdline_parse_core(p, &required_kernelcore, + &required_kernelcore_percent); } /* @@ -6747,7 +6900,8 @@ static int __init cmdline_parse_kernelcore(char *p) */ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore); + return cmdline_parse_core(p, &required_movablecore, + &required_movablecore_percent); } early_param("kernelcore", cmdline_parse_kernelcore); @@ -7591,7 +7745,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CMA); + NULL, 0, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -7611,11 +7765,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned, however it's the caller's responsibility to guarantee that - * we are the only thread that changes migrate type of pageblocks the - * pages fall in. + * aligned. The PFN range must belong to a single zone. * - * The PFN range must belong to a single zone. + * The first thing this routine does is attempt to MIGRATE_ISOLATE all + * pageblocks in the range. Once isolated, the pageblocks should not + * be modified by others. * * Returns zero on success or negative error code. 
On success all * pages which PFN is in [start, end) are allocated for the caller and diff --git a/mm/page_idle.c b/mm/page_idle.c index 0a49374e6931..e412a63b2b74 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -65,11 +65,15 @@ static bool page_idle_clear_pte_refs_one(struct page *page, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - referenced = ptep_clear_young_notify(vma, addr, - pvmw.pte); + /* + * For PTE-mapped THP, one sub page is referenced, + * the whole THP is referenced. + */ + if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + referenced = true; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - referenced = pmdp_clear_young_notify(vma, addr, - pvmw.pmd); + if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) + referenced = true; } else { /* unexpected pmd-mapped page? */ WARN_ON_ONCE(1); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 165ed8117bd1..61dee77bb211 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -28,6 +28,14 @@ static int set_migratetype_isolate(struct page *page, int migratetype, spin_lock_irqsave(&zone->lock, flags); + /* + * We assume the caller intended to SET migrate type to isolate. + * If it is already set, then someone else must have raced and + * set it before us. Return -EBUSY + */ + if (is_migrate_isolate_page(page)) + goto out; + pfn = page_to_pfn(page); arg.start_pfn = pfn; arg.nr_pages = pageblock_nr_pages; @@ -166,7 +174,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * future will not be allocated again. * * start_pfn/end_pfn must be aligned to pageblock_order. - * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + * Return 0 on success and -EBUSY if any part of range cannot be isolated. + * + * There is no high level synchronization mechanism that prevents two threads + * from trying to isolate overlapping ranges. If this happens, one thread + * will notice pageblocks in the overlapping range already set to isolate. + * This happens in set_migratetype_isolate, and set_migratetype_isolate + * returns an error. We then clean up by restoring the migration type on + * pageblocks we may have modified and return -EBUSY to caller. This + * prevents two threads from simultaneously working on overlapping ranges. 
*/ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, bool skip_hwpoisoned_pages) diff --git a/mm/page_owner.c b/mm/page_owner.c index 7172e0a80e13..75d21a2259b3 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -35,7 +35,7 @@ static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); -static int early_page_owner_param(char *buf) +static int __init early_page_owner_param(char *buf) { if (!buf) return -EINVAL; diff --git a/mm/page_poison.c b/mm/page_poison.c index e83fd44867de..aa2b3d34e8ea 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -9,7 +9,7 @@ static bool want_page_poisoning __read_mostly; -static int early_page_poison_param(char *buf) +static int __init early_page_poison_param(char *buf) { if (!buf) return -EINVAL; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8d2da5dec1e0..c3084ff2569d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -258,6 +258,9 @@ static int __walk_page_range(unsigned long start, unsigned long end, /** * walk_page_range - walk page table with caller specific callbacks + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @walk: mm_walk structure defining the callbacks and the target address space * * Recursively walk the page table tree of the process represented by @walk->mm * within the virtual address range [@start, @end). During walking, we can do diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 7a58460bfd27..063ff60ecd90 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -223,18 +223,7 @@ alloc_buffer: return 0; } - -static int percpu_stats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, percpu_stats_show, NULL); -} - -static const struct file_operations percpu_stats_fops = { - .open = percpu_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(percpu_stats); static int __init init_percpu_stats_debugfs(void) { diff --git a/mm/rmap.c b/mm/rmap.c index 144c66e688a9..9122787c4947 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1171,6 +1171,7 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index b85919243399..4424fc0c33aa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1422,9 +1422,12 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; + struct vm_fault vmf; shmem_pseudo_vma_init(&pvma, info, index); - page = swapin_readahead(swap, gfp, &pvma, 0); + vmf.vma = &pvma; + vmf.address = 0; + page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); return page; diff --git a/mm/slab.c b/mm/slab.c index 9095c3945425..e3a9b8e23306 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1869,7 +1869,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1877,7 +1877,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size, } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1994,7 +1994,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; - size_t size = cachep->size; + unsigned int size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2291,6 +2291,18 @@ out: return nr_freed; } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) + return false; + return true; +} + int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; diff --git a/mm/slab.h b/mm/slab.h index 51813236e773..68bdf498da3b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -22,8 +22,8 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -77,7 +77,7 @@ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name; - unsigned long size; + unsigned int size; } kmalloc_info[]; #ifndef CONFIG_SLOB @@ -93,31 +93,31 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); -extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize); +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, slab_flags_t flags, size_t useroffset, - size_t usersize); + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned size, unsigned align, 
slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline slab_flags_t kmem_cache_flags(unsigned long object_size, +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, SLAB_TEMPORARY | \ SLAB_ACCOUNT) +bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 10f127b2de7c..98dcdc352062 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -10,6 +10,7 @@ #include <linux/poison.h> #include <linux/interrupt.h> #include <linux/memory.h> +#include <linux/cache.h> #include <linux/compiler.h> #include <linux/module.h> #include <linux/cpu.h> @@ -81,38 +82,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(const char *name, unsigned int size) { - struct kmem_cache *s = NULL; - if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { pr_err("kmem_cache_create(%s) integrity check failed\n", name); return -EINVAL; } - list_for_each_entry(s, &slab_caches, list) { - char tmp; - int res; - - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. - */ - res = probe_kernel_address(s->name, tmp); - if (res) { - pr_err("Slab cache with size %d has lost its name\n", - s->object_size); - continue; - } - } - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, unsigned int size) { return 0; } @@ -279,8 +261,8 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) +static unsigned int calculate_alignment(slab_flags_t flags, + unsigned int align, unsigned int size) { /* * If the user wants hardware cache aligned objects then follow that @@ -290,7 +272,7 @@ static unsigned long calculate_alignment(unsigned long flags, * alignment though. If that is greater then use it. 
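/*
 * Editorial sketch, not part of the patch: a stand-alone model of the
 * SLAB_HWCACHE_ALIGN branch of calculate_alignment() shown above.  It starts
 * from the cache line size and halves it while the object still fits in half
 * of it, so tiny objects are not padded out to a whole line.  The 64-byte
 * line size and the word-size floor are assumptions for the example only.
 */
#include <stdio.h>

static unsigned int hwcache_align(unsigned int size)
{
	unsigned int ralign = 64;		/* assumed cache_line_size() */

	while (size <= ralign / 2)
		ralign /= 2;
	if (ralign < sizeof(void *))
		ralign = sizeof(void *);
	return ralign;
}

int main(void)
{
	static const unsigned int sizes[] = { 8, 16, 24, 32, 40, 64, 96, 192 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("object %3u -> align %2u\n", sizes[i],
		       hwcache_align(sizes[i]));
	return 0;
}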
*/ if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign; + unsigned int ralign; ralign = cache_line_size(); while (size <= ralign / 2) @@ -330,7 +312,7 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -378,9 +360,9 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, } static struct kmem_cache *create_cache(const char *name, - size_t object_size, size_t size, size_t align, - slab_flags_t flags, size_t useroffset, - size_t usersize, void (*ctor)(void *), + unsigned int object_size, unsigned int align, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -395,8 +377,7 @@ static struct kmem_cache *create_cache(const char *name, goto out; s->name = name; - s->object_size = object_size; - s->size = size; + s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; s->useroffset = useroffset; @@ -451,8 +432,10 @@ out_free_cache: * as davem. */ struct kmem_cache * -kmem_cache_create_usercopy(const char *name, size_t size, size_t align, - slab_flags_t flags, size_t useroffset, size_t usersize, +kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int align, + slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s = NULL; @@ -500,7 +483,7 @@ kmem_cache_create_usercopy(const char *name, size_t size, size_t align, goto out_unlock; } - s = create_cache(cache_name, size, size, + s = create_cache(cache_name, size, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor, NULL, NULL); if (IS_ERR(s)) { @@ -531,7 +514,7 @@ out_unlock: EXPORT_SYMBOL(kmem_cache_create_usercopy); struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, +kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, @@ -647,7 +630,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; s = create_cache(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, + root_cache->align, root_cache->flags & CACHE_CREATE_MASK, root_cache->useroffset, root_cache->usersize, root_cache->ctor, memcg, root_cache); @@ -916,8 +899,9 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ -void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - slab_flags_t flags, size_t useroffset, size_t usersize) +void __init create_boot_cache(struct kmem_cache *s, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { int err; @@ -932,15 +916,15 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz err = __kmem_cache_create(s, flags); if (err) - panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", + panic("Creation of kmalloc slab %s size=%u failed. 
Reason %d\n", name, size, err); s->refcount = -1; /* Exempt from merging for now */ } -struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize) +struct kmem_cache *__init create_kmalloc_cache(const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -954,11 +938,11 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_dma_caches); #endif @@ -968,7 +952,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); * of two cache sizes there. The size of larger slabs can be determined using * fls. */ -static s8 size_index[24] = { +static u8 size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ @@ -995,7 +979,7 @@ static s8 size_index[24] = { 2 /* 192 */ }; -static inline int size_index_elem(size_t bytes) +static inline unsigned int size_index_elem(unsigned int bytes) { return (bytes - 1) / 8; } @@ -1006,7 +990,7 @@ static inline int size_index_elem(size_t bytes) */ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { - int index; + unsigned int index; if (unlikely(size > KMALLOC_MAX_SIZE)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); @@ -1064,13 +1048,13 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { */ void __init setup_kmalloc_cache_index_table(void) { - int i; + unsigned int i; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); + unsigned int elem = size_index_elem(i); if (elem >= ARRAY_SIZE(size_index)) break; @@ -1137,9 +1121,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) struct kmem_cache *s = kmalloc_caches[i]; if (s) { - int size = kmalloc_size(i); + unsigned int size = kmalloc_size(i); char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", size); + "dma-kmalloc-%u", size); BUG_ON(!n); kmalloc_dma_caches[i] = create_kmalloc_cache(n, @@ -1182,10 +1166,10 @@ EXPORT_SYMBOL(kmalloc_order_trace); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(struct rnd_state *state, unsigned int *list, - size_t count) + unsigned int count) { - size_t i; unsigned int rand; + unsigned int i; for (i = 0; i < count; i++) list[i] = i; @@ -1532,3 +1516,11 @@ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); + +int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); diff --git a/mm/slub.c b/mm/slub.c index e381728a3751..4fb037c98782 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -311,18 +311,18 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p += (__s)->size, __idx++) /* Determine object index from a given position */ -static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) 
{ return (p - addr) / s->size; } -static inline int order_objects(int order, unsigned long size, int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) { - return ((PAGE_SIZE << order) - reserved) / size; + return (((unsigned int)PAGE_SIZE << order) - reserved) / size; } -static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size, int reserved) +static inline struct kmem_cache_order_objects oo_make(unsigned int order, + unsigned int size, unsigned int reserved) { struct kmem_cache_order_objects x = { (order << OO_SHIFT) + order_objects(order, size, reserved) @@ -331,12 +331,12 @@ static inline struct kmem_cache_order_objects oo_make(int order, return x; } -static inline int oo_order(struct kmem_cache_order_objects x) +static inline unsigned int oo_order(struct kmem_cache_order_objects x) { return x.x >> OO_SHIFT; } -static inline int oo_objects(struct kmem_cache_order_objects x) +static inline unsigned int oo_objects(struct kmem_cache_order_objects x) { return x.x & OO_MASK; } @@ -466,7 +466,7 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } -static inline int size_from_object(struct kmem_cache *s) +static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) return s->size - s->red_left_pad; @@ -598,13 +598,13 @@ static void init_tracking(struct kmem_cache *s, void *object) set_track(s, object, TRACK_ALLOC, 0UL); } -static void print_track(const char *s, struct track *t) +static void print_track(const char *s, struct track *t, unsigned long pr_time) { if (!t->addr) return; pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; @@ -619,11 +619,12 @@ static void print_track(const char *s, struct track *t) static void print_tracking(struct kmem_cache *s, void *object) { + unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) return; - print_track("Allocated", get_track(s, object, TRACK_ALLOC)); - print_track("Freed", get_track(s, object, TRACK_FREE)); + print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time); + print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); } static void print_page_info(struct page *page) @@ -680,7 +681,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); print_section(KERN_ERR, "Object ", p, - min_t(unsigned long, s->object_size, PAGE_SIZE)); + min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); @@ -1292,7 +1293,7 @@ out: __setup("slub_debug", setup_slub_debug); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1325,7 +1326,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1435,7 +1436,7 @@ static inline struct page *alloc_slab_page(struct 
kmem_cache *s, gfp_t flags, int node, struct kmem_cache_order_objects oo) { struct page *page; - int order = oo_order(oo); + unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); @@ -1454,8 +1455,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, /* Pre-initialize the random sequence cache */ static int init_cache_random_seq(struct kmem_cache *s) { + unsigned int count = oo_objects(s->oo); int err; - unsigned long i, count = oo_objects(s->oo); /* Bailout if already initialised */ if (s->random_seq) @@ -1470,6 +1471,8 @@ static int init_cache_random_seq(struct kmem_cache *s) /* Transform to an offset on the set of pages */ if (s->random_seq) { + unsigned int i; + for (i = 0; i < count; i++) s->random_seq[i] *= s->size; } @@ -1811,7 +1814,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; - int available = 0; + unsigned int available = 0; int objects; /* @@ -2398,7 +2401,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", nid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3181,9 +3184,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * and increases the number of allocations possible without having to * take the list_lock. */ -static int slub_min_order; -static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects; +static unsigned int slub_min_order; +static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_min_objects; /* * Calculate the order of allocation given an slab object size. @@ -3210,20 +3213,21 @@ static int slub_min_objects; * requested a higher mininum order then we start with that one instead of * the smallest order which will fit the object. */ -static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover, int reserved) +static inline unsigned int slab_order(unsigned int size, + unsigned int min_objects, unsigned int max_order, + unsigned int fract_leftover, unsigned int reserved) { - int order; - int rem; - int min_order = slub_min_order; + unsigned int min_order = slub_min_order; + unsigned int order; if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); order <= max_order; order++) { - unsigned long slab_size = PAGE_SIZE << order; + unsigned int slab_size = (unsigned int)PAGE_SIZE << order; + unsigned int rem; rem = (slab_size - reserved) % size; @@ -3234,12 +3238,11 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size, int reserved) +static inline int calculate_order(unsigned int size, unsigned int reserved) { - int order; - int min_objects; - int fraction; - int max_objects; + unsigned int order; + unsigned int min_objects; + unsigned int max_objects; /* * Attempt to find best configuration for a slab. 
This @@ -3256,6 +3259,8 @@ static inline int calculate_order(int size, int reserved) min_objects = min(min_objects, max_objects); while (min_objects > 1) { + unsigned int fraction; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, @@ -3457,8 +3462,8 @@ static void set_cpu_partial(struct kmem_cache *s) static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; - size_t size = s->object_size; - int order; + unsigned int size = s->object_size; + unsigned int order; /* * Round up object size to the next word boundary. We can only @@ -3548,7 +3553,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) else order = calculate_order(size, s->reserved); - if (order < 0) + if ((int)order < 0) return 0; s->allocflags = 0; @@ -3632,8 +3637,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, + panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", + s->name, s->size, s->size, oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } @@ -3691,6 +3696,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || slabs_node(s, node)) + return false; + return true; +} + /* * Release all resources used by a slab cache. */ @@ -3716,7 +3732,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) static int __init setup_slub_min_order(char *str) { - get_option(&str, &slub_min_order); + get_option(&str, (int *)&slub_min_order); return 1; } @@ -3725,8 +3741,8 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { - get_option(&str, &slub_max_order); - slub_max_order = min(slub_max_order, MAX_ORDER - 1); + get_option(&str, (int *)&slub_max_order); + slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); return 1; } @@ -3735,7 +3751,7 @@ __setup("slub_max_order=", setup_slub_max_order); static int __init setup_slub_min_objects(char *str) { - get_option(&str, &slub_min_objects); + get_option(&str, (int *)&slub_min_objects); return 1; } @@ -3824,7 +3840,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user) { struct kmem_cache *s; - unsigned long offset; + unsigned int offset; size_t object_size; /* Find object and usable object size. */ @@ -4230,7 +4246,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4241,7 +4257,7 @@ void __init kmem_cache_init_late(void) } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4254,13 +4270,12 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, * Adjust the object sizes so that we clear * the complete object on kzalloc. 
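/*
 * Editorial sketch, not part of the patch: a stand-alone model of the order
 * search done by slab_order() in the hunks above, which now uses unsigned
 * arithmetic.  It returns the lowest order (up to max_order) at which the
 * space left over after packing whole objects is at most 1/fract_leftover of
 * the slab.  The 4K page size is an assumption; slub_min_order and the
 * MAX_OBJS_PER_PAGE clamp from the real function are omitted.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

/* Rough stand-in for get_order(): smallest order whose slab covers @bytes. */
static unsigned int order_for(unsigned int bytes)
{
	unsigned int order = 0;

	while ((MODEL_PAGE_SIZE << order) < bytes)
		order++;
	return order;
}

static unsigned int slab_order_model(unsigned int size, unsigned int min_objects,
				     unsigned int max_order,
				     unsigned int fract_leftover,
				     unsigned int reserved)
{
	unsigned int order;

	for (order = order_for(min_objects * size + reserved);
	     order <= max_order; order++) {
		unsigned int slab_size = MODEL_PAGE_SIZE << order;
		unsigned int rem = (slab_size - reserved) % size;

		if (rem <= slab_size / fract_leftover)
			break;
	}
	return order;
}

int main(void)
{
	/*
	 * 700-byte objects: order 0 leaves 596 of 4096 bytes unused (more than
	 * 4096/16), order 1 leaves 492 of 8192 (within 8192/16), so order 1 wins.
	 */
	printf("order = %u\n", slab_order_model(700, 4, 3, 16, 0));
	return 0;
}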
*/ - s->object_size = max(s->object_size, (int)size); - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); for_each_memcg_cache(c, s) { c->object_size = s->object_size; - c->inuse = max_t(int, c->inuse, - ALIGN(size, sizeof(void *))); + c->inuse = max(c->inuse, ALIGN(size, sizeof(void *))); } if (sysfs_slab_alias(s, name)) { @@ -4889,35 +4904,35 @@ struct slab_attribute { static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->size); + return sprintf(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->align); + return sprintf(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->object_size); + return sprintf(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_objects(s->oo)); + return sprintf(buf, "%u\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long order; + unsigned int order; int err; - err = kstrtoul(buf, 10, &order); + err = kstrtouint(buf, 10, &order); if (err) return err; @@ -4930,7 +4945,7 @@ static ssize_t order_store(struct kmem_cache *s, static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_order(s->oo)); + return sprintf(buf, "%u\n", oo_order(s->oo)); } SLAB_ATTR(order); @@ -4962,10 +4977,10 @@ static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long objects; + unsigned int objects; int err; - err = kstrtoul(buf, 10, &objects); + err = kstrtouint(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) @@ -5081,7 +5096,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%zu\n", s->usersize); + return sprintf(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); @@ -5093,7 +5108,7 @@ SLAB_ATTR_RO(destroy_by_rcu); static ssize_t reserved_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->reserved); + return sprintf(buf, "%u\n", s->reserved); } SLAB_ATTR_RO(reserved); @@ -5288,21 +5303,22 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); + return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long ratio; + unsigned int ratio; int err; - err = kstrtoul(buf, 10, &ratio); + err = kstrtouint(buf, 10, &ratio); if (err) return err; + if (ratio > 100) + return -ERANGE; - if (ratio <= 100) - s->remote_node_defrag_ratio = ratio * 10; + s->remote_node_defrag_ratio = ratio * 10; return length; } @@ -5663,7 +5679,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07d", s->size); + p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); return name; diff --git a/mm/sparse.c b/mm/sparse.c index 58cab483e81b..62eef264a7bd 100644 --- a/mm/sparse.c 
+++ b/mm/sparse.c @@ -779,7 +779,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } - memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); +#ifdef CONFIG_DEBUG_VM + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. + */ + memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); +#endif section_mark_present(ms); diff --git a/mm/swap.c b/mm/swap.c index 0f17330dd0e5..3dd518832096 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -707,7 +707,6 @@ void lru_add_drain_all(void) * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages - * @cold: whether the pages are cache cold * * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. diff --git a/mm/swap_slots.c b/mm/swap_slots.c index bebc19292018..f2641894f440 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -34,8 +34,6 @@ #include <linux/mutex.h> #include <linux/mm.h> -#ifdef CONFIG_SWAP - static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; @@ -356,5 +354,3 @@ repeat: return entry; } - -#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 39ae7cfad90f..f233dccd3b1b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; -bool swap_vma_readahead __read_mostly = true; +static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -322,6 +322,11 @@ void free_pages_and_swap_cache(struct page **pages, int nr) release_pages(pagep, nr); } +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -332,32 +337,43 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long ra_info; - int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); INC_CACHE_INFO(find_total); if (page) { + bool vma_ra = swap_use_vma_readahead(); + bool readahead; + INC_CACHE_INFO(find_success); + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. 
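/*
 * Editorial sketch, not part of the patch: a stand-alone model of the per-VMA
 * readahead statistics word updated in lookup_swap_cache() above.  The layout
 * (page-aligned address in the high bits, readahead window above the hit
 * count, hit count in the low SWAP_RA_WIN_SHIFT bits) is inferred from the
 * SWAP_RA_* macros, only some of which are visible in this hunk, so treat the
 * exact bit assignment as an assumption.  PAGE_SHIFT is taken as 12.
 */
#include <stdio.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_MASK		(~((1UL << MODEL_PAGE_SHIFT) - 1))
#define RA_WIN_SHIFT		(MODEL_PAGE_SHIFT / 2)
#define RA_HITS_MASK		((1UL << RA_WIN_SHIFT) - 1)
#define RA_HITS_MAX		RA_HITS_MASK
#define RA_WIN_MASK		(~MODEL_PAGE_MASK & ~RA_HITS_MASK)

static unsigned long ra_pack(unsigned long addr, unsigned long win,
			     unsigned long hits)
{
	return (addr & MODEL_PAGE_MASK) |
	       ((win << RA_WIN_SHIFT) & RA_WIN_MASK) |
	       (hits & RA_HITS_MASK);
}

int main(void)
{
	unsigned long v = ra_pack(0x7f1234567000UL, 4, 2);
	unsigned long win = (v & RA_WIN_MASK) >> RA_WIN_SHIFT;
	unsigned long hits = v & RA_HITS_MASK;

	/* A hit on a PG_readahead page bumps the hit count, capped at the mask. */
	if (hits < RA_HITS_MAX)
		hits++;
	v = ra_pack(0x7f1234568000UL, win, hits);

	printf("addr=%#lx win=%lu hits=%lu\n",
	       v & MODEL_PAGE_MASK, (v & RA_WIN_MASK) >> RA_WIN_SHIFT,
	       v & RA_HITS_MASK);
	return 0;
}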
+ */ if (unlikely(PageTransCompound(page))) return page; + readahead = TestClearPageReadahead(page); - if (vma) { - ra_info = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_info); - hits = SWAP_RA_HITS(ra_info); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } + if (readahead) { count_vm_event(SWAP_RA_HIT); - if (!vma) + if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } + return page; } @@ -533,11 +549,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) } /** - * swapin_readahead - swap in pages in hope we need them soon + * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vma: user vma this address belongs to - * @addr: target address for mempolicy + * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. * @@ -549,10 +564,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. */ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -562,6 +577,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -586,8 +603,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (offset != entry_offset && - likely(!PageTransCompound(page))) { + if (offset != entry_offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -649,16 +665,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); } -struct page *swap_readahead_detect(struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) +static void swap_ra_info(struct vm_fault *vmf, + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; - unsigned long swap_ra_info; - struct page *page; + unsigned long ra_val; swp_entry_t entry; unsigned long faddr, pfn, fpfn; unsigned long start, end; - pte_t *pte; + pte_t *pte, *orig_pte; unsigned int max_win, hits, prev_win, win, left; #ifndef CONFIG_64BIT pte_t *tpte; @@ -667,30 +682,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) { - swap_ra->win = 1; - return NULL; + ra_info->win = 1; + return; } faddr = vmf->address; - entry = pte_to_swp_entry(vmf->orig_pte); - if ((unlikely(non_swap_entry(entry)))) - return NULL; - page = lookup_swap_cache(entry, vma, faddr); - if (page) - return page; + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); + entry = pte_to_swp_entry(*pte); + if ((unlikely(non_swap_entry(entry)))) { + 
pte_unmap(orig_pte); + return; + } fpfn = PFN_DOWN(faddr); - swap_ra_info = GET_SWAP_RA_VAL(vma); - pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); - prev_win = SWAP_RA_WIN(swap_ra_info); - hits = SWAP_RA_HITS(swap_ra_info); - swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + ra_val = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) - return NULL; + if (win == 1) { + pte_unmap(orig_pte); + return; + } /* Copy the PTEs because the page table may be unmapped */ if (fpfn == pfn + 1) @@ -703,23 +720,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, &start, &end); } - swap_ra->nr_pte = end - start; - swap_ra->offset = fpfn - start; - pte = vmf->pte - swap_ra->offset; + ra_info->nr_pte = end - start; + ra_info->offset = fpfn - start; + pte -= ra_info->offset; #ifdef CONFIG_64BIT - swap_ra->ptes = pte; + ra_info->ptes = pte; #else - tpte = swap_ra->ptes; + tpte = ra_info->ptes; for (pfn = start; pfn != end; pfn++) *tpte++ = *pte++; #endif - - return NULL; + pte_unmap(orig_pte); } -struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) +static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct blk_plug plug; struct vm_area_struct *vma = vmf->vma; @@ -728,12 +743,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, swp_entry_t entry; unsigned int i; bool page_allocated; + struct vma_swap_readahead ra_info = {0,}; - if (swap_ra->win == 1) + swap_ra_info(vmf, &ra_info); + if (ra_info.win == 1) goto skip; blk_start_plug(&plug); - for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; i++, pte++) { pentry = *pte; if (pte_none(pentry)) @@ -749,8 +766,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (i != swap_ra->offset && - likely(!PageTransCompound(page))) { + if (i != ra_info.offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -761,23 +777,43 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - swap_ra->win == 1); + ra_info.win == 1); +} + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * It's a main entry function for swap readahead. By the configuration, + * it will read ahead blocks by cluster-based(ie, physical disk based) + * or vma-based(ie, virtual address based on faulty address) readahead. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, vmf) : + swap_cluster_readahead(entry, gfp_mask, vmf); } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); + return sprintf(buf, "%s\n", enable_vma_readahead ? 
"true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - swap_vma_readahead = true; + enable_vma_readahead = true; else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - swap_vma_readahead = false; + enable_vma_readahead = false; else return -EINVAL; diff --git a/mm/util.c b/mm/util.c index c1250501364f..029fc2f3b395 100644 --- a/mm/util.c +++ b/mm/util.c @@ -515,6 +515,16 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); +/* + * For file cache pages, return the address_space, otherwise return NULL + */ +struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { diff --git a/mm/vmscan.c b/mm/vmscan.c index cd5dc3faaa57..4390a8d5be41 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -442,16 +442,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (!down_read_trylock(&shrinker_rwsem)) { - /* - * If we would return 0, our callers would understand that we - * have nothing else to shrink and give up trying. By returning - * 1 we keep it going and assume we'll be able to shrink next - * time. - */ - freed = 1; + if (!down_read_trylock(&shrinker_rwsem)) goto out; - } list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { @@ -3547,16 +3539,21 @@ kswapd_try_sleep: } /* - * A zone is low on free memory, so wake its kswapd task to service it. + * A zone is low on free memory or too fragmented for high-order memory. If + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim + * has failed or is not needed, still wake up kcompactd if only compaction is + * needed. */ -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, + enum zone_type classzone_idx) { pg_data_t *pgdat; if (!managed_zone(zone)) return; - if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) + if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, @@ -3565,14 +3562,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) - return; - - if (pgdat_balanced(pgdat, order, classzone_idx)) + /* Hopeless node, leave it to direct reclaim if possible */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + pgdat_balanced(pgdat, order, classzone_idx)) { + /* + * There may be plenty of free memory available, but it's too + * fragmented for high-order allocations. Wake up kcompactd + * and rely on compaction_suitable() to determine if it's + * needed. If it fails, it will defer subsequent attempts to + * ratelimit its work. 
+ */ + if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) + wakeup_kcompactd(pgdat, order, classzone_idx); return; + } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3877,7 +3883,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ int page_evictable(struct page *page) { - return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + int ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SHMEM diff --git a/mm/z3fold.c b/mm/z3fold.c index d589d318727f..f579ad4a8100 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -620,24 +620,27 @@ lookup: bud = FIRST; } - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from the - * stale pages list. cancel_work_sync() can sleep so we must make - * sure it won't be called in case we're in atomic context. - */ - if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - if (can_sleep) + page = NULL; + if (can_sleep) { + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from + * the stale pages list. cancel_work_sync() can sleep so we + * limit this case to the contexts where we can sleep + */ + if (zhdr) { + list_del(&zhdr->buddy); + spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - page = alloc_page(gfp); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + } } + if (!page) + page = alloc_page(gfp); if (!page) return -ENOMEM; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b7f61cd1c709..61cb05dc950c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -193,6 +193,7 @@ static struct vfsmount *zsmalloc_mnt; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -642,18 +643,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) return 0; } - -static int zs_stats_size_open(struct inode *inode, struct file *file) -{ - return single_open(file, zs_stats_size_show, inode->i_private); -} - -static const struct file_operations zs_stat_size_ops = { - .open = zs_stats_size_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(zs_stats_size); static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { @@ -672,7 +662,7 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) pool->stat_dentry = entry; entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stat_size_ops); + pool->stat_dentry, pool, &zs_stats_size_fops); if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); @@ -861,6 +851,7 @@ static struct page *get_next_page(struct page *page) /** * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @obj: the encoded object value * @page: page object resides in zspage * @obj_idx: object index */ @@ -1311,6 +1302,7 @@ 
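/*
 * Editorial sketch, not part of the patch: a stand-alone model of the decision
 * made by the reworked wakeup_kswapd() in the mm/vmscan.c hunk above.  For a
 * hopeless or already-balanced node, kswapd is not woken; kcompactd is woken
 * instead only when the caller cannot fall back to direct reclaim/compaction.
 * MODEL_GFP_DIRECT_RECLAIM and the bool argument stand in for
 * __GFP_DIRECT_RECLAIM and the kswapd_failures/pgdat_balanced() checks.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_GFP_DIRECT_RECLAIM 0x1u

enum wake_target { WAKE_NONE, WAKE_KSWAPD, WAKE_KCOMPACTD };

static enum wake_target wakeup_decision(bool hopeless_or_balanced,
					unsigned int gfp_flags)
{
	if (hopeless_or_balanced) {
		if (!(gfp_flags & MODEL_GFP_DIRECT_RECLAIM))
			return WAKE_KCOMPACTD;
		return WAKE_NONE;	/* direct reclaim/compaction can run */
	}
	return WAKE_KSWAPD;
}

int main(void)
{
	printf("%d %d %d\n",
	       wakeup_decision(false, MODEL_GFP_DIRECT_RECLAIM),	/* kswapd */
	       wakeup_decision(true, 0),				/* kcompactd */
	       wakeup_decision(true, MODEL_GFP_DIRECT_RECLAIM));	/* none */
	return 0;
}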
EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc + * @mm: maping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using @@ -1418,6 +1410,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct size_class *class, struct zspage *zspage, unsigned long handle) { @@ -2375,6 +2386,27 @@ struct zs_pool *zs_create_pool(const char *name) objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * endup in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + + /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we |