diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2021-09-05 18:58:05 -0700 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2021-09-05 18:58:05 -0700 |
commit | 8be98d2f2a0a262f8bf8a0bc1fdf522b3c7aab17 (patch) | |
tree | a226b265d692d1933c0541802527d8aeb0d469ab /mm | |
parent | 818b26588994d9d95743fca0a427f08ec6c1c41d (diff) | |
parent | 3e204d6b76b29274cc8e57f8bd8d9873f04a7f48 (diff) |
Merge branch 'next' into for-linus
Prepare input updates for 5.15 merge window.
Diffstat (limited to 'mm')
93 files changed, 4671 insertions, 2782 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 24c045b24b95..02d44e3420f5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL choice prompt "Memory model" depends on SELECT_MEMORY_MODEL - default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL help @@ -149,6 +148,9 @@ config MEMORY_ISOLATION config HAVE_BOOTMEM_INFO_NODE def_bool n +config ARCH_ENABLE_MEMORY_HOTPLUG + bool + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -177,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE Say N here if you want the default policy to keep all hot-plugged memory blocks in 'offline' state. +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. @@ -274,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config HUGETLB_PAGE_SIZE_VARIABLE + def_bool n + help + Allows the pageblock_order value to be dynamic instead of just standard + HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available + on a platform. 
+ config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA @@ -283,12 +300,11 @@ config PHYS_ADDR_T_64BIT config BOUNCE bool "Enable bounce buffers" default y - depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + depends on BLOCK && MMU && HIGHMEM help - Enable bounce buffers for devices that cannot access - the full range of memory available to the CPU. Enabled - by default when ZONE_DMA or HIGHMEM is selected, but you - may say n to override this. + Enable bounce buffers for devices that cannot access the full range of + memory available to the CPU. Enabled by default when HIGHMEM is + selected, but you may say n to override this. config VIRT_TO_BUS bool @@ -513,6 +529,13 @@ config CMA_DEBUGFS help Turns on the DebugFS interface for CMA. +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA @@ -760,6 +783,9 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +config ARCH_HAS_CACHE_LINE_SIZE + bool + config ARCH_HAS_PTE_DEVMAP bool @@ -872,4 +898,7 @@ config MAPPING_DIRTY_HELPERS config KMAP_LOCAL bool +# struct io_mapping based helper. 
Selected by drivers that need them +config IO_MAPPING + bool endmenu diff --git a/mm/Makefile b/mm/Makefile index 72227b24a616..bf71e295e9f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o @@ -109,6 +112,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o @@ -120,3 +124,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_IO_MAPPING) += io-mapping.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 26de020aae7b..907fefde2572 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); /** * balloon_page_list_dequeue() - removes pages from balloon's page list and * returns a list of the pages. - * @b_dev_info: balloon device decriptor where we will grab a page from. 
+ * @b_dev_info: balloon device descriptor where we will grab a page from. * @pages: pointer to the list of pages that would be returned to the caller. * @n_req_pages: number of requested pages. * @@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue); /* * balloon_page_dequeue - removes a page from balloon's page list and returns * its address to allow the driver to release the page. - * @b_dev_info: balloon device decriptor where we will grab a page from. + * @b_dev_info: balloon device descriptor where we will grab a page from. * * Driver must call this function to properly dequeue a previously enqueued page * before definitively releasing it back to the guest system. @@ -24,7 +24,6 @@ #include <linux/memblock.h> #include <linux/err.h> #include <linux/mm.h> -#include <linux/mutex.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/log2.h> @@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned int count) + unsigned long count) { unsigned long bitmap_no, bitmap_count; + unsigned long flags; bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); - mutex_lock(&cma->lock); + spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); - mutex_unlock(&cma->lock); + spin_unlock_irqrestore(&cma->lock, flags); } static void __init cma_activate_area(struct cma *cma) @@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma) pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); - mutex_init(&cma->lock); + spin_lock_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); @@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma) unsigned long nr_part, nr_total = 0; unsigned long nbits = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); pr_info("number of available 
pages: "); for (;;) { next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); @@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma) start = next_zero_bit + nr_zero; } pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); } #else static inline void cma_debug_show_areas(struct cma *cma) { } @@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - bool no_warn) +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - size_t i; + unsigned long i; struct page *page = NULL; int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) - return NULL; + goto out; - pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, count, align); if (!count) - return NULL; + goto out; + + trace_cma_alloc_start(cma->name, count, align); mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - return NULL; + goto out; for (;;) { - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); break; } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); @@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * our 
exclusive use. If the migration fails we will take the * lock again and unmark it. */ - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, @@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } - trace_cma_alloc(pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align); /* * CMA can allocate multiple page blocks, which results in different @@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, cma->name, count, ret); + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } pr_debug("%s(): returned %p\n", __func__, page); +out: + if (page) { + count_vm_event(CMA_ALLOC_SUCCESS); + cma_sysfs_account_success_pages(cma, count); + } else { + count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } + return page; } @@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * It returns false when provided pages do not belong to contiguous area and * true otherwise. 
*/ -bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; - pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); @@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); - trace_cma_release(pfn, pages, count); + trace_cma_release(cma->name, pfn, pages, count); return true; } @@ -3,19 +3,33 @@ #define __MM_CMA_H__ #include <linux/debugfs.h> +#include <linux/kobject.h> + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; struct cma { unsigned long base_pfn; unsigned long count; unsigned long *bitmap; unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; + spinlock_t lock; #ifdef CONFIG_CMA_DEBUGFS struct hlist_head mem_head; spinlock_t mem_head_lock; struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; @@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) return cma->count >> cma->order_per_bit; } +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; 
+#endif #endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c index d5bf8aa34fdc..2e7704955f4f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val) struct cma *cma = data; unsigned long used; - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); /* pages counter is smaller than sizeof(int) */ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)used << cma->order_per_bit; return 0; @@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val) unsigned long start, end = 0; unsigned long bitmap_maxno = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= bitmap_maxno) @@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val) end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; return 0; diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 000000000000..eb2f39caff59 --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim <minchan@kernel.org> + */ + +#include <linux/cma.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static 
ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); diff --git a/mm/compaction.c b/mm/compaction.c index e04f4476e68e..84fde270ae74 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,15 +787,14 @@ static bool 
too_many_isolated(pg_data_t *pgdat) * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. - * Returns zero if there is a fatal signal pending, otherwise PFN of the - * first page that was not scanned (which may be both less, equal to or more - * than end_pfn). + * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, + * -ENOMEM in case we could not allocate a page, or 0. + * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), - * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field - * is neither read nor updated. + * and cc->nr_migratepages is updated accordingly. */ -static unsigned long +static int isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { @@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool skip_on_failure = false; unsigned long next_skip_pfn = 0; bool skip_updated = false; + int ret = 0; + + cc->migrate_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, while (unlikely(too_many_isolated(pgdat))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) - return 0; + return -EAGAIN; /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) - return 0; + return -EAGAIN; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return 0; + return -EINTR; } cond_resched(); @@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (fatal_signal_pending(current)) { cc->contended = true; + ret = -EINTR; - low_pfn = 0; goto fatal_pending; } @@ -904,6 +906,38 @@ 
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } + if (PageHuge(page) && cc->alloc_contig) { + ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + + /* + * Fail isolation in case isolate_or_dissolve_huge_page() + * reports an error. In case of -ENOMEM, abort right away. + */ + if (ret < 0) { + /* Do not report -EBUSY down the chain */ + if (ret == -EBUSY) + ret = 0; + low_pfn += (1UL << compound_order(page)) - 1; + goto isolate_fail; + } + + if (PageHuge(page)) { + /* + * Hugepage was successfully isolated and placed + * on the cc->migratepages list. + */ + low_pfn += compound_nr(page) - 1; + goto isolate_success_no_list; + } + + /* + * Ok, the hugepage was dissolved. Now these pages are + * Buddy and cannot be re-allocated because they are + * isolated. Fall-through as the check below handles + * Buddy pages. + */ + } + /* * Skip if free. We read page order here without zone lock * which is generally unsafe, but the race window is small and @@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); +isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); @@ -1063,7 +1098,7 @@ isolate_fail_put: put_page(page); isolate_fail: - if (!skip_on_failure) + if (!skip_on_failure && ret != -ENOMEM) continue; /* @@ -1089,6 +1124,9 @@ isolate_fail: */ next_skip_pfn += 1UL << cc->order; } + + if (ret == -ENOMEM) + break; } /* @@ -1130,7 +1168,9 @@ fatal_pending: if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); - return low_pfn; + cc->migrate_pfn = low_pfn; + + return ret; } /** @@ -1139,15 +1179,15 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns zero if isolation fails fatally due to e.g. pending signal. 
- * Otherwise, function returns one-past-the-last PFN of isolated page - * (which may be greater than end_pfn if end fell in a middle of a THP page). + * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM + * in case we could not allocate a page, or 0. */ -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, block_start_pfn, block_end_pfn; + int ret = 0; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; @@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, block_end_pfn, cc->zone)) continue; - pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, - ISOLATE_UNEVICTABLE); + ret = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); - if (!pfn) + if (ret) break; if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } - return pfn; + return ret; } #endif /* CONFIG_COMPACTION || CONFIG_CMA */ @@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) */ for (; block_end_pfn <= cc->free_pfn; fast_find_block = false, - low_pfn = block_end_pfn, + cc->migrate_pfn = low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* Perform the isolation */ - low_pfn = isolate_migratepages_block(cc, low_pfn, - block_end_pfn, isolate_mode); - - if (!low_pfn) + if (isolate_migratepages_block(cc, low_pfn, block_end_pfn, + isolate_mode)) return ISOLATE_ABORT; /* @@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) break; } - /* Record where migration scanner will be restarted. */ - cc->migrate_pfn = low_pfn; - return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; } @@ -1977,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) unsigned int wmark_low; /* - * Cap the low watermak to avoid excessive compaction - * activity in case a user sets the proactivess tunable + * Cap the low watermark to avoid excessive compaction + * activity in case a user sets the proactiveness tunable * close to 100 (maximum). */ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); @@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); - migrate_prep_local(); + /* lru_add_drain_all could be expensive with involving other CPUs */ + lru_add_drain(); while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; @@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, */ WRITE_ONCE(current->capture_control, NULL); *capture = READ_ONCE(capc.page); + /* + * Technically, it is also possible that compaction is skipped but + * the page is still captured out of luck(IRQ came and freed the page). + * Returning COMPACT_SUCCESS in such cases helps in properly accounting + * the COMPACT[STALL|FAIL] when compaction is skipped. + */ + if (*capture) + ret = COMPACT_SUCCESS; return ret; } @@ -2657,9 +2701,6 @@ static void compact_nodes(void) compact_node(nid); } -/* The written value is actually unused, all memory is compacted */ -int sysctl_compact_memory; - /* * Tunable for proactive compaction. 
It determines how * aggressively the kernel should compact memory in the @@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) */ static int kcompactd(void *p) { - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; unsigned int proactive_defer = 0; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a9bd6ce1ba02..297d1b349c19 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -192,7 +192,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ - vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + vaddr &= HPAGE_PMD_MASK; pgtable_trans_huge_deposit(mm, pmdp, pgtable); @@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!arch_ioremap_pmd_supported()) + if (!arch_vmap_pmd_supported(prot)) return; pr_debug("Validating PMD huge\n"); @@ -330,7 +330,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ - vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE; + vaddr &= HPAGE_PUD_MASK; set_pud_at(mm, vaddr, pudp, pud); pudp_set_wrprotect(mm, vaddr, pudp); @@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!arch_ioremap_pud_supported()) + if (!arch_vmap_pud_supported(prot)) return; pr_debug("Validating PUD huge\n"); diff --git a/mm/dmapool.c b/mm/dmapool.c index f3791532fef2..16483f86360e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -157,7 +157,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, if (!retval) return retval; - strlcpy(retval->name, name, sizeof(retval->name)); + strscpy(retval->name, name, sizeof(retval->name)); retval->dev = dev; diff --git a/mm/filemap.c b/mm/filemap.c index 
6ce832dc59e7..66f7e9fdfbc4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping, page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } mapping->nrpages -= nr; } @@ -629,13 +618,53 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - if (dax_mapping(mapping)) - return mapping->nrexceptional; - return mapping->nrpages; } /** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. 
+ */ +bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (!mapping_needs_writeback(mapping)) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; +} +EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); + +/** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts @@ -882,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page, if (xas_error(&xas)) goto unlock; - if (old) - mapping->nrexceptional--; mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ @@ -1433,6 +1460,67 @@ void unlock_page(struct page *page) EXPORT_SYMBOL(unlock_page); /** + * end_page_private_2 - Clear PG_private_2 and release any waiters + * @page: The page + * + * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for + * this. The page ref held for PG_private_2 being set is released. + * + * This is, for example, used when a netfs page is being written to a local + * disk cache, thereby allowing writes to the cache for the same page to be + * serialised. 
+ */ +void end_page_private_2(struct page *page) +{ + page = compound_head(page); + VM_BUG_ON_PAGE(!PagePrivate2(page), page); + clear_bit_unlock(PG_private_2, &page->flags); + wake_up_page_bit(page, PG_private_2); + put_page(page); +} +EXPORT_SYMBOL(end_page_private_2); + +/** + * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page. + */ +void wait_on_page_private_2(struct page *page) +{ + page = compound_head(page); + while (PagePrivate2(page)) + wait_on_page_bit(page, PG_private_2); +} +EXPORT_SYMBOL(wait_on_page_private_2); + +/** + * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a + * fatal signal is received by the calling task. + * + * Return: + * - 0 if successful. + * - -EINTR if a fatal signal was encountered. + */ +int wait_on_page_private_2_killable(struct page *page) +{ + int ret = 0; + + page = compound_head(page); + while (PagePrivate2(page)) { + ret = wait_on_page_bit_killable(page, PG_private_2); + if (ret < 0) + break; + } + + return ret; +} +EXPORT_SYMBOL(wait_on_page_private_2_killable); + +/** * end_page_writeback - end writeback against a page * @page: the page */ @@ -1663,7 +1751,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * @mapping: the address_space to search * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache slot at @mapping & @index. If there is a * page cache page, the head page is returned with an increased refcount. 
* * If the slot holds a shadow entry of a previously evicted page, or a @@ -2244,8 +2332,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, return error; if (PageUptodate(page)) return 0; - if (!page->mapping) /* page truncated */ - return AOP_TRUNCATED_PAGE; shrink_readahead_size_eio(&file->f_ra); return -EIO; } @@ -2577,8 +2663,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) return -EAGAIN; } else { retval = filemap_write_and_wait_range(mapping, @@ -2669,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page) * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * - * Return: The requested offset on successs, or -ENXIO if @whence specifies + * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. 
@@ -2778,7 +2864,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; - DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); + DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; @@ -2790,7 +2876,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra, ra->ra_pages); + page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } @@ -2876,7 +2962,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; pgoff_t max_off; @@ -2963,14 +3048,8 @@ page_not_uptodate: * because there really aren't any performance issues here * and we need to check for errors. 
*/ - ClearPageError(page); fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - error = -EIO; - } + error = filemap_read_page(file, mapping, page); if (fpin) goto out_retry; put_page(page); @@ -2978,7 +3057,6 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; out_retry: @@ -3189,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = { /* This is used for a general mmap of a disk file */ -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; @@ -3214,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } -int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } @@ -3646,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write); ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written = 0; ssize_t err; diff --git a/mm/frontswap.c b/mm/frontswap.c index 2183a56c7874..130e301c5ac0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -60,16 +60,20 @@ static u64 frontswap_succ_stores; static u64 frontswap_failed_stores; static u64 frontswap_invalidates; -static inline void inc_frontswap_loads(void) { +static inline void inc_frontswap_loads(void) +{ data_race(frontswap_loads++); } -static inline void 
inc_frontswap_succ_stores(void) { +static inline void inc_frontswap_succ_stores(void) +{ data_race(frontswap_succ_stores++); } -static inline void inc_frontswap_failed_stores(void) { +static inline void inc_frontswap_failed_stores(void) +{ data_race(frontswap_failed_stores++); } -static inline void inc_frontswap_invalidates(void) { +static inline void inc_frontswap_invalidates(void) +{ data_race(frontswap_invalidates++); } #else @@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, int orig_refs = refs; /* - * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast - * path, so fail and let the caller fall back to the slow path. + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. */ - if (unlikely(flags & FOLL_LONGTERM) && - is_migrate_cma_page(page)) + if (unlikely((flags & FOLL_LONGTERM) && + !is_pinnable_page(page))) return NULL; /* @@ -213,6 +214,58 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +static inline void compound_range_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *next, *page; + unsigned int nr = 1; + + if (i >= npages) + return; + + next = *list + i; + page = compound_head(next); + if (PageCompound(page) && compound_order(page) >= 1) + nr = min_t(unsigned int, + page + compound_nr(page) - next, npages - i); + + *head = page; + *ntails = nr; +} + +#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) + +static inline void compound_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *page; + unsigned int nr; + + if (i >= npages) + 
return; + + page = compound_head(list[i]); + for (nr = i + 1; nr < npages; nr++) { + if (compound_head(list[nr]) != page) + break; + } + + *head = page; + *ntails = nr - i; +} + +#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails))) + /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. @@ -239,20 +292,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long index; - - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. - */ + struct page *head; + unsigned int ntails; if (!make_dirty) { unpin_user_pages(pages, npages); return; } - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); + for_each_compound_head(index, pages, npages, head, ntails) { /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key @@ -273,14 +321,50 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ - if (!PageDirty(page)) - set_page_dirty_lock(page); - unpin_user_page(page); + if (!PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** + * unpin_user_page_range_dirty_lock() - release and optionally dirty + * gup-pinned page range + * + * @page: the starting page of a range maybe marked dirty, and definitely released. + * @npages: number of consecutive pages to release. 
+ * @make_dirty: whether to mark the pages dirty + * + * "gup-pinned page range" refers to a range of pages that has had one of the + * pin_user_pages() variants called on that page. + * + * For the page ranges defined by [page .. page+npages], make that range (or + * its head pages, if a compound page) dirty, if @make_dirty is true, and if the + * page range was previously listed as clean. + * + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), unpin_user_page(). + * + */ +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty) +{ + unsigned long index; + struct page *head; + unsigned int ntails; + + for_each_compound_range(index, &page, npages, head, ntails) { + if (make_dirty && !PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); + +/** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. @@ -292,6 +376,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; + struct page *head; + unsigned int ntails; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by @@ -300,13 +386,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages) */ if (WARN_ON(IS_ERR_VALUE(npages))) return; - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. 
- */ - for (index = 0; index < npages; index++) - unpin_user_page(pages[index]); + + for_each_compound_head(index, pages, npages, head, ntails) + put_compound_head(head, ntails, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_pages); @@ -435,18 +517,6 @@ retry: } } - if (flags & FOLL_SPLIT && PageTransCompound(page)) { - get_page(page); - pte_unmap_unlock(ptep, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return ERR_PTR(ret); - goto retry; - } - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); @@ -591,7 +661,7 @@ retry_locked: spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { + if (flags & FOLL_SPLIT_PMD) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -600,19 +670,7 @@ retry_locked: split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else if (flags & FOLL_SPLIT) { - if (unlikely(!try_get_page(page))) { - spin_unlock(ptl); - return ERR_PTR(-ENOMEM); - } - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - } else { /* flags & FOLL_SPLIT_PMD */ + } else { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; @@ -1470,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; unsigned long vm_flags; - int i; + long i; /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. 
@@ -1517,7 +1575,7 @@ finish_or_fault: * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. + * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ @@ -1535,120 +1593,96 @@ struct page *get_dump_page(unsigned long addr) FOLL_FORCE | FOLL_DUMP | FOLL_GET); if (locked) mmap_read_unlock(mm); - - if (ret == 1 && is_page_poisoned(page)) - return NULL; - return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ -#ifdef CONFIG_CMA -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +#ifdef CONFIG_MIGRATION +/* + * Check whether all pages are pinnable, if so return number of pages. If some + * pages are not pinnable, migrate them, and unpin all pages. Return zero if + * pages were migrated, or if some pages were not successfully isolated. + * Return negative error if migration fails. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages, + unsigned int gup_flags) { unsigned long i; - unsigned long step; + unsigned long isolation_error_count = 0; bool drain_allow = true; - bool migrate_allow = true; - LIST_HEAD(cma_page_list); - long ret = nr_pages; + LIST_HEAD(movable_page_list); + long ret = 0; + struct page *prev_head = NULL; + struct page *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN, + .gfp_mask = GFP_USER | __GFP_NOWARN, }; -check_again: - for (i = 0; i < nr_pages;) { - - struct page *head = compound_head(pages[i]); - - /* - * gup may start from a tail page. 
Advance step by the left - * part. - */ - step = compound_nr(head) - (pages[i] - head); + for (i = 0; i < nr_pages; i++) { + head = compound_head(pages[i]); + if (head == prev_head) + continue; + prev_head = head; /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. + * If we get a movable page, since we are going to be pinning + * these entries, try to move them out if possible. */ - if (is_migrate_cma_page(head)) { - if (PageHuge(head)) - isolate_huge_page(head, &cma_page_list); - else { + if (!is_pinnable_page(head)) { + if (PageHuge(head)) { + if (!isolate_huge_page(head, &movable_page_list)) + isolation_error_count++; + } else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; } - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, &cma_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; } + list_add_tail(&head->lru, &movable_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_lru(head), + thp_nr_pages(head)); } } - - i += step; } - if (!list_empty(&cma_page_list)) { - /* - * drop the above get_user_pages reference. - */ - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, nr_pages); - else - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - - if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { - /* - * some of the pages failed migration. Do get_user_pages - * without migration. - */ - migrate_allow = false; + /* + * If list is empty, and no isolation errors, means that all pages are + * in the correct zone. 
+ */ + if (list_empty(&movable_page_list) && !isolation_error_count) + return nr_pages; - if (!list_empty(&cma_page_list)) - putback_movable_pages(&cma_page_list); - } - /* - * We did migrate all the pages, Try to get the page references - * again migrating any new CMA pages which we failed to isolate - * earlier. - */ - ret = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, - gup_flags); - - if ((ret > 0) && migrate_allow) { - nr_pages = ret; - drain_allow = true; - goto check_again; - } + if (gup_flags & FOLL_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + } + if (!list_empty(&movable_page_list)) { + ret = migrate_pages(&movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN); + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); } - return ret; + return ret > 0 ? -ENOMEM : ret; } #else -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages, + unsigned int gup_flags) { return nr_pages; } -#endif /* CONFIG_CMA */ +#endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -1661,21 +1695,22 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long flags = 0; + unsigned int flags; long rc; - if (gup_flags & FOLL_LONGTERM) - flags = memalloc_nocma_save(); - - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, - gup_flags); + if (!(gup_flags & FOLL_LONGTERM)) + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + flags = memalloc_pin_save(); + do { + rc = __get_user_pages_locked(mm, start, nr_pages, pages, 
vmas, + NULL, gup_flags); + if (rc <= 0) + break; + rc = check_and_migrate_movable_pages(rc, pages, gup_flags); + } while (!rc); + memalloc_pin_restore(flags); - if (gup_flags & FOLL_LONGTERM) { - if (rc > 0) - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas, gup_flags); - memalloc_nocma_restore(flags); - } return rc; } diff --git a/mm/gup_test.c b/mm/gup_test.c index e3cf78e5873e..d974dec19e1c 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; } } break; @@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd, { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; - int nr; + long nr; struct page **pages; int ret = 0; bool needs_mmap_lock = @@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - /* Filter out most gup flags: only allow a tiny subset here: */ - gup->flags &= FOLL_WRITE; - switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags, + nr = get_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->flags, pages + i, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_FAST_BENCHMARK: - nr = pin_user_pages_fast(addr, nr, gup->flags, + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->flags, pages + i, + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, + gup->gup_flags | FOLL_LONGTERM, pages + i, NULL); break; case DUMP_USER_PAGES_TEST: - if (gup->flags & 
GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) - nr = pin_user_pages(addr, nr, gup->flags, + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); else - nr = get_user_pages(addr, nr, gup->flags, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; default: @@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd, start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages, gup->flags); + put_back_pages(cmd, pages, nr_pages, gup->test_flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); diff --git a/mm/gup_test.h b/mm/gup_test.h index 90a6713d50eb..887ac1d5f5bc 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -21,7 +21,8 @@ struct gup_test { __u64 addr; __u64 size; __u32 nr_pages_per_call; - __u32 flags; + __u32 gup_flags; + __u32 test_flags; /* * Each non-zero entry is the number of the page (1-based: first page is * page 1, so that zero entries mean "do nothing") from the .addr base. 
diff --git a/mm/highmem.c b/mm/highmem.c index 6ef8f5e05e7e..4fb51d735aa6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int __nr_free_highpages (void) +unsigned int __nr_free_highpages(void) { struct zone *zone; unsigned int pages = 0; @@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void) static int pkmap_count[LAST_PKMAP]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); -pte_t * pkmap_page_table; +pte_t *pkmap_page_table; /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr) if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); } @@ -278,9 +279,8 @@ void *kmap_high(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); unlock_kmap(); - return (void*) vaddr; + return (void *) vaddr; } - EXPORT_SYMBOL(kmap_high); #ifdef ARCH_NEEDS_KMAP_HIGH_GET @@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; } unlock_kmap_any(flags); - return (void*) vaddr; + return (void *) vaddr; } #endif @@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) /* * Disable migration so resulting virtual address is stable - * accross preemption. + * across preemption. 
*/ migrate_disable(); preempt_disable(); @@ -737,7 +737,6 @@ done: spin_unlock_irqrestore(&pas->lock, flags); return ret; } - EXPORT_SYMBOL(page_address); /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae907a9c2050..6d2a0119fc58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/highmem.h> @@ -61,6 +62,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +unsigned long huge_zero_pfn __read_mostly = ~0UL; bool transparent_hugepage_enabled(struct vm_area_struct *vma) { @@ -77,18 +79,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static struct page *get_huge_zero_page(void) +static bool get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return READ_ONCE(huge_zero_page); + return true; zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return NULL; + return false; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); @@ -97,11 +99,12 @@ retry: __free_pages(zero_page, compound_order(zero_page)); goto retry; } + WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); /* We take additional reference here. 
It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return READ_ONCE(huge_zero_page); + return true; } static void put_huge_zero_page(void) @@ -146,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); + WRITE_ONCE(huge_zero_pfn, ~0UL); __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -624,14 +628,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - vm_fault_t ret2; - spin_unlock(vmf->ptl); put_page(page); pte_free(vma->vm_mm, pgtable); - ret2 = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); - return ret2; + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -1293,7 +1295,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) } page = pmd_page(orig_pmd); - VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page), page); /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { @@ -1464,12 +1466,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == NUMA_NO_NODE) { - /* If the page was locked, there are no parallel migrations */ - if (page_locked) - goto clear_pmdnuma; - } - /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { page_nid = NUMA_NO_NODE; @@ -1478,6 +1474,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) spin_unlock(vmf->ptl); put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; + } else if (target_nid == NUMA_NO_NODE) { + /* There are no parallel 
migrations and page is in the right + * node. Clear the numa hinting info in this pmd. + */ + goto clear_pmdnuma; } /* @@ -1696,7 +1697,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -1794,8 +1795,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * Returns * - 0 if PMD could not be locked - * - 1 if PMD was locked but protections unchange and TLB flush unnecessary - * - HPAGE_PMD_NR is protections changed and TLB flush necessary + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) @@ -2046,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { - _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* * We are going to unmap this huge page. 
So * just go ahead and zap it @@ -2055,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (vma_is_special_huge(vma)) return; - page = pmd_page(_pmd); - if (!PageDirty(page) && pmd_dirty(_pmd)) - set_page_dirty(page); - if (!PageReferenced(page) && pmd_young(_pmd)) - SetPageReferenced(page); - page_remove_rmap(page, true); - put_page(page); + if (unlikely(is_pmd_migration_entry(old_pmd))) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(old_pmd); + page = migration_entry_to_page(entry); + } else { + page = pmd_page(old_pmd); + if (!PageDirty(page) && pmd_dirty(old_pmd)) + set_page_dirty(page); + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; - } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { + } + + if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside @@ -2104,7 +2114,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); @@ -2303,60 +2313,54 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, __split_huge_pmd(vma, pmd, address, freeze, page); } +static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) +{ + /* + * If the new address isn't hpage aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. 
+ */ + if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && + range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), + ALIGN(address, HPAGE_PMD_SIZE))) + split_huge_pmd_address(vma, address, false, NULL); +} + void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { - /* - * If the new start address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (start & ~HPAGE_PMD_MASK && - (start & HPAGE_PMD_MASK) >= vma->vm_start && - (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start, false, NULL); + /* Check if we need to split start first. */ + split_huge_pmd_if_needed(vma, start); - /* - * If the new end address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (end & ~HPAGE_PMD_MASK && - (end & HPAGE_PMD_MASK) >= vma->vm_start && - (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end, false, NULL); + /* Check if we need to split end next. */ + split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't hpage aligned and it could previously - * contain an hugepage: check if we need to split an huge pmd. + * If we're also updating the vma->vm_next->vm_start, + * check if we need to split it. 
*/ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; nstart += adjust_next; - if (nstart & ~HPAGE_PMD_MASK && - (nstart & HPAGE_PMD_MASK) >= next->vm_start && - (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart, false, NULL); + split_huge_pmd_if_needed(next, nstart); } } static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; - unmap_success = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(!unmap_success, page); + try_to_unmap(page, ttu_flags); + + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct page *page, unsigned int nr) @@ -2477,7 +2481,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { @@ -2667,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - int count, mapcount, extra_pins, ret; + int extra_pins, ret; pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@ -2726,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } unmap_page(head); - VM_BUG_ON_PAGE(compound_mapcount(head), head); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -2744,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ 
spin_lock(&ds_queue->split_queue_lock); - count = page_count(head); - mapcount = total_mapcount(head); - if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); @@ -2766,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __split_huge_page(page, list, end); ret = 0; } else { - if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); - } spin_unlock(&ds_queue->split_queue_lock); -fail: if (mapping) +fail: + if (mapping) xa_unlock(&mapping->i_pages); local_irq_enable(); remap_page(head, thp_nr_pages(head)); @@ -2838,8 +2832,8 @@ void deferred_split_huge_page(struct page *page) ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - memcg_set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -2924,16 +2918,14 @@ static struct shrinker deferred_split_shrinker = { }; #ifdef CONFIG_DEBUG_FS -static int split_huge_pages_set(void *data, u64 val) +static void split_huge_pages_all(void) { struct zone *zone; struct page *page; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; - if (val != 1) - return -EINVAL; - + pr_debug("Split all THPs\n"); for_each_populated_zone(zone) { max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { @@ -2957,15 +2949,243 @@ static int split_huge_pages_set(void *data, u64 val) unlock_page(page); next: put_page(page); + cond_resched(); } } - pr_info("%lu of %lu THP split\n", split, total); + pr_debug("%lu of %lu THP split\n", split, total); +} - return 0; +static 
inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); } -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, - "%llu\n"); + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. 
+ */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = find_vma(mm, addr); + unsigned int follflags; + struct page *page; + + if (!vma || addr < vma->vm_start) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); + + if (IS_ERR(page)) + continue; + if (!page) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_huge_page(compound_head(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; +} + +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + 
cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; +} + +#define MAX_INPUT_BUF_SZ 255 + +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + if (tok) { + strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = input_len; + + goto out; + } + + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; static int __init split_huge_pages_debugfs(void) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 
a86a58ef132d..5ba5a0da6d57 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> -#include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include "internal.h" @@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct 
hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv, resv->region_cache_count; /* At this point, we should have enough entries in the cache - * for all the existings adds_in_progress. We should only be + * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); @@ -553,7 +556,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } @@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* @@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, 
&h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) @@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. 
+ */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). 
- */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + 
spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock(&hugetlb_lock); } /* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. + * Must be called with the hugetlb lock held */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) -{ - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) +static void __prep_account_new_huge_page(struct hstate *h, int nid) { - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. 
- */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; - } - - __free_huge_page(page); + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, nid); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1577,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t __basepage_index(struct page *page) +pgoff_t hugetlb_basepage_index(struct page *page) { struct page *page_head = compound_head(page); pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (!PageHuge(page_head)) - return page_index(page); - if (compound_order(page_head) >= MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else @@ -1616,7 +1624,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + page = __alloc_pages(gfp_mask, order, nid, nmask); if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1693,17 +1701,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. 
- * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1711,23 +1722,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } - update_and_free_page(h, page); - ret = 1; + remove_hugetlb_page(h, page, acct_surplus); break; } } - return ret; + return page; } /* @@ -1749,7 +1751,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1758,7 +1760,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1767,7 +1768,7 @@ retry: * when it is dissolved. 
*/ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1789,15 +1790,14 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, head, false); h->max_huge_pages--; + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1839,16 +1839,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. 
* Double check that and simply deallocate the new page @@ -1858,7 +1858,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1867,7 +1867,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1917,17 +1917,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -1964,6 +1964,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -1975,7 +1976,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -1992,7 +1993,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. 
*/ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2032,12 +2033,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2049,17 +2050,17 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + lockdep_assert_held(&hugetlb_lock); + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2076,24 +2077,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. 
If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } @@ -2120,12 +2118,18 @@ out: * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. + * + * vma_del_reservation is used in error paths where an entry in the reserve + * map was created during huge page allocation and must be removed. It is to + * be called after calling vma_needs_reservation to determine if a reservation + * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, + VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -2169,33 +2173,42 @@ static long __vma_reservation_common(struct hstate *h, ret = region_del(resv, idx, idx + 1); } break; + case VMA_DEL_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } else { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } + break; default: BUG(); } - if (vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. 
- * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, @@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h, return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } +static long vma_del_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); +} + /* - * This routine is called to restore a reservation on error paths. 
In the - * specific error paths, a huge page was allocated (via alloc_huge_page) - * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set - * HPageRestoreReserve in the newly allocated page. When the page is freed - * via free_huge_page, the global reservation count will be incremented if - * HPageRestoreReserve is set. However, free_huge_page can not adjust the - * reserve map. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. + * This routine is called to restore reservation information on error paths. + * It should ONLY be called for pages allocated via alloc_huge_page(), and + * the hugetlb mutex should remain held when calling this routine. + * + * It handles two specific cases: + * 1) A reservation was in place and the page consumed the reservation. + * HPageRestoreReserve is set in the page. + * 2) No reservation was in place for the page, so HPageRestoreReserve is + * not set. However, alloc_huge_page always updates the reserve map. + * + * In case 1, free_huge_page later in the error path will increment the + * global reserve count. But, free_huge_page does not have enough context + * to adjust the reservation map. This case deals primarily with private + * mappings. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve map indicates there is a reservation present. + * + * In case 2, simply undo reserve map modifications done by alloc_huge_page. 
*/ -static void restore_reserve_on_error(struct hstate *h, - struct vm_area_struct *vma, unsigned long address, - struct page *page) +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page) { - if (unlikely(HPageRestoreReserve(page))) { - long rc = vma_needs_reservation(h, vma, address); + long rc = vma_needs_reservation(h, vma, address); - if (unlikely(rc < 0)) { + if (HPageRestoreReserve(page)) { + if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear HPageRestoreReserve so that @@ -2253,19 +2280,188 @@ static void restore_reserve_on_error(struct hstate *h, * accounting of reserve counts. */ ClearHPageRestoreReserve(page); - } else if (rc) { - rc = vma_add_reservation(h, vma, address); - if (unlikely(rc < 0)) + else if (rc) + (void)vma_add_reservation(h, vma, address); + else + vma_end_reservation(h, vma, address); + } else { + if (!rc) { + /* + * This indicates there is an entry in the reserve map + * added by alloc_huge_page. We know it was added + * before the alloc_huge_page call, otherwise + * HPageRestoreReserve would be set on the page. + * Remove the entry so that a subsequent allocation + * does not consume a reservation. + */ + rc = vma_del_reservation(h, vma, address); + if (rc < 0) + /* + * VERY rare out of memory condition. Since + * we can not delete the entry, set + * HPageRestoreReserve so that the reserve + * count will be incremented when the page + * is freed. This reserve will be consumed + * on a subsequent allocation. + */ + SetHPageRestoreReserve(page); + } else if (rc < 0) { + /* + * Rare out of memory condition from + * vma_needs_reservation call. Memory allocation is + * only attempted if a new entry is needed. Therefore, + * this implies there is not an entry in the + * reserve map. + * + * For shared mappings, no entry in the map indicates + * no reservation. We are done. 
+ */ + if (!(vma->vm_flags & VM_MAYSHARE)) /* - * See above comment about rare out of - * memory condition. + * For private mappings, no entry indicates + * a reservation is present. Since we can + * not add an entry, set SetHPageRestoreReserve + * on the page so reserve count will be + * incremented when freed. This reserve will + * be consumed on a subsequent allocation. */ - ClearHPageRestoreReserve(page); + SetHPageRestoreReserve(page); } else - vma_end_reservation(h, vma, address); + /* + * No reservation present, do nothing + */ + vma_end_reservation(h, vma, address); } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. 
+ */ + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +{ + struct hstate *h; + struct page *head; + int ret = -EBUSY; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. 
+ */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2316,7 +2512,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -2328,7 +2524,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). 
gbl_chg == 0 indicates @@ -2336,7 +2532,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2344,7 +2540,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2357,7 +2553,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2547,24 +2743,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; + remove_hugetlb_page(h, page, false); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2583,6 +2787,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { @@ -2610,6 +2815,8 
@@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2622,7 +2829,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; - spin_lock(&hugetlb_lock); + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2653,7 +2865,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2682,14 +2895,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. 
*/ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2716,18 +2929,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2882,9 +3107,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3215,6 +3440,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -3267,10 +3493,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. 
- * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; @@ -3470,9 +3696,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } out: return ret; @@ -3568,7 +3794,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) if (!delta) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -3607,7 +3833,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) return_unused_surplus_pages(h, (unsigned long) -delta); out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } @@ -3795,7 +4021,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc(dst, vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -3879,6 +4105,8 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { + restore_reserve_on_error(h, vma, addr, + new); put_page(new); /* dst_entry won't change as in child */ goto again; @@ -3898,6 +4126,7 @@ again: * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_pte_wrprotect(entry); } page_dup_rmap(ptepage, true); @@ -4310,6 +4539,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space 
*mapping, return 0; } +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long reason) +{ + vm_fault_t ret; + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, reason); + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + return ret; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -4348,35 +4615,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, retry: page = find_lock_page(mapping, idx); if (!page) { - /* - * Check for page in userfault range - */ + /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .flags = flags, - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; - - /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. 
- */ - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MISSING); goto out; } @@ -4395,13 +4638,10 @@ retry: * sure there really is no pte entry. */ ptl = huge_pte_lock(h, mm, ptep); - if (!huge_pte_none(huge_ptep_get(ptep))) { - ret = 0; - spin_unlock(ptl); - goto out; - } + ret = 0; + if (huge_pte_none(huge_ptep_get(ptep))) + ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); - ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -4435,6 +4675,16 @@ retry: VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MINOR); + goto out; + } } /* @@ -4563,7 +4813,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -4675,6 +4925,7 @@ out_mutex: return ret; } +#ifdef CONFIG_USERFAULTFD /* * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with * modifications for huge pages. 
@@ -4684,8 +4935,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct address_space *mapping; pgoff_t idx; unsigned long size; @@ -4695,12 +4948,31 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spinlock_t *ptl; int ret; struct page *page; + int writable; + + mapping = dst_vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + } else if (!*pagep) { + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } - if (!*pagep) { - ret = -ENOMEM; page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) + if (IS_ERR(page)) { + ret = -ENOMEM; goto out; + } ret = copy_huge_page_from_user(page, (const void __user *) src_addr, @@ -4725,13 +4997,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, */ __SetPageUptodate(page); - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - - /* - * If shared, add to page cache - */ - if (vm_shared) { + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) @@ -4776,8 +5043,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); - if (dst_vma->vm_flags & VM_WRITE) + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. 
*/ + if (is_continue && !vm_shared) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + if (writable) _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); @@ -4791,20 +5064,23 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - SetHPageMigratable(page); - if (vm_shared) + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); - if (vm_shared) + if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } +#endif /* CONFIG_USERFAULTFD */ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, int refs, struct page **pages, @@ -4996,14 +5272,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -/* - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can - * implement this. - */ -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { @@ -5280,6 +5548,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. 
*/ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); @@ -5326,6 +5597,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) return false; } +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + return vma_shareable(vma, addr); +} + /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -5338,8 +5618,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* - * vma need span at least one aligned PUD size and the start,end range - * must at least partialy within it. + * vma needs to span at least one aligned PUD size, and the range + * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) @@ -5370,9 +5650,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is * only required for subsequent processing. 
*/ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { - struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -5382,9 +5662,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) pte_t *pte; spinlock_t *ptl; - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) @@ -5448,9 +5725,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } -#define want_pmd_share() (1) + #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { return NULL; } @@ -5465,11 +5743,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } -#define want_pmd_share() (0) + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; @@ -5487,8 +5769,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); - if (want_pmd_share() && pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } @@ 
-5632,7 +5914,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { @@ -5642,16 +5924,31 @@ bool isolate_huge_page(struct page *page, struct list_head *list) ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + +int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + int ret = 0; + + *hugetlb = false; + spin_lock_irq(&hugetlb_lock); + if (PageHeadHuge(page)) { + *hugetlb = true; + if (HPageFreed(page) || HPageMigratable(page)) + ret = get_page_unless_zero(page); + } + spin_unlock_irq(&hugetlb_lock); return ret; } void putback_active_hugepage(struct page *page) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); } @@ -5679,13 +5976,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) SetHPageTemporary(oldpage); ClearHPageTemporary(newpage); - spin_lock(&hugetlb_lock); + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. + */ + if (new_nid == old_nid) + return; + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + } +} + +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. 
+ */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address, start, end; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + start = ALIGN(vma->vm_start, PUD_SIZE); + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return; + + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + unsigned long tmp = address; + + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + /* We don't want 'address' to be changed */ + huge_pmd_unshare(mm, vma, &tmp, ptep); + spin_unlock(ptl); } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/vm/mmu_notifier.rst. 
+ */ + mmu_notifier_invalidate_range_end(&range); } #ifdef CONFIG_CMA diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 603a131e262d..5383023d0cca 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) do { idx = 0; for_each_hstate(h) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(idx, h_cg, page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); idx++; } cond_resched(); @@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); @@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) set_hugetlb_cgroup(newhpage, h_cg); set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return; } diff --git a/mm/internal.h b/mm/internal.h index cb3c5e0a7799..e8fdb531f887 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb, void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); -void force_page_cache_ra(struct readahead_control *, struct file_ra_state *, - unsigned long nr); +void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { - DEFINE_READAHEAD(ractl, file, mapping, index); - force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); + 
DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); + force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, @@ -97,26 +96,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -/* - * When kernel touch the user page, the user page may be have been marked - * poison but still mapped in user space, if without this page, the kernel - * can guarantee the data integrity and operation success, the kernel is - * better to check the posion status and avoid touching it, be good not to - * panic, coredump for process fatal signal is a sample case matching this - * scenario. Or if kernel can't guarantee the data integrity, it's better - * not to call this function, let kernel touch the poison page and get to - * panic. - */ -static inline bool is_page_poisoned(struct page *page) -{ - if (PageHWPoison(page)) - return true; - else if (PageHuge(page) && PageHWPoison(compound_head(page))) - return true; - - return false; -} - extern unsigned long highest_memmap_pfn; /* @@ -146,10 +125,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. + * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed + * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. 
*/ @@ -245,7 +224,13 @@ struct compact_control { unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ + /* + * Acts as an in/out parameter to page isolation for migration. + * isolate_migratepages uses it as a search base. + * isolate_migratepages_block will update the value to the next pfn + * after the last isolated one. + */ + unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; @@ -281,7 +266,7 @@ struct capture_control { unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int find_suitable_fallback(struct free_area *area, unsigned int order, @@ -329,7 +314,7 @@ static inline bool is_exec_mapping(vm_flags_t flags) } /* - * Stack area - atomatically grows in one direction + * Stack area - automatically grows in one direction * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. @@ -399,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in @vma? + * At what user virtual address is page expected in vma? + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. 
*/ static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) +vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page); + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (PageHead(page) && + pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; } +/* + * Then at what user virtual address will none of the page be found in vma? + * Assumes that vma_address() already returned a good starting address. + * If page is a compound head, the entire compound page is considered. + */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_address_end(struct page *page, struct vm_area_struct *vma) { - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - - return max(start, vma->vm_start); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page) + compound_nr(page); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) 
*/ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, @@ -447,7 +457,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } - +static inline void vunmap_range_noflush(unsigned long start, unsigned long end) +{ +} #endif /* !CONFIG_MMU */ /* @@ -638,4 +650,21 @@ struct migration_target_control { gfp_t gfp_mask; }; +/* + * mm/vmalloc.c + */ +#ifdef CONFIG_MMU +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift); +#else +static inline +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + return -EINVAL; +} +#endif + +void vunmap_range_noflush(unsigned long start, unsigned long end); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 11c75fb07584..32e390c42c53 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, - vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, diff --git a/mm/io-mapping.c b/mm/io-mapping.c new file mode 100644 index 000000000000..01b362799930 --- /dev/null +++ b/mm/io-mapping.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/mm.h> +#include <linux/io-mapping.h> + +/** + * io_mapping_map_user - remap an I/O 
mapping to userspace + * @iomap: the source io_mapping + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size) +{ + vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) + return -EINVAL; + + /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ + return remap_pfn_range_notrack(vma, addr, pfn, size, + __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); +} +EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/ioremap.c b/mm/ioremap.c index 5fa1ab41d152..8ee0136f8cb0 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -16,237 +16,22 @@ #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static int __read_mostly ioremap_p4d_capable; -static int __read_mostly ioremap_pud_capable; -static int __read_mostly ioremap_pmd_capable; -static int __read_mostly ioremap_huge_disabled; +static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { - ioremap_huge_disabled = 1; + iomap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); - -void __init ioremap_huge_init(void) -{ - if (!ioremap_huge_disabled) { - if (arch_ioremap_p4d_supported()) - ioremap_p4d_capable = 1; - if (arch_ioremap_pud_supported()) - ioremap_pud_capable = 1; - if (arch_ioremap_pmd_supported()) - ioremap_pmd_capable = 1; - } -} - -static inline int ioremap_p4d_enabled(void) -{ - return ioremap_p4d_capable; -} - -static inline int ioremap_pud_enabled(void) -{ - return ioremap_pud_capable; -} - -static inline int ioremap_pmd_enabled(void) -{ - 
return ioremap_pmd_capable; -} - -#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static inline int ioremap_p4d_enabled(void) { return 0; } -static inline int ioremap_pud_enabled(void) { return 0; } -static inline int ioremap_pmd_enabled(void) { return 0; } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const unsigned int iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pte_t *pte; - u64 pfn; - - pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel_track(pmd, addr, mask); - if (!pte) - return -ENOMEM; - do { - BUG_ON(!pte_none(*pte)); - set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - *mask |= PGTBL_PTE_MODIFIED; - return 0; -} - -static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_pmd_enabled()) - return 0; - - if ((end - addr) != PMD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PMD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PMD_SIZE)) - return 0; - - if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) - return 0; - - return pmd_set_huge(pmd, phys_addr, prot); -} - -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc_track(&init_mm, pud, addr, mask); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { - *mask |= PGTBL_PMD_MODIFIED; - continue; - } - - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, - 
unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_pud_enabled()) - return 0; - - if ((end - addr) != PUD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PUD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PUD_SIZE)) - return 0; - - if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) - return 0; - - return pud_set_huge(pud, phys_addr, prot); -} - -static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pud_t *pud; - unsigned long next; - - pud = pud_alloc_track(&init_mm, p4d, addr, mask); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { - *mask |= PGTBL_PUD_MODIFIED; - continue; - } - - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_p4d_enabled()) - return 0; - - if ((end - addr) != P4D_SIZE) - return 0; - - if (!IS_ALIGNED(addr, P4D_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, P4D_SIZE)) - return 0; - - if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) - return 0; - - return p4d_set_huge(p4d, phys_addr, prot); -} - -static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); - if (!p4d) - return -ENOMEM; - do { - next = p4d_addr_end(addr, end); - - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { - *mask |= PGTBL_P4D_MODIFIED; - continue; - } - - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (p4d++, phys_addr += (next - addr), addr = next, 
addr != end); - return 0; -} - int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { - pgd_t *pgd; - unsigned long start; - unsigned long next; - int err; - pgtbl_mod_mask mask = 0; - - might_sleep(); - BUG_ON(addr >= end); - - start = addr; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, - &mask); - if (err) - break; - } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); - - flush_cache_vmap(start, end); - - if (mask & ARCH_PAGE_TABLE_SYNC_MASK) - arch_sync_kernel_mappings(start, end); - - return err; + return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); } #ifdef CONFIG_GENERIC_IOREMAP diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7b53291dafa1..6bb87f2acd4e 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -60,7 +60,7 @@ void kasan_disable_current(void) void __kasan_unpoison_range(const void *address, size_t size) { - kasan_unpoison(address, size); + kasan_unpoison(address, size, false); } #ifdef CONFIG_KASAN_STACK @@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task) { void *base = task_stack_page(task); - kasan_unpoison(base, THREAD_SIZE); + kasan_unpoison(base, THREAD_SIZE, false); } /* Unpoison the stack for the current task beyond a watermark sp value. 
*/ @@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison(base, watermark - base); + kasan_unpoison(base, watermark - base, false); } #endif /* CONFIG_KASAN_STACK */ @@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void) return 0; } -void __kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; @@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order); + kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } -void __kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE); + KASAN_FREE_PAGE, init); } /* @@ -251,18 +251,18 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison(object, cache->object_size); + kasan_unpoison(object, cache->object_size, false); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } /* @@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool ____kasan_slab_free(struct kmem_cache *cache, - void *object, unsigned long ip, bool quarantine) +static inline bool 
____kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine, bool init) { u8 tag; void *tagged_object; @@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE); + KASAN_KMALLOC_FREE, init); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, return kasan_quarantine_put(cache, object); } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool init) { - return ____kasan_slab_free(cache, object, ip, true); + return ____kasan_slab_free(cache, object, ip, true, init); } static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) @@ -407,9 +408,9 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) if (unlikely(!PageSlab(page))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); + kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false); + ____kasan_slab_free(page->slab_cache, ptr, ip, false, false); } } @@ -428,7 +429,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, } void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) + void *object, gfp_t flags, bool init) { u8 tag; void *tagged_object; @@ -453,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, * Unpoison the whole object. * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. */ - kasan_unpoison(tagged_object, cache->object_size); + kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ if (kasan_stack_collection_enabled()) @@ -496,7 +497,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); /* * Save alloc info (if possible) for kmalloc() allocations. @@ -546,7 +547,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + KASAN_PAGE_REDZONE, false); return (void *)ptr; } @@ -563,7 +564,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag * Part of it might already have been unpoisoned, but it's unknown * how big that part is. */ - kasan_unpoison(object, size); + kasan_unpoison(object, size, false); page = virt_to_head_page(object); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 2e55e0f82f39..53cbf28859b5 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -208,11 +208,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - kasan_unpoison(global->beg, global->size); + kasan_unpoison(global->beg, global->size, false); kasan_poison(global->beg + aligned_size, global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + KASAN_GLOBAL_REDZONE, false); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -292,11 +292,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), - size - rounded_down_size); + size - rounded_down_size, false); kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); + KASAN_ALLOCA_LEFT, false); 
kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + KASAN_ALLOCA_RIGHT, false); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -306,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top); + kasan_unpoison(stack_top, stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2aad21fda156..4004388b4e4b 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -25,6 +25,12 @@ enum kasan_arg { KASAN_ARG_ON, }; +enum kasan_arg_mode { + KASAN_ARG_MODE_DEFAULT, + KASAN_ARG_MODE_SYNC, + KASAN_ARG_MODE_ASYNC, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -38,6 +44,7 @@ enum kasan_arg_fault { }; static enum kasan_arg kasan_arg __ro_after_init; +static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; static enum kasan_arg_fault kasan_arg_fault __ro_after_init; @@ -45,6 +52,10 @@ static enum kasan_arg_fault kasan_arg_fault __ro_after_init; DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); +/* Whether the asynchronous mode is enabled. */ +bool kasan_flag_async __ro_after_init; +EXPORT_SYMBOL_GPL(kasan_flag_async); + /* Whether to collect alloc/free stack traces. 
*/ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); @@ -68,6 +79,23 @@ static int __init early_kasan_flag(char *arg) } early_param("kasan", early_kasan_flag); +/* kasan.mode=sync/async */ +static int __init early_kasan_mode(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "sync")) + kasan_arg_mode = KASAN_ARG_MODE_SYNC; + else if (!strcmp(arg, "async")) + kasan_arg_mode = KASAN_ARG_MODE_ASYNC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.mode", early_kasan_mode); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -115,7 +143,15 @@ void kasan_init_hw_tags_cpu(void) return; hw_init_tags(KASAN_TAG_MAX); - hw_enable_tagging(); + + /* + * Enable async mode only when explicitly requested through + * the command line. + */ + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else + hw_enable_tagging_sync(); } /* kasan_init_hw_tags() is called once on boot CPU. */ @@ -132,6 +168,22 @@ void __init kasan_init_hw_tags(void) /* Enable KASAN. */ static_branch_enable(&kasan_flag_enabled); + switch (kasan_arg_mode) { + case KASAN_ARG_MODE_DEFAULT: + /* + * Default to sync mode. + * Do nothing, kasan_flag_async keeps its default value. + */ + break; + case KASAN_ARG_MODE_SYNC: + /* Do nothing, kasan_flag_async keeps its default value. */ + break; + case KASAN_ARG_MODE_ASYNC: + /* Async mode enabled. */ + kasan_flag_async = true; + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default to enabling stack trace collection. 
*/ @@ -194,10 +246,16 @@ void kasan_set_tagging_report_once(bool state) } EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); -void kasan_enable_tagging(void) +void kasan_enable_tagging_sync(void) +{ + hw_enable_tagging_sync(); +} +EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); + +void kasan_force_async_fault(void) { - hw_enable_tagging(); + hw_force_async_tag_fault(); } -EXPORT_SYMBOL_GPL(kasan_enable_tagging); +EXPORT_SYMBOL_GPL(kasan_force_async_fault); #endif diff --git a/mm/kasan/init.c b/mm/kasan/init.c index c4605ac9837b..348f31d15a97 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -220,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, /** * kasan_populate_early_shadow - populate shadow memory region with * kasan_early_shadow_page - * @shadow_start - start of the memory range to populate - * @shadow_end - end of the memory range to populate + * @shadow_start: start of the memory range to populate + * @shadow_end: end of the memory range to populate */ int __ref kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3436c6bf7c0c..8f450bc28045 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,20 +7,37 @@ #include <linux/stackdepot.h> #ifdef CONFIG_KASAN_HW_TAGS + #include <linux/static_key.h> + DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +extern bool kasan_flag_async __ro_after_init; + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); } + +static inline bool kasan_async_mode_enabled(void) +{ + return kasan_flag_async; +} #else + static inline bool kasan_stack_collection_enabled(void) { return true; } + +static inline bool kasan_async_mode_enabled(void) +{ + return false; +} + #endif extern bool kasan_flag_panic __ro_after_init; +extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << 
KASAN_SHADOW_SCALE_SHIFT) @@ -38,9 +55,9 @@ extern bool kasan_flag_panic __ro_after_init; #define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ #ifdef CONFIG_KASAN_HW_TAGS -#define KASAN_TAG_MIN 0xF0 /* mimimum value for random tags */ +#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ #else -#define KASAN_TAG_MIN 0x00 /* mimimum value for random tags */ +#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ #endif #ifdef CONFIG_KASAN_GENERIC @@ -146,7 +163,7 @@ struct kasan_alloc_meta { struct kasan_track alloc_track; #ifdef CONFIG_KASAN_GENERIC /* - * call_rcu() call stack is stored into struct kasan_alloc_meta. + * The auxiliary stack is stored into struct kasan_alloc_meta. * The free stack is stored into struct kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; @@ -275,8 +292,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifdef CONFIG_KASAN_HW_TAGS -#ifndef arch_enable_tagging -#define arch_enable_tagging() +#ifndef arch_enable_tagging_sync +#define arch_enable_tagging_sync() +#endif +#ifndef arch_enable_tagging_async +#define arch_enable_tagging_async() #endif #ifndef arch_init_tags #define arch_init_tags(max_tag) @@ -284,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_set_tagging_report_once #define arch_set_tagging_report_once(state) #endif +#ifndef arch_force_async_tag_fault +#define arch_force_async_tag_fault() +#endif #ifndef arch_get_random_tag #define arch_get_random_tag() (0xFF) #endif @@ -291,19 +314,23 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define arch_get_mem_tag(addr) (0xFF) #endif #ifndef arch_set_mem_tag_range -#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr)) +#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr)) #endif -#define hw_enable_tagging() arch_enable_tagging() +#define hw_enable_tagging_sync() arch_enable_tagging_sync() +#define hw_enable_tagging_async() 
arch_enable_tagging_async() #define hw_init_tags(max_tag) arch_init_tags(max_tag) #define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) +#define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) -#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#define hw_set_mem_tag_range(addr, size, tag, init) \ + arch_set_mem_tag_range((addr), (size), (tag), (init)) #else /* CONFIG_KASAN_HW_TAGS */ -#define hw_enable_tagging() +#define hw_enable_tagging_sync() +#define hw_enable_tagging_async() #define hw_set_tagging_report_once(state) #endif /* CONFIG_KASAN_HW_TAGS */ @@ -311,12 +338,14 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state); -void kasan_enable_tagging(void); +void kasan_enable_tagging_sync(void); +void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ static inline void kasan_set_tagging_report_once(bool state) { } -static inline void kasan_enable_tagging(void) { } +static inline void kasan_enable_tagging_sync(void) { } +static inline void kasan_force_async_fault(void) { } #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ @@ -330,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *addr, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init) { addr = kasan_reset_tag(addr); @@ -343,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value) if (WARN_ON(size & KASAN_GRANULE_MASK)) return; - hw_set_mem_tag_range((void *)addr, size, value); + hw_set_mem_tag_range((void *)addr, size, value, init); } -static inline void kasan_unpoison(const 
void *addr, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -360,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size) return; size = round_up(size, KASAN_GRANULE_SIZE); - hw_set_mem_tag_range((void *)addr, size, tag); + hw_set_mem_tag_range((void *)addr, size, tag, init); } static inline bool kasan_byte_accessible(const void *addr) @@ -368,33 +397,34 @@ static inline bool kasan_byte_accessible(const void *addr) u8 ptr_tag = get_tag(addr); u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag != KASAN_TAG_INVALID) && - (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); + return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag; } #else /* CONFIG_KASAN_HW_TAGS */ /** - * kasan_poison - mark the memory range as unaccessible + * kasan_poison - mark the memory range as inaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range + * @init - whether to initialize the memory range (only for hardware tag-based) * * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. */ -void kasan_poison(const void *addr, size_t size, u8 value); +void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, can be unaligned + * @init - whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. * For the generic mode, the last granule of the memory range gets partially * unpoisoned based on the @size. 
*/ -void kasan_unpoison(const void *addr, size_t size); +void kasan_unpoison(const void *addr, size_t size, bool init); bool kasan_byte_accessible(const void *addr); @@ -404,7 +434,7 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as - * unaccessible + * inaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size * diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 728fb24c5683..d8ccff4c1275 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -27,7 +27,7 @@ /* Data structure and operations for quarantine queues. */ /* - * Each queue is a signle-linked list, which also stores the total size of + * Each queue is a single-linked list, which also stores the total size of * objects inside of it. */ struct qlist_head { @@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) local_irq_save(flags); /* - * As the object now gets freed from the quaratine, assume that its + * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. 
*/ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 87b271206163..14bd51ea2348 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -87,7 +87,8 @@ static void start_report(unsigned long *flags) static void end_report(unsigned long *flags, unsigned long addr) { - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); + if (!kasan_async_mode_enabled()) + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -360,6 +361,25 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags, (unsigned long)object); } +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) +{ + unsigned long flags; + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + + start_report(&flags); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous mode enabled: no access details available\n"); + pr_err("\n"); + dump_stack(); + end_report(&flags, 0); +} +#endif /* CONFIG_KASAN_HW_TAGS */ + static void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index de732bc341c5..139615ef326b 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, } /* Copy token (+ 1 byte for '\0'). */ - strlcpy(token, *frame_descr, tok_len + 1); + strscpy(token, *frame_descr, tok_len + 1); } /* Advance frame_descr past separator. 
*/ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 63f43443f5d7..082ee5b6d9a1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -void kasan_poison(const void *addr, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; @@ -106,7 +106,7 @@ void kasan_poison_last_granule(const void *addr, size_t size) } #endif -void kasan_unpoison(const void *addr, size_t size) +void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -129,7 +129,7 @@ void kasan_unpoison(const void *addr, size_t size) return; /* Unpoison all granules that cover the object. */ - kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false); /* Partially poison the last granule for the generic mode. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) @@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // rest of vmalloc process <data dependency> * STORE p, a LOAD shadow(x+99) * - * If there is no barrier between the end of unpoisioning the shadow + * If there is no barrier between the end of unpoisoning the shadow * and the store of the result to p, the stores could be committed * in a different order by CPU#0, and CPU#1 could erroneously observe * poison in the shadow. 
@@ -344,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -352,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison(start, size); + kasan_unpoison(start, size, false); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, @@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * How does this work? * ------------------- * - * We have a region that is page aligned, labelled as A. + * We have a region that is page aligned, labeled as A. * That might not map onto the shadow in a way that is page-aligned: * * start end diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 94c2d33be333..9df8e7f69e87 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); - u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); + void *untagged_addr = kasan_reset_tag(addr); + u8 shadow_byte; - return (shadow_byte != KASAN_TAG_INVALID) && - (tag == KASAN_TAG_KERNEL || tag == shadow_byte); + if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + return false; + + shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); + return tag == KASAN_TAG_KERNEL || tag == shadow_byte; } #define DEFINE_HWASAN_LOAD_STORE(size) \ @@ -159,7 +163,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison((void *)addr, size, tag); + kasan_poison((void *)addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d53c91f881a4..4d21ac44d5d3 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/bug.h> #include <linux/debugfs.h> +#include <linux/irq_work.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> #include <linux/kmemleak.h> @@ -19,6 +20,7 @@ #include <linux/moduleparam.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/sched/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { + memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } @@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init); /* === Allocation Gate Timer ================================================ */ +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + +static void wake_up_kfence_timer(struct irq_work *work) +{ + wake_up(&allocation_wait); +} +static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); +#endif + /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a @@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - /* Enable static key, and await allocation to happen. */ atomic_set(&kfence_allocation_gate, 0); #ifdef CONFIG_KFENCE_STATIC_KEYS + /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - /* - * Await an allocation. 
Timeout after 1 second, in case the kernel stops - * doing allocations, to avoid stalling this worker task for too long. - */ - { - unsigned long end_wait = jiffies + HZ; - - do { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&kfence_allocation_gate) != 0) - break; - schedule_timeout(1); - } while (time_before(jiffies, end_wait)); - __set_current_state(TASK_RUNNING); + + if (sysctl_hung_task_timeout_secs) { + /* + * During low activity with no allocations we might wait a + * while; let's avoid the hung task warning. + */ + wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); + } else { + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); } + /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif - schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, + msecs_to_jiffies(kfence_sample_interval)); } static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); @@ -654,7 +666,7 @@ void __init kfence_init(void) } WRITE_ONCE(kfence_enabled, true); - schedule_delayed_work(&kfence_timer, 0); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); @@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) */ if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* + * waitqueue_active() is fully ordered after the update of + * kfence_allocation_gate per atomic_inc_return(). + */ + if (waitqueue_active(&allocation_wait)) { + /* + * Calling wake_up() here may deadlock when allocations happen + * from within timer code. 
Use an irq_work to defer it. + */ + irq_work_queue(&wake_up_kfence_timer_work); + } +#endif if (!READ_ONCE(kfence_enabled)) return NULL; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index e3f71451ad9e..2a319c21c939 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r if (panic_on_warn) panic("panic_on_warn set ...\n"); - /* We encountered a memory unsafety error, taint the kernel! */ + /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7d6cb912b05..6c0185fdd815 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; @@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get - * an additinal pin on the page. + * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. 
The other process cannot write to @@ -716,17 +716,17 @@ next: if (pte_write(pteval)) writable = true; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { + + if (unlikely(!writable)) { result = SCAN_PAGE_RO; + } else if (unlikely(!referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; } - out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, @@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid) * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!node_reclaim_mode) + if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ @@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm, mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) - goto out; + goto out_up_write; /* check if the pmd is still valid */ if (mm_find_pmd(mm, address) != pmd) - goto out; + goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); result = SCAN_FAIL; - goto out; + goto out_up_write; } /* @@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm, __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); + /* + * spin_lock() below is not the equivalent of smp_wmb(), but + * the smp_wmb() inside __SetPageUptodate() can be reused to + * avoid the copy_huge_page writes to become visible after + * the set_pmd_at() write. 
+ */ __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. - */ - smp_wmb(); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); @@ -1216,8 +1215,6 @@ out_nolock: mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; -out: - goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small @@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) int i; if (!vma || !vma->vm_file || - vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; /* @@ -1533,16 +1526,16 @@ abort: goto drop_hpage; } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return 0; + return; if (!mmap_write_trylock(mm)) - return -EBUSY; + return; if (unlikely(khugepaged_test_exit(mm))) goto out; @@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); - return 0; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) @@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, BUILD_BUG(); } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) 
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { - return 0; } #endif @@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - lru_add_drain_all(); while (progress < pages) { diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fe6e3ae8e8c6..92a2d4885808 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1203,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object) } /* - * Memory scanning is a long process and it needs to be interruptable. This + * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) @@ -215,8 +215,6 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ -#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) - /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -461,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. 
We also do not want to enforce @@ -794,6 +792,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -817,8 +816,7 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct rmap_item **rmap_list) { while (*rmap_list) { struct rmap_item *rmap_item = *rmap_list; @@ -989,7 +987,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); @@ -1068,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make - * with the pagecount against the mapcount is racey and + * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check @@ -1438,7 +1436,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, */ *_stable_node = found; /* - * Just for robustneess as stable_node is + * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. @@ -1771,7 +1769,6 @@ chain_append: * stable_node_dup is the dup to replace. 
*/ if (stable_node_dup == stable_node) { - VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, @@ -1785,7 +1782,6 @@ chain_append: * of the current nid for this page * content. */ - VM_BUG_ON(!is_stable_node_chain(stable_node)); VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); @@ -2337,7 +2333,7 @@ next_mm: * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_list); + remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, @@ -2634,7 +2630,7 @@ again: vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & ~KSM_FLAG_MASK; + addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; diff --git a/mm/list_lru.c b/mm/list_lru.c index 6f067b6b935f..cd58790d0fb3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - memcg_set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, if (src->nr_items) { dst->nr_items += src->nr_items; - memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } diff --git a/mm/madvise.c b/mm/madvise.c index 01fef79ac761..63e489e5bfdb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -799,7 +799,7 @@ static long 
madvise_dontneed_free(struct vm_area_struct *vma, if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old - * vma was splitted while the mmap_lock was + * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an @@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior) * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed - * easily if memory pressure hanppens. + * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e064ac0d850a..64ada9e650a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -215,7 +215,7 @@ enum res_type { #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* Used for OOM nofiier */ +/* Used for OOM notifier */ #define OOM_CONTROL (0) /* @@ -255,10 +255,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages); -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages); +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -295,7 +293,7 @@ static void obj_cgroup_release(struct percpu_ref *ref) spin_lock_irqsave(&css_set_lock, flags); memcg = obj_cgroup_memcg(objcg); if (nr_pages) - __memcg_kmem_uncharge(memcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages); list_del(&objcg->list); mem_cgroup_put(memcg); 
spin_unlock_irqrestore(&css_set_lock, flags); @@ -402,129 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif -static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); - -static void memcg_free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) -{ - struct memcg_shrinker_map *new, *old; - int nid; - - lockdep_assert_held(&memcg_shrinker_map_mutex); - - for_each_node(nid) { - old = rcu_dereference_protected( - mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); - /* Not yet online memcg */ - if (!old) - return 0; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); - - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); - call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); - } - - return 0; -} - -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; - int nid; - - if (mem_cgroup_is_root(memcg)) - return; - - for_each_node(nid) { - pn = mem_cgroup_nodeinfo(memcg, nid); - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); - } -} - -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - struct memcg_shrinker_map *map; - int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - size = memcg_shrinker_map_size; - for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - memcg_free_shrinker_maps(memcg); - ret = -ENOMEM; - break; - } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); - } 
- mutex_unlock(&memcg_shrinker_map_mutex); - - return ret; -} - -int memcg_expand_shrinker_maps(int new_id) -{ - int size, old_size, ret = 0; - struct mem_cgroup *memcg; - - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; - if (size <= old_size) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - if (!root_mem_cgroup) - goto unlock; - - for_each_mem_cgroup(memcg) { - if (mem_cgroup_is_root(memcg)) - continue; - ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto unlock; - } - } -unlock: - if (!ret) - memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); - return ret; -} - -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; - - rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); - rcu_read_unlock(); - } -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -713,7 +588,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); + mz = memcg->nodeinfo[nid]; mctz = soft_limit_tree_node(nid); if (mctz) mem_cgroup_remove_exceeded(mz, mctz); @@ -764,28 +639,37 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) */ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - long x, threshold = MEMCG_CHARGE_BATCH; - if (mem_cgroup_disabled()) return; - if (memcg_stat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); +} - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - 
if (unlikely(abs(x) > threshold)) { - struct mem_cgroup *mi; +/* idx can be of type enum memcg_stat_item or node_stat_item. */ +static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->stat[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmstats[idx]); +/* idx can be of type enum memcg_stat_item or node_stat_item. */ +static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) +{ + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); +#ifdef CONFIG_SMP + if (x < 0) x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); +#endif + return x; } static struct mem_cgroup_per_node * @@ -796,7 +680,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) parent = parent_mem_cgroup(pn->memcg); if (!parent) return NULL; - return mem_cgroup_nodeinfo(parent, nid); + return parent->nodeinfo[nid]; } void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -855,18 +739,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ - struct mem_cgroup *memcg = page_memcg(head); + struct mem_cgroup *memcg; pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; + rcu_read_lock(); + memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. 
Update only the node */ if (!memcg) { + rcu_read_unlock(); __mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); + rcu_read_unlock(); } EXPORT_SYMBOL(__mod_lruvec_page_state); @@ -898,35 +786,21 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item - * @count: the number of events that occured + * @count: the number of events that occurred */ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - unsigned long x; - if (mem_cgroup_disabled()) return; - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - struct mem_cgroup *mi; - - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->events[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmevents[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->events[idx], x); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return atomic_long_read(&memcg->vmevents[event]); + return READ_ONCE(memcg->vmstats.events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -935,7 +809,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) int cpu; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[event], cpu); return x; } @@ -1030,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_lock(); do { /* - * Page cache insertions can happen withou an + * Page cache insertions can happen without an * 
actual mm context, e.g. during disk probing * on boot, loopback IO, acct() writes etc. */ @@ -1055,20 +929,6 @@ static __always_inline struct mem_cgroup *active_memcg(void) return current->active_memcg; } -static __always_inline struct mem_cgroup *get_active_memcg(void) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = active_memcg(); - /* remote memcg must hold a ref. */ - if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - - return memcg; -} - static __always_inline bool memcg_kmem_bypass(void) { /* Allow remote memcg charging from any context. */ @@ -1083,20 +943,6 @@ static __always_inline bool memcg_kmem_bypass(void) } /** - * If active memcg is set, do not fallback to current->mm->memcg. - */ -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) -{ - if (memcg_kmem_bypass()) - return NULL; - - if (unlikely(active_memcg())) - return get_active_memcg(); - - return get_mem_cgroup_from_mm(current->mm); -} - -/** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation @@ -1136,7 +982,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup_per_node *mz; - mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); + mz = root->nodeinfo[reclaim->pgdat->node_id]; iter = &mz->iter; if (prev && reclaim->generation != iter->generation) @@ -1238,7 +1084,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from, int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(from, nid); + mz = from->nodeinfo[nid]; iter = &mz->iter; cmpxchg(&iter->position, dead_memcg, NULL); } @@ -1571,6 +1417,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ + cgroup_rstat_flush(memcg->css.cgroup); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -1865,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct 
mem_cgroup *memcg) struct mem_cgroup *iter; /* - * Be careful about under_oom underflows becase a child memcg + * Be careful about under_oom underflows because a child memcg * could have been added after mem_cgroup_mark_under_oom. */ spin_lock(&memcg_oom_lock); @@ -2037,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle) /* * There is no guarantee that an OOM-lock contender * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. + * uncharges. Wake any sleepers explicitly. */ memcg_oom_recover(memcg); } @@ -2118,11 +1965,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) * This function protects unlocked LRU pages from being moved to * another cgroup. * - * It ensures lifetime of the returned memcg. Caller is responsible - * for the lifetime of the page; __unlock_page_memcg() is available - * when @page might get freed inside the locked section. + * It ensures lifetime of the locked memcg. Caller is responsible + * for the lifetime of the page. */ -struct mem_cgroup *lock_page_memcg(struct page *page) +void lock_page_memcg(struct page *page) { struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; @@ -2132,21 +1978,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page itself from being freed. E.g. writeback - * doesn't hold a page reference and relies on PG_writeback to - * keep off truncation, migration and so forth. 
*/ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page_memcg(head); if (unlikely(!memcg)) - return NULL; + return; #ifdef CONFIG_PROVE_LOCKING local_irq_save(flags); @@ -2155,7 +1995,7 @@ again: #endif if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page_memcg(head)) { @@ -2164,24 +2004,17 @@ again: } /* - * When charge migration first begins, we can have locked and - * unlocked page stat updates happening concurrently. Track - * the task who has the lock for unlock_page_memcg(). + * When charge migration first begins, we can have multiple + * critical sections holding the fast-path RCU lock and one + * holding the slowpath move_lock. Track the task who has the + * move_lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - - return memcg; } EXPORT_SYMBOL(lock_page_memcg); -/** - * __unlock_page_memcg - unlock and unpin a memcg - * @memcg: the memcg - * - * Unlock and unpin a memcg returned by lock_page_memcg(). 
- */ -void __unlock_page_memcg(struct mem_cgroup *memcg) +static void __unlock_page_memcg(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -2381,50 +2214,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static int memcg_hotplug_cpu_dead(unsigned int cpu) +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) { - struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg, *mi; - - stock = &per_cpu(memcg_stock, cpu); - drain_stock(stock); + int nid; - for_each_mem_cgroup(memcg) { + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + unsigned long stat[NR_VM_NODE_STAT_ITEMS]; + struct batched_lruvec_stat *lstatc; int i; - for (i = 0; i < MEMCG_NR_STAT; i++) { - int nid; - long x; - - x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &memcg->vmstats[i]); - - if (i >= NR_VM_NODE_STAT_ITEMS) - continue; + lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + stat[i] = lstatc->count[i]; + lstatc->count[i] = 0; + } - for_each_node(nid) { - struct mem_cgroup_per_node *pn; + do { + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + atomic_long_add(stat[i], &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } +} - pn = mem_cgroup_nodeinfo(memcg, nid); - x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) - do { - atomic_long_add(x, &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } - } +static int memcg_hotplug_cpu_dead(unsigned int cpu) +{ + struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - long x; + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); - x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 
- atomic_long_add(x, &memcg->vmevents[i]); - } - } + for_each_mem_cgroup(memcg) + memcg_flush_lruvec_page_state(memcg, cpu); return 0; } @@ -2793,9 +2615,6 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (gfp_mask & __GFP_NOFAIL) - goto force; - if (fatal_signal_pending(current)) goto force; @@ -2905,6 +2724,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) page->memcg_data = (unsigned long)memcg; } +static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); +retry: + memcg = obj_cgroup_memcg(objcg); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; + rcu_read_unlock(); + + return memcg; +} + #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) @@ -3056,23 +2889,45 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -/** - * __memcg_kmem_charge: charge a number of kernel pages to a memcg - * @memcg: memory cgroup to charge +/* + * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg + * @objcg: object cgroup to uncharge + * @nr_pages: number of pages to uncharge + */ +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages) +{ + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + refill_stock(memcg, nr_pages); + + css_put(&memcg->css); +} + +/* + * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg + * @objcg: object cgroup to charge * @gfp: reclaim mode * @nr_pages: number of pages to charge * * Returns 0 on success, an error code on failure. 
*/ -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; + struct mem_cgroup *memcg; int ret; + memcg = get_mem_cgroup_from_objcg(objcg); + ret = try_charge(memcg, gfp, nr_pages); if (ret) - return ret; + goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { @@ -3084,25 +2939,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, */ if (gfp & __GFP_NOFAIL) { page_counter_charge(&memcg->kmem, nr_pages); - return 0; + goto out; } cancel_charge(memcg, nr_pages); - return -ENOMEM; + ret = -ENOMEM; } - return 0; -} - -/** - * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg - * @memcg: memcg to uncharge - * @nr_pages: number of pages to uncharge - */ -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); +out: + css_put(&memcg->css); - refill_stock(memcg, nr_pages); + return ret; } /** @@ -3115,18 +2960,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page */ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; int ret = 0; - memcg = get_mem_cgroup_from_current(); - if (memcg && !mem_cgroup_is_root(memcg)) { - ret = __memcg_kmem_charge(memcg, gfp, 1 << order); + objcg = get_obj_cgroup_from_current(); + if (objcg) { + ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg | + page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - css_put(&memcg->css); + obj_cgroup_put(objcg); } return ret; } @@ -3138,16 +2983,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void 
__memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page_memcg(page); + struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!memcg) + if (!PageMemcgKmem(page)) return; - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - __memcg_kmem_uncharge(memcg, nr_pages); + objcg = __page_objcg(page); + obj_cgroup_uncharge_pages(objcg, nr_pages); page->memcg_data = 0; - css_put(&memcg->css); + obj_cgroup_put(objcg); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3180,11 +3025,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) { - rcu_read_lock(); - __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); - rcu_read_unlock(); - } + if (nr_pages) + obj_cgroup_uncharge_pages(old, nr_pages); /* * The leftover is flushed to the centralized per-memcg value. @@ -3242,7 +3084,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { - struct mem_cgroup *memcg; unsigned int nr_pages, nr_bytes; int ret; @@ -3259,24 +3100,16 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) * refill_obj_stock(), called from this function or * independently later. 
*/ - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(objcg); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - nr_pages = size >> PAGE_SHIFT; nr_bytes = size & (PAGE_SIZE - 1); if (nr_bytes) nr_pages += 1; - ret = __memcg_kmem_charge(memcg, gfp, nr_pages); + ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); if (!ret && nr_bytes) refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); - css_put(&memcg->css); return ret; } @@ -3300,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int nr) for (i = 1; i < nr; i++) head[i].memcg_data = head->memcg_data; - css_get_many(&memcg->css, nr - 1); + + if (PageMemcgKmem(head)) + obj_cgroup_get_many(__page_objcg(head), nr - 1); + else + css_get_many(&memcg->css, nr - 1); } #ifdef CONFIG_MEMCG_SWAP @@ -3549,6 +3386,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { + cgroup_rstat_flush(memcg->css.cgroup); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3613,57 +3451,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) -{ - unsigned long stat[MEMCG_NR_STAT] = {0}; - struct mem_cgroup *mi; - int node, cpu, i; - - for_each_online_cpu(cpu) - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < MEMCG_NR_STAT; i++) - atomic_long_add(stat[i], &mi->vmstats[i]); - - for_each_node(node) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - struct mem_cgroup_per_node *pi; - - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] = 0; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] += per_cpu( - pn->lruvec_stat_cpu->count[i], cpu); - - for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 
- atomic_long_add(stat[i], &pi->lruvec_stat[i]); - } -} - -static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) -{ - unsigned long events[NR_VM_EVENT_ITEMS]; - struct mem_cgroup *mi; - int cpu, i; - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] = 0; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += per_cpu(memcg->vmstats_percpu->events[i], - cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - atomic_long_add(events[i], &mi->vmevents[i]); -} - #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3980,6 +3767,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, @@ -4050,6 +3839,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4108,7 +3899,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; @@ -4137,7 +3928,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > 100) return -EINVAL; - if (css->parent) + if (!mem_cgroup_is_root(memcg)) memcg->swappiness = val; else vm_swappiness = val; @@ -4487,7 +4278,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!css->parent || !((val == 
0) || (val == 1))) + if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL; memcg->oom_kill_disable = val; @@ -4526,22 +4317,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page(). - */ -static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); - int cpu; - - for_each_online_cpu(cpu) - x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; - if (x < 0) - x = 0; - return x; -} - /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -4567,13 +4342,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); + cgroup_rstat_flush_irqsafe(memcg->css.cgroup); - *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + - memcg_exact_page_state(memcg, NR_ACTIVE_FILE); - *pheadroom = PAGE_COUNTER_MAX; + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + + memcg_page_state(memcg, NR_ACTIVE_FILE); + *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), READ_ONCE(memcg->memory.high)); @@ -4588,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * Foreign dirty flushing * * There's an inherent mismatch between memcg and writeback. The former - * trackes ownership per-page while the latter per-inode. This was a + * tracks ownership per-page while the latter per-inode. 
This was a * deliberate design decision because honoring per-page ownership in the * writeback path is complicated, may lead to higher CPU and IO overheads * and deemed unnecessary given that write-sharing an inode across @@ -4603,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * triggering background writeback. A will be slowed down without a way to * make writeback of the dirty pages happen. * - * Conditions like the above can lead to a cgroup getting repatedly and + * Conditions like the above can lead to a cgroup getting repeatedly and * severely throttled after making some progress after each - * dirty_expire_interval while the underyling IO device is almost + * dirty_expire_interval while the underlying IO device is almost * completely idle. * * Solving this problem completely requires matching the ownership tracking @@ -5205,19 +4981,20 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); - free_percpu(memcg->vmstats_local); kfree(memcg); } static void mem_cgroup_free(struct mem_cgroup *memcg) { + int cpu; + memcg_wb_domain_exit(memcg); /* - * Flush percpu vmstats and vmevents to guarantee the value correctness - * on parent's and all ancestor levels. + * Flush percpu lruvec stats to guarantee the value + * correctness on parent's and all ancestor levels. 
*/ - memcg_flush_percpu_vmstats(memcg); - memcg_flush_percpu_vmevents(memcg); + for_each_online_cpu(cpu) + memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5244,11 +5021,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } - memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, - GFP_KERNEL_ACCOUNT); - if (!memcg->vmstats_local) - goto fail; - memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5346,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for memcg_expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (memcg_alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_info(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5382,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); drain_all_stock(memcg); @@ -5414,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - memcg_free_shrinker_maps(memcg); + free_shrinker_info(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } @@ -5448,6 +5221,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct memcg_vmstats_percpu *statc; + long delta, v; + int i; + + statc = 
per_cpu_ptr(memcg->vmstats_percpu, cpu); + + for (i = 0; i < MEMCG_NR_STAT; i++) { + /* + * Collect the aggregated propagation counts of groups + * below us. We're in a per-cpu loop here and this is + * a global counter, so the first cycle will get them. + */ + delta = memcg->vmstats.state_pending[i]; + if (delta) + memcg->vmstats.state_pending[i] = 0; + + /* Add CPU changes on this level since the last flush */ + v = READ_ONCE(statc->state[i]); + if (v != statc->state_prev[i]) { + delta += v - statc->state_prev[i]; + statc->state_prev[i] = v; + } + + if (!delta) + continue; + + /* Aggregate counts on this level and propagate upwards */ + memcg->vmstats.state[i] += delta; + if (parent) + parent->vmstats.state_pending[i] += delta; + } + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + delta = memcg->vmstats.events_pending[i]; + if (delta) + memcg->vmstats.events_pending[i] = 0; + + v = READ_ONCE(statc->events[i]); + if (v != statc->events_prev[i]) { + delta += v - statc->events_prev[i]; + statc->events_prev[i] = v; + } + + if (!delta) + continue; + + memcg->vmstats.events[i] += delta; + if (parent) + parent->vmstats.events_pending[i] += delta; + } +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ static int mem_cgroup_do_precharge(unsigned long count) @@ -5945,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) return 0; /* - * We are now commited to this value whatever it is. Changes in this + * We are now committed to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. 
*/ @@ -6501,6 +6330,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, @@ -6683,6 +6513,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } +static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, + gfp_t gfp) +{ + unsigned int nr_pages = thp_nr_pages(page); + int ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) + goto out; + + css_get(&memcg->css); + commit_charge(page, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); + local_irq_enable(); +out: + return ret; +} + /** * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge @@ -6692,55 +6543,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * + * Do not use this for pages allocated for swapin. + * * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - unsigned int nr_pages = thp_nr_pages(page); - struct mem_cgroup *memcg = NULL; - int ret = 0; + struct mem_cgroup *memcg; + int ret; if (mem_cgroup_disabled()) - goto out; + return 0; - if (PageSwapCache(page)) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id; + memcg = get_mem_cgroup_from_mm(mm); + ret = __mem_cgroup_charge(page, memcg, gfp_mask); + css_put(&memcg->css); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. 
page and memcg binding is - * protected by the page lock, which serializes swap cache - * removal, which in turn serializes uncharging. - */ - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page_memcg(compound_head(page))) - goto out; + return ret; +} - id = lookup_swap_cgroup_id(ent); - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } +/** + * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin + * @page: page to charge + * @mm: mm context of the victim + * @gfp: reclaim mode + * @entry: swap entry for which the page is allocated + * + * This function charges a page allocated for swapin. Please call this before + * adding the page to the swapcache. + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + int ret; - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); + if (mem_cgroup_disabled()) + return 0; - ret = try_charge(memcg, gfp_mask, nr_pages); - if (ret) - goto out_put; + id = lookup_swap_cgroup_id(entry); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = get_mem_cgroup_from_mm(mm); + rcu_read_unlock(); - css_get(&memcg->css); - commit_charge(page, memcg); + ret = __mem_cgroup_charge(page, memcg, gfp); - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); - local_irq_enable(); + css_put(&memcg->css); + return ret; +} +/* + * mem_cgroup_swapin_uncharge_swap - uncharge swap slot + * @entry: swap entry for which the page is charged + * + * Call this function after successfully adding the charged page to swapcache. + * + * Note: This function assumes the page for which swap slot is being uncharged + * is order 0 page. 
+ */ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -6753,25 +6620,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account() && PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; + if (!mem_cgroup_disabled() && do_memsw_account()) { /* * The swap entry might not get freed for a long time, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(entry, 1); } - -out_put: - css_put(&memcg->css); -out: - return ret; } struct uncharge_gather { struct mem_cgroup *memcg; - unsigned long nr_pages; + unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; struct page *dummy_page; @@ -6786,10 +6647,10 @@ static void uncharge_batch(const struct uncharge_gather *ug) { unsigned long flags; - if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); + if (ug->nr_memory) { + page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); @@ -6797,7 +6658,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); memcg_check_events(ug->memcg, 
ug->dummy_page); local_irq_restore(flags); @@ -6808,40 +6669,60 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page_memcg(page)) - return; - /* * Nobody should be changing or seriously looking at - * page_memcg(page) at this point, we have fully + * page memcg or objcg at this point, we have fully * exclusive access to the page. */ + if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. + */ + memcg = get_mem_cgroup_from_objcg(objcg); + } else { + memcg = __page_memcg(page); + } - if (ug->memcg != page_memcg(page)) { + if (!memcg) + return; + + if (ug->memcg != memcg) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page_memcg(page); + ug->memcg = memcg; + ug->dummy_page = page; /* pairs with css_put in uncharge_batch */ - css_get(&ug->memcg->css); + css_get(&memcg->css); } nr_pages = compound_nr(page); - ug->nr_pages += nr_pages; - if (PageMemcgKmem(page)) + if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - else + + page->memcg_data = 0; + obj_cgroup_put(objcg); + } else { + /* LRU pages aren't accounted at the root level */ + if (!mem_cgroup_is_root(memcg)) + ug->nr_memory += nr_pages; ug->pgpgout++; - ug->dummy_page = page; - page->memcg_data = 0; - css_put(&ug->memcg->css); + page->memcg_data = 0; + } + + css_put(&memcg->css); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 24210c9bd843..6f5f78885ab4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) /* * We could fail to take 
off the target page from buddy - * for example due to racy page allocaiton, but that's + * for example due to racy page allocation, but that's * acceptable because soft-offlined page is not broken * and if someone really want to use it, they should * take it. @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. 
*/ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +877,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -949,6 +963,17 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 
0 : -EBUSY; } +/* + * Return true if a page type of a given page is supported by hwpoison + * mechanism (while handling could fail), otherwise false. This function + * does not return true for hugetlb or device memory pages, so it's assumed + * to be called only in the context where we never have such pages. + */ +static inline bool HWPoisonHandlable(struct page *page) +{ + return PageLRU(page) || __PageMovable(page); +} + /** * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) @@ -959,8 +984,22 @@ static int page_action(struct page_state *ps, struct page *p, static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * This check prevents from calling get_hwpoison_unless_zero() + * for any unsupported type of page in order to reduce the risk of + * unexpected races caused by taking a page refcount. + */ + if (!HWPoisonHandlable(head)) + return 0; - if (!PageHuge(head) && PageTransHuge(head)) { + if (PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. 
@@ -1017,7 +1056,7 @@ try_again: ret = -EIO; } } else { - if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p)) { ret = 1; } else { /* @@ -1228,7 +1267,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + return -EHWPOISON; } num_poisoned_pages_inc(); @@ -1288,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1368,7 +1407,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * communicated in siginfo, see kill_proc() */ start = (page->index << PAGE_SHIFT) & ~(size - 1); - unmap_mapping_range(page->mapping, start, start + size, 0); + unmap_mapping_range(page->mapping, start, size, 0); } kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); rc = 0; @@ -1404,9 +1443,10 @@ int memory_failure(unsigned long pfn, int flags) struct page *hpage; struct page *orig_head; struct dev_pagemap *pgmap; - int res; + int res = 0; unsigned long page_flags; bool retry = true; + static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1424,13 +1464,19 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } + mutex_lock(&mf_mutex); + try_again: - if (PageHuge(p)) - return memory_failure_hugetlb(pfn, flags); + if (PageHuge(p)) { + res = memory_failure_hugetlb(pfn, flags); + goto unlock_mutex; + } + if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + res = -EHWPOISON; + goto unlock_mutex; } orig_head = hpage = compound_head(p); @@ -1463,17 +1509,19 @@ try_again: res = MF_FAILED; } action_result(pfn, MF_MSG_BUDDY, res); - return res == MF_RECOVERED ? 
0 : -EBUSY; + res = res == MF_RECOVERED ? 0 : -EBUSY; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - return -EBUSY; + res = -EBUSY; } + goto unlock_mutex; } if (PageTransHuge(hpage)) { if (try_to_split_thp_page(p, "Memory Failure") < 0) { action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - return -EBUSY; + res = -EBUSY; + goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); } @@ -1497,7 +1545,7 @@ try_again: if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1517,17 +1565,22 @@ try_again: num_poisoned_pages_dec(); unlock_page(p); put_page(p); - return 0; + goto unlock_mutex; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_page(p); - return 0; + goto unlock_mutex; } - if (!PageTransTail(p) && !PageLRU(p)) + /* + * __munlock_pagevec may clear a writeback page's LRU flag without + * page_lock. We need wait writeback completion for this page or it + * may trigger vfs BUG while evict inode. 
+ */ + if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* @@ -1543,7 +1596,7 @@ try_again: if (!hwpoison_user_mappings(p, pfn, flags, &p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1552,13 +1605,17 @@ try_again: if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } identify_page_state: res = identify_page_state(pfn, p, page_flags); -out: + mutex_unlock(&mf_mutex); + return res; +unlock_page: unlock_page(p); +unlock_mutex: + mutex_unlock(&mf_mutex); return res; } EXPORT_SYMBOL_GPL(memory_failure); diff --git a/mm/memory.c b/mm/memory.c index 550405fc3b5e..486f4a2874e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ + } else if (details && details->single_page && + PageTransCompound(details->single_page) && + next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + /* + * Take and drop THP pmd lock so that we cannot return + * prematurely, while zap_huge_pmd() has cleared *pmd, + * but not yet decremented compound_mapcount(). + */ + spin_unlock(ptl); } + /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -2260,26 +2271,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. 
- * - * Return: %0 on success, negative error code otherwise. +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; - unsigned long remap_pfn = pfn; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) @@ -2309,10 +2311,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); - if (err) - return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); @@ -2324,12 +2322,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) - untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); + return -EINVAL; + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -2446,13 +2468,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, } do { next = pmd_addr_end(addr, end); - if (create || !pmd_none_or_clear_bad(pmd)) { - err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create, mask); - if (err) - break; + if (pmd_none(*pmd) && !create) + continue; + if (WARN_ON_ONCE(pmd_leaf(*pmd))) + return -EINVAL; + if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { + if (!create) + continue; + pmd_clear_bad(pmd); } + err = apply_to_pte_range(mm, pmd, addr, next, + fn, data, create, mask); + if (err) + break; } while (pmd++, addr = next, addr != end); + return err; } @@ -2474,13 +2504,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, } do { next = pud_addr_end(addr, end); - if (create || !pud_none_or_clear_bad(pud)) { - err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create, mask); - if (err) - break; + if (pud_none(*pud) && !create) + continue; + if (WARN_ON_ONCE(pud_leaf(*pud))) + return -EINVAL; + if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { + if (!create) + continue; + pud_clear_bad(pud); } + err = apply_to_pmd_range(mm, pud, addr, next, + fn, data, create, mask); + if (err) + break; } while (pud++, addr = next, addr != end); + return err; } @@ -2502,13 +2540,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, } do { next = p4d_addr_end(addr, end); - if (create || !p4d_none_or_clear_bad(p4d)) { - err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create, mask); - if (err) - break; + if (p4d_none(*p4d) && 
!create) + continue; + if (WARN_ON_ONCE(p4d_leaf(*p4d))) + return -EINVAL; + if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { + if (!create) + continue; + p4d_clear_bad(p4d); } + err = apply_to_pud_range(mm, p4d, addr, next, + fn, data, create, mask); + if (err) + break; } while (p4d++, addr = next, addr != end); + return err; } @@ -2528,9 +2574,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - if (!create && pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd) && !create) continue; - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } + err = apply_to_p4d_range(mm, pgd, addr, next, + fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); @@ -2896,6 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3193,6 +3248,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** + * unmap_mapping_page() - Unmap single page from processes. + * @page: The locked page to be unmapped. + * + * Unmap this page from any userspace process which still has it mmaped. + * Typically, for efficiency, the range of nearby pages has already been + * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once + * truncation or invalidation holds the lock on a page, it may find that + * the page has been remapped again: and then uses unmap_mapping_page() + * to unmap it finally. 
+ */ +void unmap_mapping_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct zap_details details = { }; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageTail(page)); + + details.check_mapping = mapping; + details.first_index = page->index; + details.last_index = page->index + thp_nr_pages(page) - 1; + details.single_page = page; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. @@ -3296,7 +3381,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } - delayacct_set_flag(DELAYACCT_PF_SWAPIN); + delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3309,28 +3394,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { - int err; - __SetPageLocked(page); __SetPageSwapBacked(page); - set_page_private(page, entry.val); - /* Tell memcg to use swap ownership records */ - SetPageSwapCache(page); - err = mem_cgroup_charge(page, vma->vm_mm, - GFP_KERNEL); - ClearPageSwapCache(page); - if (err) { + if (mem_cgroup_swapin_charge_page(page, + vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; goto out_page; } + mem_cgroup_swapin_uncharge_swap(entry); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(page, shadow); lru_cache_add(page); + + /* To provide entry to swap_readpage() */ + set_page_private(page, entry.val); swap_readpage(page, true); + set_page_private(page, 0); } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, @@ -3347,7 +3430,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = 
VM_FAULT_OOM; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3361,13 +3444,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; @@ -3561,6 +3644,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3686,7 +3770,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) return ret; /* - * Archs like ppc64 need additonal space to store information + * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { @@ -3745,6 +3829,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -4100,7 +4186,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; - bool migrated = false; pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -4117,29 +4202,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out; } - /* - * Make it present again, Depending on how arch implementes non - * accessible ptes, some can allow access by kernel mode. 
- */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + /* Get the normal PTE */ + old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (was_writable) - pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); - if (!page) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (!page) + goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (PageCompound(page)) + goto out_map; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as @@ -4149,7 +4222,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!pte_write(pte)) + if (!was_writable) flags |= TNF_NO_GROUP; /* @@ -4163,24 +4236,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == NUMA_NO_NODE) { put_page(page); - goto out; + goto out_map; } + pte_unmap_unlock(vmf->pte, vmf->ptl); /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) { + if (migrate_misplaced_page(page, vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; - } else + } else { flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; + } + goto out_map; + } out: if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; +out_map: + /* + * Make it present again, depending on how arch 
implements + * non-accessible ptes, some can allow access by kernel mode. + */ + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) @@ -4454,7 +4548,7 @@ retry_pud: } /** - * mm_account_fault - Do page fault accountings + * mm_account_fault - Do page fault accounting * * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to @@ -4463,9 +4557,9 @@ retry_pud: * @flags: the fault flags. * @ret: the fault retcode. * - * This will take care of most of the page fault accountings. Meanwhile, it + * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter - * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should + * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. 
*/ static inline void mm_account_fault(struct pt_regs *regs, @@ -4799,7 +4893,7 @@ out: /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access - * @addr: userspace addres, not relative offset within @vma + * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0cdbbfbc5757..70620d0dd923 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,16 @@ #include "internal.h" #include "shuffle.h" + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. 
*/ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, +struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) @@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. 
+ */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. 
+ */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. 
+ * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). 
@@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, &params); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * in a way that pages from isolated pageblock are left on pcplists. 
*/ zone_pcp_disable(zone); + lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, @@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } cond_resched(); - lru_add_drain_all(); ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { @@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + lru_cache_enable(); zone_pcp_enable(zone); /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); @@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. 
+ */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ab51132547b8..d79fa299b70c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } @@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, lookup_node() - * wil drop the mmap_lock, so after calling + * will drop the mmap_lock, so after calling * lookup_node() only "pol" remains valid, "vma" * is stale. 
*/ @@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err = 0; nodemask_t tmp; - migrate_prep(); + lru_cache_disable(); mmap_read_lock(mm); @@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { - int s,d; + int s, d; int source = NUMA_NO_NODE; int dest = 0; @@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, break; } mmap_read_unlock(mm); + + lru_cache_enable(); if (err < 0) return err; return busy; @@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - migrate_prep(); + lru_cache_disable(); } { NODEMASK_SCRATCH(scratch); @@ -1371,6 +1373,8 @@ up_out: mmap_write_unlock(mm); mpol_out: mpol_put(new); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + lru_cache_enable(); return err; } @@ -1863,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->v.nodes is intersect with node_states[N_MEMORY]. - * so if the following test faile, it implies + * so if the following test fails, it implies * policy->v.nodes has movable memory only. */ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) @@ -2094,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) * * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default * policy. Otherwise, check for intersection between mask and the policy - * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' * policy, always return true since it may allocate elsewhere on fallback. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
@@ -2140,7 +2144,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, { struct page *page; - page = __alloc_pages(gfp, order, nid); + page = __alloc_pages(gfp, order, nid, NULL); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; @@ -2153,30 +2157,22 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_pages_vma - Allocate a page for a VMA. - * - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. + * alloc_pages_vma - Allocate a page for a VMA. + * @gfp: GFP flags. + * @order: Order of the GFP allocation. + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: For hugepages try only the preferred node if possible. * - * @order:Order of the GFP allocation. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. - * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible + * Allocate a page for a specific address in @vma, using the appropriate + * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock + * of the mm_struct of the VMA to prevent it from going away. Should be + * used for all allocations for pages that will be mapped into user space. * - * This function allocates a page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must read-lock the mmap_lock of the - * mm_struct of the VMA to prevent it from going away. 
Should be used for - * all allocations for pages that will be mapped into user space. Returns - * NULL when no page can be allocated. + * Return: The page on success or NULL if allocation fails. */ -struct page * -alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, +struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; @@ -2237,7 +2233,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); + page = __alloc_pages(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return page; @@ -2245,21 +2241,20 @@ out: EXPORT_SYMBOL(alloc_pages_vma); /** - * alloc_pages_current - Allocate pages. + * alloc_pages - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. * - * @gfp: - * %GFP_USER user allocation, - * %GFP_KERNEL kernel allocation, - * %GFP_HIGHMEM highmem allocation, - * %GFP_FS don't call back into a file system. - * %GFP_ATOMIC don't sleep. - * @order: Power of two of allocation size in pages. 0 is a single page. + * Allocate 1 << @order contiguous pages. The physical address of the + * first page is naturally aligned (eg an order-3 allocation will be aligned + * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current + * process is honoured when in process context. * - * Allocate a page from the kernel page pool. When not in - * interrupt context and apply the current process NUMA policy. - * Returns NULL when no page can be allocated. + * Context: Can be called from any context, providing the appropriate GFP + * flags are used. + * Return: The page on success or NULL if allocation fails. 
*/ -struct page *alloc_pages_current(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; @@ -2274,13 +2269,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else - page = __alloc_pages_nodemask(gfp, order, + page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); return page; } -EXPORT_SYMBOL(alloc_pages_current); +EXPORT_SYMBOL(alloc_pages); int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { @@ -2457,14 +2452,11 @@ static void sp_free(struct sp_node *n) * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's - * node id. - * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). + * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. + * + * Return: -1 if the page is in a node that is valid for this policy, or a + * suitable node ID to allocate a replacement page from. 
*/ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/mempool.c b/mm/mempool.c index 79959fac27d7..a258cf4de575 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) - kasan_free_pages(element, (unsigned long)pool->pool_data); + kasan_free_pages(element, (unsigned long)pool->pool_data, false); } static void kasan_unpoison_element(mempool_t *pool, void *element) @@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) - kasan_alloc_pages(element, (unsigned long)pool->pool_data); + kasan_alloc_pages(element, (unsigned long)pool->pool_data, false); } static __always_inline void add_element(mempool_t *pool, void *element) @@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); diff --git a/mm/memremap.c b/mm/memremap.c index 7aa7d6e80ee5..15a074ffb8d7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2015 Intel Corporation. All rights reserved. 
*/ #include <linux/device.h> #include <linux/io.h> diff --git a/mm/migrate.c b/mm/migrate.c index 62b81d5257aa..41ff2c9896c4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -57,28 +57,6 @@ #include "internal.h" -/* - * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is - * undesirable, use migrate_prep_local() - */ -void migrate_prep(void) -{ - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); -} - -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ -void migrate_prep_local(void) -{ - lru_add_drain(); -} - int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -140,15 +118,10 @@ out: return -EBUSY; } -/* It should be called on page which is PG_movable */ -void putback_movable_page(struct page *page) +static void putback_movable_page(struct page *page) { struct address_space *mapping; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - mapping = page_mapping(page); mapping->a_ops->putback_page(page); __ClearPageIsolated(page); @@ -322,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, goto out; page = migration_entry_to_page(entry); + page = compound_head(page); /* * Once page cache replacement of page migration started, page_count @@ -1375,7 +1349,7 @@ out_unlock: out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); - else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + else if (rc != -EAGAIN) list_move_tail(&hpage->lru, ret); /* @@ -1445,6 +1419,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int rc, nr_subpages; LIST_HEAD(ret_pages); + 
trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -1617,7 +1593,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); + new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask); if (new_page && PageTransHuge(new_page)) prep_transhuge_page(new_page); @@ -1769,7 +1745,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, int start, i; int err = 0, err1; - migrate_prep(); + lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; @@ -1838,6 +1814,7 @@ out_flush: if (err >= 0) err = err1; out: + lru_cache_enable(); return err; } @@ -2110,17 +2087,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -static inline bool is_shared_exec_page(struct vm_area_struct *vma, - struct page *page) -{ - if (page_mapcount(page) != 1 && - (page_is_file_lru(page) || vma_is_shmem(vma)) && - (vma->vm_flags & VM_EXEC)) - return true; - - return false; -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -2138,7 +2104,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. 
*/ - if (is_shared_exec_page(vma, page)) + if (page_mapcount(page) != 1 && page_is_file_lru(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -2193,9 +2160,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; - if (is_shared_exec_page(vma, page)) - goto out; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2307,7 +2271,6 @@ out_fail: out_unlock: unlock_page(page); -out: put_page(page); return 0; } @@ -2316,44 +2279,38 @@ out: #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_hole(unsigned long start, +static int migrate_vma_collect_skip(unsigned long start, unsigned long end, - __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; - /* Only allow populating anonymous memory. */ - if (!vma_is_anonymous(walk->vma)) { - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = 0; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - } - return 0; - } - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; + migrate->src[migrate->npages++] = 0; } return 0; } -static int migrate_vma_collect_skip(unsigned long start, +static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; + /* Only allow populating anonymous memory. 
*/ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; + migrate->npages++; + migrate->cpages++; } return 0; @@ -2823,11 +2780,11 @@ restore: * * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing the caller to allocate device memory for those unback virtual - * address. For this the caller simply has to allocate device memory and + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and * properly set the destination entry like for regular migration. Note that - * this can still fails and thus inside the device driver must check if the - * migration was successful for those entries after calling migrate_vma_pages() + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), * just like for regular migration. * * After that, the callers must call migrate_vma_pages() to go over each entry @@ -2973,6 +2930,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. 
+ */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } } else { entry = mk_pte(page, vma->vm_page_prot); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..df590fda5688 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; - struct vm_area_struct * vma, * prev; + struct vm_area_struct *vma, *prev; int error; VM_BUG_ON(offset_in_page(start)); @@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - struct vm_area_struct * vma, * prev = NULL; + struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/mm_init.c b/mm/mm_init.c index 8e02e865cc65..9ddaf0e1b0ab 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -19,10 +19,6 @@ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; -#ifndef SECTIONS_SHIFT -#define SECTIONS_SHIFT 0 -#endif - /* The zonelists are simply reported, validation is manual. 
*/ void __init mminit_verify_zonelist(void) { diff --git a/mm/mmap.c b/mm/mmap.c index 3f287599a7a3..0584e540246e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,6 +93,12 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -606,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long nr_pages = 0; struct vm_area_struct *vma; - /* Find first overlaping mapping */ + /* Find first overlapping mapping */ vma = find_vma_intersection(mm, addr, end); if (!vma) return 0; @@ -2869,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas - * will remain splitted, but userland will get a + * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first @@ -3023,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) { - struct vm_area_struct *tmp; + if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - /* drop PG_Mlocked flag for over-mapped range */ - for (tmp = vma; tmp->vm_start >= start + size; - tmp = tmp->vm_next) { - /* - * Split pmd and munlock page on the border - * of the range. 
- */ - vma_adjust_trans_huge(tmp, start, start + size, 0); - - munlock_vma_pages_range(tmp, - max(tmp->vm_start, start), - min(tmp->vm_end, start + size)); - } - } - file = get_file(vma->vm_file); ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); @@ -3403,14 +3393,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma, - unsigned long flags) +static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mprotect.c b/mm/mprotect.c index 94188df1ee55..e7a443157988 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) mmap_write_unlock(current->mm); /* - * We could provie warnings or errors if any VMA still + * We could provide warnings or errors if any VMA still * has the pkey set here. 
*/ return ret; diff --git a/mm/mremap.c b/mm/mremap.c index ec8f840399ed..47c255b60150 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma, flags); + err = vma->vm_ops->mremap(new_vma); } if (unlikely(err)) { @@ -653,8 +653,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } - if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || - vma->vm_flags & VM_SHARED)) + if ((flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); if (is_vm_hugetlb_page(vma)) @@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, * So, to avoid such scenario we can pre-compute if the whole * operation has high chances to success map-wise. * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmaping it. + * split in 3 before unmapping it. * That means 2 more maps (1 for each) to the ones we already hold. * Check whether current map count plus 2 still leads us to 4 maps below * the threshold, otherwise return -ENOMEM here to be more safe. diff --git a/mm/msync.c b/mm/msync.c index 69c6d2029531..137d1c104f3e 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out; /* * If the interval [start,end) covers some unmapped address ranges, - * just ignore them, but return -ENOMEM at the end. + * just ignore them, but return -ENOMEM at the end. Besides, if the + * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM + * anyway and there is nothing left to do, so return immediately. 
*/ mmap_read_lock(mm); vma = find_vma(mm, start); @@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { + if (flags == MS_ASYNC) + goto out_unlock; start = vma->vm_start; if (start >= end) goto out_unlock; diff --git a/mm/nommu.c b/mm/nommu.c index 5c9ab799c0e6..85a3a68dffb6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count) return count; } -long vwrite(char *buf, char *addr, unsigned long count) -{ - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - memcpy(addr, buf, count); - return count; -} - /* * vmalloc - allocate virtually contiguous memory * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fa1cf18bac97..eefd3f5fde46 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) #ifdef CONFIG_NUMA /** - * oom_cpuset_eligible() - check task eligiblity for kill + * oom_cpuset_eligible() - check task eligibility for kill * @start: task struct of which task to consider * @oc: pointer to struct oom_control * @@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) if (oom_group) { mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, - (void*)message); + (void *)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9e35b636a393..0062d5c57d41 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1806,7 +1806,7 @@ pause: break; /* - * In the case of an unresponding NFS server and the NFS dirty + * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. 
* @@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping, * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no - * real expectation of this data interity operation + * real expectation of this data integrity operation * even if there is now a new, dirty page at the same * pagecache address. */ @@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; - struct lruvec *lruvec; int ret; - memcg = lock_page_memcg(page); - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_lruvec_state(lruvec, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - __unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc72873961d..ef2265f86b91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,7 +72,6 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> - #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -108,6 +107,17 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. 
+ * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -167,10 +177,10 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_FALSE(init_on_alloc); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); -DEFINE_STATIC_KEY_FALSE(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); static bool _init_on_alloc_enabled_early __read_mostly @@ -384,10 +394,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. 
*/ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + if (static_branch_unlikely(&deferred_pages)) + return; + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -438,7 +453,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) +{ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); +} static inline bool early_page_uninitialised(unsigned long pfn) { @@ -764,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, */ void init_mem_debugging_and_hardening(void) { + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
+ */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + if (_init_on_alloc_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_alloc\n"); else static_branch_enable(&init_on_alloc); } if (_init_on_free_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_free\n"); else static_branch_enable(&init_on_free); } -#ifdef CONFIG_PAGE_POISONING - /* - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. - */ - if (page_poisoning_enabled() || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())) - static_branch_enable(&_page_poisoning_enabled); -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -867,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations polluate a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. 
@@ -1103,7 +1129,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page_memcg(page) | + page->memcg_data | #endif (page->flags & check_flags))) return false; @@ -1128,7 +1154,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page_memcg(page))) + if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1216,9 +1242,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool init; VM_BUG_ON_PAGE(PageTail(page), page); @@ -1276,16 +1303,21 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order); /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order); + init = want_init_on_free(); + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + kasan_free_nondeferred_pages(page, order, init, fpi_flags); /* * arch_free_page() can make the page's contents inaccessible. 
s390 @@ -1307,7 +1339,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1327,9 +1359,9 @@ static bool bulkfree_pcp_prepare(struct page *page) static bool free_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, 0, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1537,7 +1569,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1574,7 +1606,7 @@ void __free_pages_core(struct page *page, unsigned int order) * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. */ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -2292,17 +2324,32 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init; + set_page_private(page, 0); set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); - kasan_alloc_pages(page, order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. 
+ */ kernel_unpoison_pages(page, 1 << order); - set_page_owner(page, order, gfp_flags); - if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + */ + init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + kasan_alloc_pages(page, order, init); + if (init && !kasan_has_integrated_init()) kernel_init_free_pages(page, 1 << order); + + set_page_owner(page, order, gfp_flags); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2386,19 +2433,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * boundary. If alignment is required, use move_freepages_block() */ static int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, + unsigned long start_pfn, unsigned long end_pfn, int migratetype, int *num_movable) { struct page *page; + unsigned long pfn; unsigned int order; int pages_moved = 0; - for (page = start_page; page <= end_page;) { - if (!pfn_valid_within(page_to_pfn(page))) { - page++; + for (pfn = start_pfn; pfn <= end_pfn;) { + if (!pfn_valid_within(pfn)) { + pfn++; continue; } + page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2408,8 +2457,7 @@ static int move_freepages(struct zone *zone, if (num_movable && (PageLRU(page) || __PageMovable(page))) (*num_movable)++; - - page++; + pfn++; continue; } @@ -2419,7 +2467,7 @@ static int move_freepages(struct zone *zone, order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); - page += 1 << order; + pfn += 1 << order; pages_moved += 1 << order; } @@ -2429,25 +2477,22 @@ static int move_freepages(struct zone *zone, int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable) { - unsigned long start_pfn, end_pfn; - struct page 
*start_page, *end_page; + unsigned long start_pfn, end_pfn, pfn; if (num_movable) *num_movable = 0; - start_pfn = page_to_pfn(page); - start_pfn = start_pfn & ~(pageblock_nr_pages-1); - start_page = pfn_to_page(start_pfn); - end_page = start_page + pageblock_nr_pages - 1; + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) - start_page = page; + start_pfn = pfn; if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype, + return move_freepages(zone, start_pfn, end_pfn, migratetype, num_movable); } @@ -2731,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock - * in this loop althoug we changed the pageblock type + * in this loop although we changed the pageblock type * from highatomic to ac->migratetype. So we should * adjust the count once. */ @@ -2908,7 +2953,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2931,7 +2976,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2940,12 +2985,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. 
Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA @@ -3035,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work) * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is allright but we also have to make sure to not move to + * cpu which is alright but we also have to make sure to not move to * a different one. */ preempt_disable(); @@ -3415,7 +3460,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3813,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; } -static inline unsigned int current_alloc_flags(gfp_t gfp_mask, - unsigned int alloc_flags) +/* Must be called after current_gfp_context() which can change gfp_mask */ +static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, + unsigned int alloc_flags) { #ifdef CONFIG_CMA - unsigned int pflags = current->flags; - - if (!(pflags & PF_MEMALLOC_NOCMA) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; - #endif return alloc_flags; } @@ -3922,7 +3965,7 @@ retry: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (node_reclaim_mode == 0 || + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; @@ -4130,7 +4173,7 @@ out: } /* - * Maximum number of 
compaction retries wit a progress before OOM + * Maximum number of compaction retries with a progress before OOM * killer is consider as the only way to move forward. */ #define MAX_COMPACT_RETRIES 16 @@ -4158,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + if (*compact_result == COMPACT_SKIPPED) + return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall @@ -4478,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); return alloc_flags; } @@ -4780,7 +4825,7 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4921,7 +4966,7 @@ got_pg: static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, - struct alloc_context *ac, gfp_t *alloc_mask, + struct alloc_context *ac, gfp_t *alloc_gfp, unsigned int *alloc_flags) { ac->highest_zoneidx = gfp_zone(gfp_mask); @@ -4930,7 +4975,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { - *alloc_mask |= __GFP_HARDWALL; + *alloc_gfp |= __GFP_HARDWALL; /* * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. 
@@ -4949,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4966,15 +5011,164 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. + * + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int nr_populated = 0; + + if (unlikely(nr_pages <= 0)) + return 0; + + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. 
+ */ + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + nr_populated++; + + /* Already populated array? */ + if (unlikely(page_array && nr_pages - nr_populated == 0)) + return 0; + + /* Use the single page allocator for one page. */ + if (nr_pages - nr_populated == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. 
+ */ + if (unlikely(!zone)) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (unlikely(!page)) { + /* Try and get at least one page */ + if (!nr_populated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + local_irq_restore(flags); + + return nr_populated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + return nr_populated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + +/* * This is the 'heart' of the zoned buddy allocator. 
*/ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; /* @@ -4982,33 +5176,36 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * so bail out early if the request is out of bound. */ if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } - gfp_mask &= gfp_allowed_mask; - alloc_mask = gfp_mask; - if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) + gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. + */ + gfp = current_gfp_context(gfp); + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, + &alloc_gfp, &alloc_flags)) return NULL; /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); /* First allocation attempt */ - page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page)) goto out; - /* - * Apply scoped allocation constraints. This is mainly about GFP_NOFS - * resp. 
GFP_NOIO which has to be inherited for all allocation requests - * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. - */ - alloc_mask = current_gfp_context(gfp_mask); + alloc_gfp = gfp; ac.spread_dirty_pages = false; /* @@ -5017,20 +5214,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, */ ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); + page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; } - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; } -EXPORT_SYMBOL(__alloc_pages_nodemask); +EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned @@ -5736,7 +5933,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) static int __parse_numa_zonelist_order(char *s) { /* - * We used to support different zonlists modes but they turned + * We used to support different zonelists modes but they turned * out to be just not useful. Let's keep the warning in place * if somebody still use the cmd line parameter so that we do * not fail it silently @@ -7477,7 +7674,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } /* - * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. 
For * such cases we allow max_zone_pfn sorted in the descending order */ bool __weak arch_has_descending_max_zone_pfns(void) @@ -7689,7 +7886,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -void __init mem_init_print_info(const char *str) +void __init mem_init_print_info(void) { unsigned long physpages, codesize, datasize, rosize, bss_size; unsigned long init_code_size, init_data_size; @@ -7728,17 +7925,17 @@ void __init mem_init_print_info(const char *str) #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif - "%s%s)\n", + ")\n", nr_free_pages() << (PAGE_SHIFT - 10), physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10) #ifdef CONFIG_HIGHMEM - totalhigh_pages() << (PAGE_SHIFT - 10), + , totalhigh_pages() << (PAGE_SHIFT - 10) #endif - str ? ", " : "", str ? str : ""); + ); } /** @@ -8222,6 +8419,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8289,6 +8487,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8305,7 +8504,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? 
"vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; @@ -8450,6 +8649,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +/* Usage: See admin-guide/dynamic-debug-howto.rst */ +static void alloc_contig_dump_pages(struct list_head *page_list) +{ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); + + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { + struct page *page; + + dump_stack(); + list_for_each_entry(page, page_list, lru) + dump_page(page, "migration failure"); + } +} +#else +static inline void alloc_contig_dump_pages(struct list_head *page_list) +{ +} +#endif + /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -8464,7 +8684,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - migrate_prep(); + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8474,14 +8694,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc, pfn, end); - if (!pfn) { - ret = -EINTR; + ret = isolate_migratepages_range(cc, pfn, end); + if (ret && ret != -EAGAIN) break; - } + pfn = cc->migrate_pfn; tries = 0; } else if (++tries == 5) { - ret = ret < 0 ? ret : -EBUSY; + ret = -EBUSY; break; } @@ -8491,8 +8710,18 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + + /* + * On -ENOMEM, migrate_pages() bails out right away. It is pointless + * to retry again over this error, so do the same here. 
+ */ + if (ret == -ENOMEM) + break; } + + lru_cache_enable(); if (ret < 0) { + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } @@ -8503,7 +8732,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlaying pageblocks (either + * @migratetype: migratetype of the underlying pageblocks (either * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. @@ -8583,7 +8812,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; - ret =0; + ret = 0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES @@ -8602,8 +8831,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, * isolated thus they won't get removed from buddy. */ - lru_add_drain_all(); - order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { @@ -8629,8 +8856,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. 
*/ if (test_pages_isolated(outer_start, end, 0)) { - pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", - __func__, outer_start, end); ret = -EBUSY; goto done; } @@ -8680,12 +8905,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; } return true; } @@ -8757,9 +8976,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8767,13 +8986,13 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); /* * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalulated. + * page high values need to be recalculated. 
*/ void __meminit zone_pcp_update(struct zone *zone) { @@ -8805,12 +9024,9 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { - unsigned long flags; int cpu; struct per_cpu_pageset *pset; - /* avoid races with drain_pages() */ - local_irq_save(flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -8819,7 +9035,6 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -8947,6 +9162,8 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } diff --git a/mm/page_counter.c b/mm/page_counter.c index c6860f51b6c6..7d83641eb86b 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_protected_usage(counter, new); /* More uncharges than charges? 
*/ - WARN_ON_ONCE(new < 0); + if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", + new, nr_pages)) { + new = 0; + atomic_long_set(&counter->usage, new); + } + propagate_protected_usage(counter, new); } /** diff --git a/mm/page_owner.c b/mm/page_owner.c index d15c7c4994f5..adfabb560eb9 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -27,6 +27,7 @@ struct page_owner { depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; + u64 free_ts_nsec; pid_t pid; }; @@ -41,13 +42,7 @@ static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { - if (!buf) - return -EINVAL; - - if (strcmp(buf, "on") == 0) - page_owner_enabled = true; - - return 0; + return kstrtobool(buf, &page_owner_enabled); } early_param("page_owner", early_page_owner_param); @@ -103,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -static inline bool check_recursive_alloc(unsigned long *entries, - unsigned int nr_entries, - unsigned long ip) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (entries[i] == ip) - return true; - } - return false; -} - static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; - nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); - /* - * We need to check recursion here because our request to - * stackdepot could trigger memory allocation to save new - * entry. New memory allocation would reach here and call - * stack_depot_save_entries() again if we don't catch it. There is - * still not enough memory in stackdepot so it would try to - * allocate memory again and loop forever. + * Avoid recursion. 
+ * + * Sometimes page metadata allocation tracking requires more + * memory to be allocated: + * - when new stack trace is saved to stack depot + * - when backtrace itself is calculated (ia64) */ - if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) + if (current->in_page_owner) return dummy_handle; + current->in_page_owner = 1; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + current->in_page_owner = 0; return handle; } @@ -146,25 +129,27 @@ void __reset_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext; - depot_stack_handle_t handle = 0; + depot_stack_handle_t handle; struct page_owner *page_owner; - - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + u64 free_ts_nsec = local_clock(); page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; + + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } -static inline void __set_page_owner_handle(struct page *page, - struct page_ext *page_ext, depot_stack_handle_t handle, - unsigned int order, gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) { struct page_owner *page_owner; int i; @@ -194,7 +179,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -243,11 +228,12 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->handle 
= old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed * after migration. Until then, the info can be useful in case of - * a bug, and the overal stats will be off a bit only temporarily. + * a bug, and the overall stats will be off a bit only temporarily. * Also, migrate_misplaced_transhuge_page() can still fail the * migration and then we want the oldpage to retain the info. But * in that case we also don't need to explicitly clear the info from @@ -356,10 +342,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec); + page_owner->ts_nsec, page_owner->free_ts_nsec); if (ret >= count) goto err; @@ -435,9 +421,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec); + page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { @@ -612,7 +598,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page, page_ext, early_handle, + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } diff --git a/mm/page_poison.c b/mm/page_poison.c index 
655dc5895604..98438985e1ed 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/mmdebug.h> #include <linux/highmem.h> #include <linux/page_ext.h> #include <linux/poison.h> @@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b) return error && !(error & (error - 1)); } -static void check_poison_mem(unsigned char *mem, size_t bytes) +static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes) { static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; @@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); dump_stack(); + dump_page(page, "pagealloc: corrupted page details"); } static void unpoison_page(struct page *page) @@ -83,7 +85,7 @@ static void unpoison_page(struct page *page) * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned. */ - check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 86e3a3688d59..a4435311754b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) return pfn_is_match(pvmw->page, pfn); } +static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) +{ + pvmw->address = (pvmw->address + size) & ~(size - 1); + if (!pvmw->address) + pvmw->address = ULONG_MAX; +} + /** * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at * @pvmw->address @@ -134,7 +141,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) * regardless of which page table level the page is mapped at. @pvmw->pmd is * NULL. 
* - * Retruns false if there are no more page table entries for the page in + * Returns false if there are no more page table entries for the page in * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. * * If you need to stop the walk before page_vma_mapped_walk() returned false, @@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { struct mm_struct *mm = pvmw->vma->vm_mm; struct page *page = pvmw->page; + unsigned long end; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (pvmw->pte) - goto next_pte; + if (unlikely(PageHuge(page))) { + /* The only possible mapping was handled on last iteration */ + if (pvmw->pte) + return not_found(pvmw); - if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); if (!pvmw->pte) @@ -168,78 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return not_found(pvmw); return true; } -restart: - pgd = pgd_offset(mm, pvmw->address); - if (!pgd_present(*pgd)) - return false; - p4d = p4d_offset(pgd, pvmw->address); - if (!p4d_present(*p4d)) - return false; - pud = pud_offset(p4d, pvmw->address); - if (!pud_present(*pud)) - return false; - pvmw->pmd = pmd_offset(pud, pvmw->address); + /* - * Make sure the pmd value isn't cached in a register by the - * compiler and used as a stale value after we've observed a - * subsequent update. + * Seek to next pte only makes sense for THP. + * But more important than that optimization, is to filter out + * any PageKsm page: whose page->index misleads vma_address() + * and vma_address_end() to disaster. 
*/ - pmde = READ_ONCE(*pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { - pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (likely(pmd_trans_huge(*pvmw->pmd))) { - if (pvmw->flags & PVMW_MIGRATION) - return not_found(pvmw); - if (pmd_page(*pvmw->pmd) != page) - return not_found(pvmw); - return true; - } else if (!pmd_present(*pvmw->pmd)) { - if (thp_migration_supported()) { - if (!(pvmw->flags & PVMW_MIGRATION)) + end = PageTransCompound(page) ? + vma_address_end(page, pvmw->vma) : + pvmw->address + PAGE_SIZE; + if (pvmw->pte) + goto next_pte; +restart: + do { + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) { + step_forward(pvmw, PGDIR_SIZE); + continue; + } + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) { + step_forward(pvmw, P4D_SIZE); + continue; + } + pud = pud_offset(p4d, pvmw->address); + if (!pud_present(*pud)) { + step_forward(pvmw, PUD_SIZE); + continue; + } + + pvmw->pmd = pmd_offset(pud, pvmw->address); + /* + * Make sure the pmd value isn't cached in a register by the + * compiler and used as a stale value after we've observed a + * subsequent update. 
+ */ + pmde = READ_ONCE(*pvmw->pmd); + + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + pmde = *pvmw->pmd; + if (likely(pmd_trans_huge(pmde))) { + if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { - swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + if (pmd_page(pmde) != page) + return not_found(pvmw); + return true; + } + if (!pmd_present(pmde)) { + swp_entry_t entry; - if (migration_entry_to_page(entry) != page) - return not_found(pvmw); - return true; - } + if (!thp_migration_supported() || + !(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + entry = pmd_to_swp_entry(pmde); + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; } - return not_found(pvmw); - } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); pvmw->ptl = NULL; + } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } + step_forward(pvmw, PMD_SIZE); + continue; } - } else if (!pmd_present(pmde)) { - return false; - } - if (!map_pte(pvmw)) - goto next_pte; - while (1) { + if (!map_pte(pvmw)) + goto next_pte; +this_pte: if (check_pte(pvmw)) return true; next_pte: - /* Seek to next pte only makes sense for THP */ - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) - return not_found(pvmw); do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= pvmw->vma->vm_end || - pvmw->address >= - __vma_address(pvmw->page, pvmw->vma) + - thp_size(pvmw->page)) + if (pvmw->address >= end) return not_found(pvmw); /* Did we cross page table boundary? 
*/ - if (pvmw->address % PMD_SIZE == 0) { - pte_unmap(pvmw->pte); + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { if (pvmw->ptl) { spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } + pte_unmap(pvmw->pte); + pvmw->pte = NULL; goto restart; - } else { - pvmw->pte++; + } + pvmw->pte++; + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); } } while (pte_none(*pvmw->pte)); @@ -247,7 +286,10 @@ next_pte: pvmw->ptl = pte_lockptr(mm, pvmw->pmd); spin_lock(pvmw->ptl); } - } + goto this_pte; + } while (pvmw->address < end); + + return false; } /** @@ -266,14 +308,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .vma = vma, .flags = PVMW_SYNC, }; - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + pvmw.address = vma_address(page, vma); + if (pvmw.address == -EFAULT) return 0; - pvmw.address = max(start, vma->vm_start); if (!page_vma_mapped_walk(&pvmw)) return 0; page_vma_mapped_walk_done(&pvmw); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 095d7eaa0db4..ae26b118e246 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -170,7 +170,7 @@ struct percpu_stats { u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ - size_t min_alloc_size; /* min allocaiton size */ + size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index e46f7a6917f9..8d3844bc0c7c 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -8,6 +8,7 @@ * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. 
*/ +#include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) @@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { - unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); + vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** @@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { - return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, - PAGE_KERNEL, pages); + return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), + PAGE_KERNEL, pages, PAGE_SHIFT); } /** diff --git a/mm/percpu.c b/mm/percpu.c index 23308113a5ff..f99e9306b939 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1862,7 +1862,7 @@ fail: pr_info("limit reached, disable warning\n"); } if (is_atomic) { - /* see the flag handling in pcpu_blance_workfn() */ + /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h index 1dcc865029a2..e9e879de8649 100644 --- a/mm/pgalloc-track.h +++ b/mm/pgalloc-track.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_PGALLLC_TRACK_H -#define _LINUX_PGALLLC_TRACK_H +#ifndef _LINUX_PGALLOC_TRACK_H +#define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, @@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) -#endif /* _LINUX_PGALLLC_TRACK_H */ +#endif /* _LINUX_PGALLOC_TRACK_H */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c2210e1cdb51..4e640baf9794 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ 
-135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_present(*pmdp)); - /* Below assumes pmd_present() is true */ - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index f5fee9cf90f8..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> -#include <linux/compat.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> diff --git a/mm/readahead.c b/mm/readahead.c index c5b0457415be..d589f147f4c2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -198,8 +198,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i); - BUG_ON(index + i != ractl->_index + ractl->_nr_pages); - if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch @@ -210,6 +208,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * not worth getting one just for that. */ read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } @@ -223,6 +222,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, gfp_mask) < 0) { put_page(page); read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) @@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl, * memory at once. 
*/ void force_page_cache_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long nr_to_read) + unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; @@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping, * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, - struct file_ra_state *ra, bool hit_readahead_marker, - unsigned long req_size) + bool hit_readahead_marker, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); + struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; unsigned long index = readahead_index(ractl); @@ -550,7 +551,7 @@ readit: } void page_cache_sync_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long req_count) + unsigned long req_count) { bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); @@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, * read-ahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. 
*/ - if (!ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; @@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl, /* be dumb */ if (do_forced_ra) { - force_page_cache_ra(ractl, ra, req_count); + force_page_cache_ra(ractl, req_count); return; } /* do read-ahead */ - ondemand_readahead(ractl, ra, false, req_count); + ondemand_readahead(ractl, false, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct file_ra_state *ra, struct page *page, - unsigned long req_count) + struct page *page, unsigned long req_count) { /* no read-ahead */ - if (!ra->ra_pages) + if (!ractl->ra->ra_pages) return; /* @@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl, return; /* do read-ahead */ - ondemand_readahead(ractl, ra, true, req_count); + ondemand_readahead(ractl, true, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); @@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } + +/** + * readahead_expand - Expand a readahead request + * @ractl: The request to be expanded + * @new_start: The revised start + * @new_len: The revised size of the request + * + * Attempt to expand a readahead request outwards from the current size to the + * specified size by inserting locked pages before and after the current window + * to increase the size to the new window. This may involve the insertion of + * THPs, in which case the window may get expanded even beyond what was + * requested. + * + * The algorithm will stop if it encounters a conflicting page already in the + * pagecache and leave a smaller expansion than requested. + * + * The caller must check for this by examining the revised @ractl object for a + * different expansion than was requested. 
+ */ +void readahead_expand(struct readahead_control *ractl, + loff_t new_start, size_t new_len) +{ + struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; + pgoff_t new_index, new_nr_pages; + gfp_t gfp_mask = readahead_gfp_mask(mapping); + + new_index = new_start / PAGE_SIZE; + + /* Expand the leading edge downwards */ + while (ractl->_index > new_index) { + unsigned long index = ractl->_index - 1; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + + ractl->_nr_pages++; + ractl->_index = page->index; + } + + new_len += new_start - readahead_pos(ractl); + new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); + + /* Expand the trailing edge upwards */ + while (ractl->_nr_pages < new_nr_pages) { + unsigned long index = ractl->_index + ractl->_nr_pages; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + ractl->_nr_pages++; + if (ra) { + ra->size++; + ra->async_size++; + } + } +} +EXPORT_SYMBOL(readahead_expand); diff --git a/mm/rmap.c b/mm/rmap.c index b0fc27e77d6d..e05c300048e6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and + * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and * anon_vma_fork(). 
The first three want an exact copy of src, while the last * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, @@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping) { - if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) - return -EFAULT; - } else + } else if (!vma->vm_file) { return -EFAULT; - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { return -EFAULT; - return address; + } + + return vma_address(page, vma); } pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) @@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + page_size(page))); + vma_address_end(page, vma)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. 
+ */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; @@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ + range.end = PageKsm(page) ? + address + PAGE_SIZE : vma_address_end(page, vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - address, - min(vma->vm_end, address + page_size(page))); + address, range.end); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted @@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? true : false; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. 
+ */ + return !page_mapcount(page); } /** @@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) @@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..5d46611cba8d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2258,25 +2258,11 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; - if (info->seals & F_SEAL_FUTURE_WRITE) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. 
- */ - if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); - } + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; /* arm64 - allow memory tagging on RAM-based files */ vma->vm_flags |= VM_MTE_ALLOWED; @@ -2375,8 +2361,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t offset, max_off; ret = -ENOMEM; - if (!shmem_inode_acct_block(inode, 1)) + if (!shmem_inode_acct_block(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller. + */ + if (unlikely(*pagep)) { + put_page(*pagep); + *pagep = NULL; + } goto out; + } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); @@ -2846,6 +2842,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; } @@ -3505,7 +3504,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data) } } if (*this_char) { - char *value = strchr(this_char,'='); + char *value = strchr(this_char, '='); size_t len = 0; int err; diff --git a/mm/shuffle.h b/mm/shuffle.h index 71b784f0b7c3..cec62984f7d3 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -10,7 +10,7 @@ DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); extern void __shuffle_free_memory(pg_data_t *pgdat); extern bool shuffle_pick_tail(void); -static inline void shuffle_free_memory(pg_data_t *pgdat) +static inline void __meminit shuffle_free_memory(pg_data_t *pgdat) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) return; @@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat) } extern void __shuffle_zone(struct zone *z); -static inline void shuffle_zone(struct zone *z) +static inline void __meminit shuffle_zone(struct zone *z) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) return; diff --git 
a/mm/slab.c b/mm/slab.c index ae651bf540b7..d0f725637663 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define BATCHREFILL_LIMIT 16 /* - * Optimization question: fewer reaps means less probability for unnessary + * Optimization question: fewer reaps means less probability for unnecessary * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, @@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep) * Because if it is the case, that means we defer the creation of * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. * And we eventually call down to __kmem_cache_create(), which - * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. * This is a "chicken-and-egg" problem. * * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, @@ -2381,8 +2381,8 @@ union freelist_init_state { }; /* - * Initialize the state based on the randomization methode available. - * return true if the pre-computed list is available, false otherwize. + * Initialize the state based on the randomization method available. + * return true if the pre-computed list is available, false otherwise. 
*/ static bool freelist_state_initialize(union freelist_init_state *state, struct kmem_cache *cachep, @@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ void *ptr; int slab_node = numa_mem_id(); struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) - memset(ptr, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); + slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); return ptr; } @@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo unsigned long save_flags; void *objp; struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) - memset(objp, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); return objp; } @@ -3427,17 +3425,24 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + bool init; + if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); __kfence_free(objp); return; } - if (unlikely(slab_want_init_on_free(cachep))) + /* + * As 
memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset must be + * kept together to avoid discrepancies in behavior. + */ + init = slab_want_init_on_free(cachep); + if (init && !kasan_has_integrated_init()) memset(objp, 0, cachep->object_size); - - /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp)) + /* KASAN might put objp into memory quarantine, delaying its reuse. */ + if (kasan_slab_free(cachep, objp, init)) return; /* Use KCSAN to help debug racy use-after-free. */ @@ -3542,18 +3547,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); - /* Clear memory outside IRQ disabled section */ - if (unlikely(slab_want_init_on_alloc(flags, s))) - for (i = 0; i < size; i++) - memset(p[i], 0, s->object_size); - - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled section. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); /* FIXME: Trace call missing. 
Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3651,6 +3656,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { struct kmem_cache *cachep; @@ -3670,6 +3676,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) if (DEBUG && cachep->flags & SLAB_STORE_USER) kpp->kp_ret = *dbg_userword(cachep, objp); } +#endif /** * __do_kmalloc - allocate memory diff --git a/mm/slab.h b/mm/slab.h index 076582f58f68..18c1927cd196 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, } static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, void **p) + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init) { size_t i; flags &= gfp_allowed_mask; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags); - /* As p[i] might get tagged, call kmemleak hook after KASAN. 
*/ + p[i] = kasan_slab_alloc(s, p[i], flags, init); + if (p[i] && init && !kasan_has_integrated_init()) + memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); } @@ -601,7 +610,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_alloc)) { + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) @@ -613,12 +623,14 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) static inline bool slab_want_init_on_free(struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_free)) + if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; } +#ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; @@ -630,5 +642,6 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; }; void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +#endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 88e833986332..7cab77655f11 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str) return 1; } +static int __init setup_slab_merge(char *str) +{ + slab_nomerge = false; + return 1; +} + #ifdef CONFIG_SLUB __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +__setup_param("slub_merge", slub_merge, setup_slab_merge, 0); #endif __setup("slab_nomerge", setup_slab_nomerge); +__setup("slab_merge", setup_slab_merge); /* * Determine the size of a slab object @@ -89,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, unsigned int size) { - if (!name || 
in_interrupt() || size < sizeof(void *) || - size > KMALLOC_MAX_SIZE) { + if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { pr_err("kmem_cache_create(%s) integrity check failed\n", name); return -EINVAL; } @@ -310,6 +317,16 @@ kmem_cache_create_usercopy(const char *name, const char *cache_name; int err; +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); +#endif + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); @@ -526,6 +543,7 @@ bool slab_is_available(void) return slab_state >= UP; } +#ifdef CONFIG_PRINTK /** * kmem_valid_obj - does the pointer reference a valid slab object? * @object: pointer to query. @@ -544,6 +562,7 @@ bool kmem_valid_obj(void *object) page = virt_to_head_page(object); return PageSlab(page); } +EXPORT_SYMBOL_GPL(kmem_valid_obj); /** * kmem_dump_obj - Print available slab provenance information @@ -600,6 +619,8 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_stack[i]); } } +EXPORT_SYMBOL_GPL(kmem_dump_obj); +#endif #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ diff --git a/mm/slob.c b/mm/slob.c index 0578429b991b..74d3f6e60666 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,11 +461,13 @@ out: spin_unlock_irqrestore(&slob_lock, flags); } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { kpp->kp_ptr = object; kpp->kp_page = page; } +#endif /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..61bd40e3eb9a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3,7 +3,7 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. 
* - * The allocator synchronizes using per slab locks or atomic operatios + * The allocator synchronizes using per slab locks or atomic operations * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> +#include <linux/swab.h> #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" @@ -160,7 +161,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #undef SLUB_DEBUG_CMPXCHG /* - * Mininum number of partial slabs. These will be left on the partial + * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ #define MIN_PARTIAL 5 @@ -301,6 +302,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); + object = kasan_reset_tag(object); freepointer_addr = (unsigned long)object + s->offset; copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); return freelist_ptr(s, p, freepointer_addr); @@ -624,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,8 +652,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, &page->flags); } @@ -706,19 +709,19 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 
print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) - print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, s->red_left_pad); else if (p > addr + 16) print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section(KERN_ERR, "Object ", p, + print_section(KERN_ERR, "Object ", p, min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section(KERN_ERR, "Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); off = get_info_end(s); @@ -730,7 +733,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section(KERN_ERR, "Padding ", p + off, + print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); dump_stack(); @@ -799,7 +802,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -832,7 +835,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at mininum + * C. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. 
* @@ -907,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { - if (!check_bytes_and_report(s, page, object, "Redzone", + if (!check_bytes_and_report(s, page, object, "Left Redzone", object - s->red_left_pad, val, s->red_left_pad)) return 0; - if (!check_bytes_and_report(s, page, object, "Redzone", + if (!check_bytes_and_report(s, page, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -926,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page, if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, page, p, "Poison", + !check_bytes_and_report(s, page, p, "End Poison", p + s->object_size - 1, POISON_END, 1))) return 0; /* @@ -1532,7 +1535,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x); } -static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, + void *x, bool init) { kmemleak_free_recursive(x, s->flags); @@ -1558,8 +1562,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); - /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x); + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. + * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. + */ + if (init) { + int rsize; + + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? 
s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + s->inuse, 0, + s->size - s->inuse - rsize); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -1569,10 +1590,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; void *old_tail = *tail ? *tail : *head; - int rsize; if (is_kfence_address(next)) { - slab_free_hook(s, next); + slab_free_hook(s, next, false); return true; } @@ -1584,20 +1604,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, object = next; next = get_freepointer(s, object); - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch - * the redzone. - */ - memset(kasan_reset_tag(object), 0, s->object_size); - rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad - : 0; - memset((char *)kasan_reset_tag(object) + s->inuse, 0, - s->size - s->inuse - rsize); - - } /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { + if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2822,6 +2830,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct page *page; unsigned long tid; struct obj_cgroup *objcg = NULL; + bool init = false; s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) @@ -2899,12 +2908,10 @@ redo: } maybe_wipe_obj_freeptr(s, object); - - if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) - memset(kasan_reset_tag(object), 0, s->object_size); + init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); return object; } @@ -3236,7 +3243,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, } if (is_kfence_address(object)) { 
- slab_free_hook(df->s, object); + slab_free_hook(df->s, object, false); __kfence_free(object); p[size] = NULL; /* mark object processed */ return size; @@ -3356,20 +3363,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c->tid = next_tid(c->tid); local_irq_enable(); - /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(slab_want_init_on_alloc(flags, s))) { - int j; - - for (j = 0; j < i; j++) - memset(kasan_reset_tag(p[j]), 0, s->object_size); - } - - /* memcg and kmem_cache debug support */ - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); return i; error: local_irq_enable(); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3390,7 +3393,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); */ /* - * Mininum / Maximum order of slab pages. This influences locking overhead + * Minimum / Maximum order of slab pages. This influences locking overhead * and slab fragmentation. A higher order reduces the number of partial slabs * and increases the number of allocations possible without having to * take the list_lock. @@ -3421,7 +3424,7 @@ static unsigned int slub_min_objects; * * Higher order allocations also allow the placement of more objects in a * slab and thereby reduce object handling overhead. If the user has - * requested a higher mininum order then we start with that one instead of + * requested a higher minimum order then we start with that one instead of * the smallest order which will fit the object. 
*/ static inline unsigned int slab_order(unsigned int size, @@ -3579,7 +3582,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; @@ -3687,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; - unsigned int freepointer_area; unsigned int order; /* @@ -3696,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the possible location of the free pointer. */ size = ALIGN(size, sizeof(void *)); - /* - * This is the area of the object where a freepointer can be - * safely written. If redzoning adds more to the inuse size, we - * can't use that portion for writing the freepointer, so - * s->offset must be limited within this for the general case. - */ - freepointer_area = size; #ifdef CONFIG_SLUB_DEBUG /* @@ -3728,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * With that we have determined the number of bytes in actual use - * by the object. This is the potential offset to the free pointer. + * by the object and redzoning. */ s->inuse = size; - if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - s->ctor)) { + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || + s->ctor) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * * This is the case if we do RCU, have a constructor or - * destructor or are poisoning the objects. + * destructor, are poisoning the objects, or are + * redzoning an object smaller than sizeof(void *). 
* * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the @@ -3749,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->offset = size; size += sizeof(void *); - } else if (freepointer_area > sizeof(void *)) { + } else { /* * Store freelist pointer near middle of object to keep * it away from the edges of the object to avoid small * sized over/underflows from neighboring allocations. */ - s->offset = ALIGN(freepointer_area / 2, sizeof(void *)); + s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG @@ -3898,7 +3895,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3963,6 +3960,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { void *base; @@ -4002,6 +4000,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) #endif #endif } +#endif /******************************************************************** * Kmalloc subsystem diff --git a/mm/sparse.c b/mm/sparse.c index 7bd23f9d6cef..55c18aff3e42 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (unlikely(!mem_section)) { unsigned long size, align; - size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); if (!mem_section) @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } +static inline phys_addr_t 
pgdat_to_phys(struct pglist_data *pgdat) +{ +#ifndef CONFIG_NEED_MULTIPLE_NODES + return __pa_symbol(pgdat); +#else + return __pa(pgdat); +#endif +} + #ifdef CONFIG_MEMORY_HOTREMOVE static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid, } usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) return; @@ -547,6 +556,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, pr_err("%s: node[%d] memory map backing failed. 
Some memory will not be available.", __func__, nid); pnum_begin = pnum; + sparse_buffer_fini(); goto failed; } check_usemap_section_nr(nid, usage); @@ -623,7 +633,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -644,7 +653,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, diff --git a/mm/swap.c b/mm/swap.c index 31b844d4ed94..dfb48cf9c2c9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -36,6 +36,7 @@ #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> +#include <linux/buffer_head.h> #include "internal.h" @@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) } } +/* return true if pagevec needs to drain */ +static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) +{ + bool ret = false; + + if (!pagevec_add(pvec, page) || PageCompound(page) || + lru_cache_disabled()) + ret = true; + + return ret; +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. 
If it still appears to be reclaimable, move it to the tail of the @@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page) get_page(page); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -343,7 +356,7 @@ static void activate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -458,7 +471,7 @@ void lru_cache_add(struct page *page) get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } @@ -483,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); /* - * We use the irq-unsafe __mod_zone_page_stat because this + * We use the irq-unsafe __mod_zone_page_state because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. 
*/ @@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + invalidate_bh_lrus_cpu(cpu); } /** @@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } @@ -676,7 +690,7 @@ void deactivate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } @@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } @@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ -void lru_add_drain_all(void) +inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number @@ -780,7 +794,7 @@ void lru_add_drain_all(void) * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ - if (unlikely(this_gen != lru_drain_gen)) + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* @@ -794,7 +808,7 @@ void lru_add_drain_all(void) * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. 
- * Assume CPU #z is is in the middle of the for_each_online_cpu loop + * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). @@ -810,12 +824,14 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (force_all_cpus || + pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu)) { + need_activate_page_drain(cpu) || + has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); @@ -828,6 +844,11 @@ void lru_add_drain_all(void) done: mutex_unlock(&lock); } + +void lru_add_drain_all(void) +{ + __lru_add_drain_all(false); +} #else void lru_add_drain_all(void) { @@ -835,6 +856,34 @@ void lru_add_drain_all(void) } #endif /* CONFIG_SMP */ +atomic_t lru_disable_count = ATOMIC_INIT(0); + +/* + * lru_cache_disable() needs to be called before we start compiling + * a list of pages to be migrated using isolate_lru_page(). + * It drains pages on LRU cache and then disable on all cpus until + * lru_cache_enable is called. + * + * Must be paired with a call to lru_cache_enable(). + */ +void lru_cache_disable(void) +{ + atomic_inc(&lru_disable_count); +#ifdef CONFIG_SMP + /* + * lru_add_drain_all in the force mode will schedule draining on + * all online CPUs so any calls of lru_cache_disabled wrapped by + * local_lock or preemption disabled would be ordered by that. 
+ * The atomic operation doesn't need to have stronger ordering + * requirements because that is enforeced by the scheduling + * guarantees. + */ + __lru_add_drain_all(true); +#else + lru_add_drain(); +#endif +} + /** * release_pages - batched put_page() * @pages: array of pages to release diff --git a/mm/swap_slots.c b/mm/swap_slots.c index be9de6d5b516..6248d1030a9b 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -16,7 +16,7 @@ * to local caches without needing to acquire swap_info * lock. We do not reuse the returned slots directly but * move them back to the global pool in a batch. This - * allows the slots to coaellesce and reduce fragmentation. + * allows the slots to coalesce and reduce fragmentation. * * The swap entry allocated is marked with SWAP_HAS_CACHE * flag in map_count that prevents it from being allocated diff --git a/mm/swap_state.c b/mm/swap_state.c index 3cdee7b11da9..272ea2108c9d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, xas_store(&xas, page); xas_next(&xas); } - address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); @@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page, xas_next(&xas); } ClearPageSwapCache(page); - if (shadow) - address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); @@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xas_store(&xas, NULL); nr_shadows++; } - address_space->nrexceptional -= nr_shadows; xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ @@ -497,16 +493,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageLocked(page); __SetPageSwapBacked(page); - /* May 
fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { - put_swap_page(page, entry); + if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) goto fail_unlock; - } - if (mem_cgroup_charge(page, NULL, gfp_mask)) { - delete_from_swap_cache(page); + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - } + + mem_cgroup_swapin_uncharge_swap(entry); if (shadow) workingset_refault(page, shadow); @@ -517,6 +511,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return page; fail_unlock: + put_swap_page(page, entry); unlock_page(page); put_page(page); return NULL; @@ -797,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf, * * Returns the struct page for entry and addr, after queueing swapin. * - * Primitive swap readahead code. We simply read in a few pages whoes + * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. 
diff --git a/mm/swapfile.c b/mm/swapfile.c index 084a5b9a18e5..996afa8131c8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free) static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { - return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); + return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* @@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v) unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } @@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in diff --git a/mm/truncate.c b/mm/truncate.c index 455944264663..234ddd879caa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping, if (xas_load(&xas) != entry) return; xas_store(&xas, NULL); - mapping->nrexceptional--; } static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, @@ -168,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
*/ -static void -truncate_cleanup_page(struct address_space *mapping, struct page *page) +static void truncate_cleanup_page(struct page *page) { - if (page_mapped(page)) { - unsigned int nr = thp_nr_pages(page); - unmap_mapping_pages(mapping, page->index, nr, false); - } + if (page_mapped(page)) + unmap_mapping_page(page); if (page_has_private(page)) do_invalidatepage(page, 0, thp_size(page)); @@ -219,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return -EIO; - truncate_cleanup_page(mapping, page); + truncate_cleanup_page(page); delete_from_page_cache(page); return 0; } @@ -295,7 +291,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; /* Offsets within partial pages */ @@ -326,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = indices[pagevec_count(&pvec) - 1] + 1; truncate_exceptional_pvec_entries(mapping, &pvec, indices); for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(mapping, pvec.pages[i]); + truncate_cleanup_page(pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec); for (i = 0; i < pagevec_count(&pvec); i++) unlock_page(pvec.pages[i]); @@ -440,9 +436,6 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrexceptional; - unsigned long nrpages; - /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the @@ -452,16 +445,7 @@ void truncate_inode_pages_final(struct address_space *mapping) */ mapping_set_exiting(mapping); - /* - * When reclaim installs eviction entries, it increases - * nrexceptional first, then decreases nrpages. Make sure we see - * this in the right order or we might miss an entry. 
- */ - nrpages = mapping->nrpages; - smp_rmb(); - nrexceptional = mapping->nrexceptional; - - if (nrpages || nrexceptional) { + if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree @@ -633,7 +617,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; pagevec_init(&pvec); @@ -652,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping, continue; } + if (!did_range_unmap && page_mapped(page)) { + /* + * If page is mapped, before taking its lock, + * zap the rest of the file in one hit. + */ + unmap_mapping_pages(mapping, index, + (1 + end - index), false); + did_range_unmap = 1; + } + lock_page(page); WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { @@ -659,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping, continue; } wait_on_page_writeback(page); - if (page_mapped(page)) { - if (!did_range_unmap) { - /* - * Zap the rest of the file in one hit. 
- */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); - did_range_unmap = 1; - } else { - /* - * Just zap this page - */ - unmap_mapping_pages(mapping, index, - 1, false); - } - } + + if (page_mapped(page)) + unmap_mapping_page(page); BUG_ON(page_mapped(page)); + ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9a3d451402d7..63a73e164d55 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + enum mcopy_atomic_mode mode) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * by THP. Since we can not reliably insert a zero page, this * feature is not supported. 
*/ - if (zeropage) { + if (mode == MCOPY_ATOMIC_ZEROPAGE) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -273,8 +273,6 @@ retry: } while (src_addr < src_start + len) { - pte_t dst_pteval; - BUG_ON(dst_addr >= dst_start + len); /* @@ -290,23 +288,23 @@ retry: mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } - err = -EEXIST; - dst_pteval = huge_ptep_get(dst_pte); - if (!huge_pte_none(dst_pteval)) { + if (mode != MCOPY_ATOMIC_CONTINUE && + !huge_pte_none(huge_ptep_get(dst_pte))) { + err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, - dst_addr, src_addr, &page); + dst_addr, src_addr, mode, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -362,38 +360,38 @@ out: * If a reservation for the page existed in the reservation * map of a private mapping, the map was modified to indicate * the reservation was consumed when the page was allocated. - * We clear the PagePrivate flag now so that the global + * We clear the HPageRestoreReserve flag now so that the global * reserve count will not be incremented in free_huge_page. * The reservation map will still indicate the reservation * was consumed and possibly prevent later page allocation. * This is better than leaking a global reservation. If no - * reservation existed, it is still safe to clear PagePrivate - * as no adjustments to reservation counts were made during - * allocation. + * reservation existed, it is still safe to clear + * HPageRestoreReserve as no adjustments to reservation counts + * were made during allocation. * * The reservation map for shared mappings indicates which * pages have reservations. 
When a huge page is allocated * for an address with a reservation, no change is made to - * the reserve map. In this case PagePrivate will be set - * to indicate that the global reservation count should be + * the reserve map. In this case HPageRestoreReserve will be + * set to indicate that the global reservation count should be * incremented when the page is freed. This is the desired * behavior. However, when a huge page is allocated for an * address without a reservation a reservation entry is added - * to the reservation map, and PagePrivate will not be set. - * When the page is freed, the global reserve count will NOT - * be incremented and it will appear as though we have leaked - * reserved page. In this case, set PagePrivate so that the - * global reserve count will be incremented to match the - * reservation map entry which was created. + * to the reservation map, and HPageRestoreReserve will not be + * set. When the page is freed, the global reserve count will + * NOT be incremented and it will appear as though we have + * leaked reserved page. In this case, set HPageRestoreReserve + * so that the global reserve count will be incremented to + * match the reservation map entry which was created. * * Note that vm_alloc_shared is based on the flags of the vma * for which the page was originally allocated. dst_vma could * be different or NULL on error. 
*/ if (vm_alloc_shared) - SetPagePrivate(page); + SetHPageRestoreReserve(page); else - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); put_page(page); } BUG_ON(copied < 0); @@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage); + enum mcopy_atomic_mode mode); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage, + enum mcopy_atomic_mode mcopy_mode, bool *mmap_changing, __u64 mode) { @@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; + bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -527,10 +526,12 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, zeropage); + src_start, len, mcopy_mode); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; + if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page @@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing, mode); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, + mmap_changing, 0); +} + 
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool *mmap_changing) +{ + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, + mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, diff --git a/mm/util.c b/mm/util.c index 54870226cea6..a8bf17f18a81 100644 --- a/mm/util.c +++ b/mm/util.c @@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); -/* - * For file cache pages, return the address_space, otherwise return NULL - */ -struct address_space *page_mapping_file(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return NULL; - return page_mapping(page); -} - /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { @@ -775,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply - * with the strict "NEVER", and to avoid possible race condtion (even + * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch @@ -983,6 +973,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) return ret; } +#ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. 
@@ -996,20 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) */ void mem_dump_obj(void *object) { + const char *type; + if (kmem_valid_obj(object)) { kmem_dump_obj(object); return; } + if (vmalloc_dump_obj(object)) return; - if (!virt_addr_valid(object)) { - if (object == NULL) - pr_cont(" NULL pointer.\n"); - else if (object == ZERO_SIZE_PTR) - pr_cont(" zero-size pointer.\n"); - else - pr_cont(" non-paged memory.\n"); - return; - } - pr_cont(" non-slab/vmalloc memory.\n"); + + if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); } +EXPORT_SYMBOL_GPL(mem_dump_obj); +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4f5f8c907897..d0a7d89be091 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,7 +34,7 @@ #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> - +#include <linux/pgtable.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -68,6 +81,218 @@ static void free_work(struct work_struct *w) } /*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) +{ + pte_t *pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = 
pte_alloc_kernel_track(pmd, addr, mask); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; + return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PMD_MODIFIED; + continue; + } + + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + return -ENOMEM; + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PUD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, 
unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc_track(&init_mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PUD_MODIFIED; + continue; + } + + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(addr, P4D_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_P4D_MODIFIED; + continue; + } + + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_range_noflush(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + 
int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + flush_cache_vmap(addr, end); + + return err; +} static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) @@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } while (p4d++, addr = next, addr != end); } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap +/* + * vunmap_range_noflush is similar to vunmap_range, but does not + * flush caches or TLBs. * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. + * The caller is responsible for calling flush_cache_vmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced). * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * This is an internal function only. Do not use outside mm/. 
*/ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +void vunmap_range_noflush(unsigned long start, unsigned long end) { - unsigned long end = start + size; unsigned long next; pgd_t *pgd; unsigned long addr = start; @@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +/** + * vunmap_range - unmap kernel virtual addresses + * @addr: start of the VM area to unmap + * @end: end of the VM area to unmap (non-inclusive) + * + * Clears any present PTEs in the virtual address range, flushes TLBs and + * caches. Any subsequent access to the address before it has been re-mapped + * is a kernel bug. + */ +void vunmap_range(unsigned long addr, unsigned long end) +{ + flush_cache_vunmap(addr, end); + vunmap_range_noflush(addr, end); + flush_tlb_kernel_range(addr, end); +} + +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t 
*p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages) +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. 
+ */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) { - int ret; + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || + page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; - ret = map_kernel_range_noflush(start, size, prot, pages); - flush_cache_vmap(start, start + size); - return ret; + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +/** + * vmap_pages_range - map pages to a kernel virtual address + * @addr: start of the VM area to map + * @end: end of the VM area to map (non-inclusive) + * @prot: page protection flags to use + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; } int is_vmalloc_or_module_addr(const void *x) @@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. 
*/ struct page *vmalloc_to_page(const void *vmalloc_addr) { @@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pgd_none(*pgd)) return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; - /* - * Don't dereference bad PUD or PMD (below) entries. This will also - * identify huge mappings, which we may encounter on architectures - * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be - * identified as vmalloc addresses by is_vmalloc_addr(), but are - * not [unambiguously] associated with a struct page, so there is - * no correct value to return for them. - */ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); @@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); + return page; } EXPORT_SYMBOL(vmalloc_to_page); @@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va) spin_unlock(&free_vmap_area_lock); } +static inline void +preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) +{ + struct vmap_area *va = NULL; + + /* + * Preload this CPU with one extra vmap_area object. 
It is used + * when fit type of free area is NE_FIT_TYPE. It guarantees that + * a CPU that does an allocation is preloaded. + * + * We do it in non-atomic context, thus it allows us to use more + * permissive allocation masks to be more stable under low memory + * condition and high memory pressure. + */ + if (!this_cpu_read(ne_fit_preload_node)) + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(lock); + + if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) + kmem_cache_free(vmap_area_cachep, va); +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va, *pva; + struct vmap_area *va; unsigned long addr; int purged = 0; int ret; @@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - /* - * Preload this CPU with one extra vmap_area object. It is used - * when fit type of free area is NE_FIT_TYPE. Please note, it - * does not guarantee that an allocation occurs on a CPU that - * is preloaded, instead we minimize the case when it is not. - * It can happen because of cpu migration, because there is a - * race until the below spinlock is taken. - * - * The preload is done in non-atomic context, thus it allows us - * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. In rare case, - * if not preloaded, GFP_NOWAIT is used. - * - * Set "pva" to NULL here, because of "retry" path. - */ - pva = NULL; - - if (!this_cpu_read(ne_fit_preload_node)) - /* - * Even if it fails we do not really care about that. - * Just proceed as it is. If needed "overflow" path - * will refill the cache we allocate from. 
- */ - pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - - spin_lock(&free_vmap_area_lock); - - if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) - kmem_cache_free(vmap_area_cachep, pva); + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); - if (unlikely(addr == vend)) goto overflow; @@ -1231,7 +1503,6 @@ retry: va->va_end = addr + size; va->vm = NULL; - spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1312,7 +1583,7 @@ static unsigned long lazy_max_pages(void) static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* - * Serialize vmap purging. There is no actual criticial section protected + * Serialize vmap purging. There is no actual critical section protected * by this look, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. 
*/ @@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); + vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size) offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); - unmap_kernel_range_noflush(addr, size); + vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); @@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { spin_lock(&vb->lock); - if (vb->dirty) { + if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; @@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { + if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, + pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return vm->page_order; +#else + return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + vm->page_order = order; +#else + BUG_ON(order != 0); +#endif +} + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -2023,23 +2314,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } 
-/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ -void unmap_kernel_range(unsigned long addr, unsigned long size) -{ - unsigned long end = addr + size; - - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, size); - flush_tlb_kernel_range(addr, end); -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2070,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) } static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long align, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, const void *caller) + unsigned long align, unsigned long shift, unsigned long flags, + unsigned long start, unsigned long end, int node, + gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); - size = PAGE_ALIGN(size); + size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; @@ -2110,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, caller); + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, + NUMA_NO_NODE, GFP_KERNEL, caller); } /** @@ -2127,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, 
__builtin_return_address(0)); } @@ -2135,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } @@ -2199,6 +2476,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2208,6 +2486,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; @@ -2232,11 +2511,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. 
*/ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2277,13 +2559,14 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { + unsigned int page_order = vm_area_page_order(area); int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2348,7 +2631,7 @@ static void __vfree(const void *addr) * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea). + * conventions for vfree() arch-dependent would be a really bad idea). 
*/ void vfree(const void *addr) { @@ -2402,6 +2685,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; + unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); @@ -2414,8 +2698,9 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), - pages) < 0) { + addr = (unsigned long)area->addr; + if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), + pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } @@ -2474,15 +2759,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); unsigned long array_size; - unsigned int i; + unsigned int nr_small_pages = size >> PAGE_SHIFT; + unsigned int page_order; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2497,42 +2786,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!pages) { free_vm_area(area); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page array size %lu allocation failed", + nr_small_pages * PAGE_SIZE, array_size); return NULL; } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - for (i = 0; i < area->nr_pages; i++) { - struct page *page; + page_order = vm_area_page_order(area); - if 
(node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { + struct page *page; + int p; + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "failed to map pages", + area->nr_pages * PAGE_SIZE); goto fail; + } return area->addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); __vfree(area->addr); return NULL; } @@ -2563,19 +2870,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages()) - goto fail; + if (WARN_ON_ONCE(!size)) + return NULL; - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | - vm_flags, 
start, end, node, gfp_mask, caller); - if (!area) + if ((size >> PAGE_SHIFT) > totalram_pages()) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "exceeds total pages", real_size); + return NULL; + } + + if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && + arch_vmap_pmd_supported(prot)) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | + VM_UNINITIALIZED | vm_flags, start, end, node, + gfp_mask, caller); + if (!area) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "vm_struct allocation failed", real_size); goto fail; + } - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2584,13 +2926,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); + size = PAGE_ALIGN(size); kmemleak_vmalloc(area, size, gfp_mask); return addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure: %lu bytes", real_size); + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + return NULL; } @@ -2655,6 +3003,23 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** + * vmalloc_no_huge - allocate virtually contiguous memory using small pages + * @size: allocation size + * + * Allocate enough non-huge pages to cover @size from the page 
level + * allocator and map them into contiguous kernel virtual space. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_no_huge(unsigned long size) +{ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_no_huge); + +/** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * @@ -2739,7 +3104,7 @@ EXPORT_SYMBOL(vzalloc_node); * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** @@ -2797,15 +3162,12 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) /* * To do safe access to this _mapped_ area, we need * lock. But adding lock here means that we need to add - * overhead of vmalloc()/vfree() calles for this _debug_ + * overhead of vmalloc()/vfree() calls for this _debug_ * interface, rarely used. Instead of that, we'll use * kmap() and get small overhead in this access function. */ if (p) { - /* - * we can expect USER0 is not used (see vread/vwrite's - * function description) - */ + /* We can expect USER0 is not used -- see vread() */ void *map = kmap_atomic(p); memcpy(buf, map + offset, length); kunmap_atomic(map); @@ -2820,43 +3182,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } -static int aligned_vwrite(char *buf, char *addr, unsigned long count) -{ - struct page *p; - int copied = 0; - - while (count) { - unsigned long offset, length; - - offset = offset_in_page(addr); - length = PAGE_SIZE - offset; - if (length > count) - length = count; - p = vmalloc_to_page(addr); - /* - * To do safe access to this _mapped_ area, we need - * lock. 
But adding lock here means that we need to add - * overhead of vmalloc()/vfree() calles for this _debug_ - * interface, rarely used. Instead of that, we'll use - * kmap() and get small overhead in this access function. - */ - if (p) { - /* - * we can expect USER0 is not used (see vread/vwrite's - * function description) - */ - void *map = kmap_atomic(p); - memcpy(map + offset, buf, length); - kunmap_atomic(map); - } - addr += length; - buf += length; - copied += length; - count -= length; - } - return copied; -} - /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -2875,7 +3200,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any information, as /dev/kmem. + * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't @@ -2894,7 +3219,10 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { + va = __find_vmap_area((unsigned long)addr); + if (!va) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; @@ -2937,80 +3265,6 @@ finished: } /** - * vwrite() - write vmalloc area in a safe way. - * @buf: buffer for source data - * @addr: vm address. - * @count: number of bytes to be read. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from a buffer to the given addr. If specified range of - * [addr...addr+count) includes some valid address, data is copied from - * proper area of @buf. If there are memory holes, no copy to hole. - * IOREMAP area is treated as memory hole and no copy is done. 
- * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vwrite() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any information, as /dev/kmem. - * - * Return: number of bytes for which addr and buf should be - * increased (same number as @count) or %0 if [addr...addr+count) - * doesn't include any intersection with valid vmalloc area - */ -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vmap_area *va; - struct vm_struct *vm; - char *vaddr; - unsigned long n, buflen; - int copied = 0; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - buflen = count; - - spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { - if (!count) - break; - - if (!va->vm) - continue; - - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + get_vm_area_size(vm) - addr; - if (n > count) - n = count; - if (!(vm->flags & VM_IOREMAP)) { - aligned_vwrite(buf, addr, n); - copied++; - } - buf += n; - addr += n; - count -= n; - } -finished: - spin_unlock(&vmap_area_lock); - if (!copied) - return 0; - return buflen; -} - -/** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at @@ -3072,7 +3326,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, return 0; } -EXPORT_SYMBOL(remap_vmalloc_range_partial); /** * remap_vmalloc_range - map vmalloc pages to userspace @@ -3450,6 +3703,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void 
*object) { struct vm_struct *vm; @@ -3462,6 +3716,7 @@ bool vmalloc_dump_obj(void *object) vm->nr_pages, (unsigned long)vm->addr, vm->caller); return true; } +#endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index 562e87cbd7a1..5199b9696bab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG -/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) +static int shrinker_nr_max; + +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} + +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int map_size, int defer_size, + int old_map_size, int old_defer_size) +{ + struct shrinker_info *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + int size = map_size + defer_size; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = shrinker_info_protected(memcg, nid); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc_node(sizeof(*new) + 
size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); + + rcu_assign_pointer(pn->shrinker_info, new); + kvfree_rcu(old, rcu); + } + + return 0; +} + +void free_shrinker_info(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct shrinker_info *info; + int nid; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); + } +} + +int alloc_shrinker_info(struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + int nid, size, ret = 0; + int map_size, defer_size = 0; + + down_write(&shrinker_rwsem); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; + for_each_node(nid) { + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); + ret = -ENOMEM; + break; + } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); + } + up_write(&shrinker_rwsem); + + return ret; +} + +static inline bool need_expand(int nr_max) +{ + return round_up(nr_max, BITS_PER_LONG) > + round_up(shrinker_nr_max, BITS_PER_LONG); +} + +static int expand_shrinker_info(int new_id) +{ + int ret = 0; + int new_nr_max = new_id + 1; + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; + struct mem_cgroup *memcg; + + if 
(!need_expand(new_nr_max)) + goto out; + + if (!root_mem_cgroup) + goto out; + + lockdep_assert_held(&shrinker_rwsem); + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto out; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +out: + if (!ret) + shrinker_nr_max = new_nr_max; + + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct shrinker_info *info; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, info->map); + rcu_read_unlock(); + } +} static DEFINE_IDR(shrinker_idr); -static int shrinker_nr_max; static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM; + if (mem_cgroup_disabled()) + return -ENOSYS; + down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { - if (memcg_expand_shrinker_maps(id)) { + if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } - - shrinker_nr_max = id + 1; } shrinker->id = id; ret = 0; @@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); +} + +static long 
xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < shrinker_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc) #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { - return 0; + return -ENOSYS; } static void unregister_memcg_shrinker(struct shrinker *shrinker) { } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc) } #endif +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if 
(!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, */ int prealloc_shrinker(struct shrinker *shrinker) { - unsigned int size = sizeof(*shrinker->nr_deferred); + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - if (prealloc_memcg_shrinker(shrinker)) - goto free_deferred; - } - return 0; - -free_deferred: - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) - return; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + return; + } kfree(shrinker->nr_deferred); 
shrinker->nr_deferred = NULL; @@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); } @@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long freeable; long nr; long new_nr; - int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. 
*/ - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + nr = xchg_nr_deferred(shrinker, shrinkctl); - total_scan = nr; if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; @@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } + total_scan = nr >> priority; total_scan += delta; - if (total_scan < 0) { - pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", - shrinker->scan_objects, total_scan); - total_scan = freeable; - next_deferred = nr; - } else - next_deferred = total_scan; - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * freeable. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < freeable / 4) - total_scan = min(total_scan, freeable / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > freeable * 2) - total_scan = freeable * 2; + total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); @@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, cond_resched(); } - if (next_deferred >= scanned) - next_deferred -= scanned; - else - next_deferred = 0; + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. 
+ */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + /* * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. + * manner that handles concurrent updates. */ - if (next_deferred > 0) - new_nr = atomic_long_add_return(next_deferred, - &shrinker->nr_deferred[nid]); - else - new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } @@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; unsigned long ret, freed = 0; int i; @@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, - true); - if (unlikely(!map)) + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) goto unlock; - for_each_set_bit(i, map->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) - clear_bit(i, map->map); + clear_bit(i, info->map); continue; } @@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, ret = 
do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { - clear_bit(i, map->map); + clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in @@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in - * memcg_set_shrinker_bit(): + * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() @@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; else - memcg_set_shrinker_bit(memcg, nid, i); + set_shrinker_bit(memcg, nid, i); } freed += ret; @@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_lru(page) && !PageDirty(page) && - !__PageMovable(page) && !PageUnevictable(page)) { + if (!PageHuge(page) && page_is_file_lru(page) && + !PageDirty(page) && !__PageMovable(page) && + !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -3862,7 +4059,7 @@ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -4086,14 +4283,6 @@ module_init(kswapd_init) int node_reclaim_mode __read_mostly; /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. 
- */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ - -/* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. diff --git a/mm/vmstat.c b/mm/vmstat.c index 74b2c374b86c..cccee36b289c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu) /* * this is only called if !populated_zone(zone), which implies no other users of - * pset->vm_stat_diff[] exsist. + * pset->vm_stat_diff[] exist. */ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { @@ -1313,6 +1313,10 @@ const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", +#endif "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -1365,6 +1369,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_X86 + "direct_map_level2_splits", + "direct_map_level3_splits", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1854,25 +1862,34 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; + } val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", __func__, zone_stat_name(i), val); - err = -EINVAL; } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_numa_stat[i]); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. 
+ */ + switch (i) { + case NR_WRITEBACK: + continue; + } + val = atomic_long_read(&vm_node_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, numa_stat_name(i), val); - err = -EINVAL; + __func__, node_stat_name(i), val); } } -#endif - if (err) - return err; if (write) *ppos += *lenp; else diff --git a/mm/workingset.c b/mm/workingset.c index cd39902c1062..b7cdeca5a76d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); diff --git a/mm/z3fold.c b/mm/z3fold.c index 9d889ad2bb86..7fe7adaaad01 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) { if (pool->inode) iput(pool->inode); - } +} /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, diff --git a/mm/zpool.c b/mm/zpool.c index 5ed71207ced7..6d9ed48141e5 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages, * This may hold locks, disable interrupts, and/or preemption, * and the zpool_unmap_handle() must be called to undo those * actions. The code that uses the mapped handle should complete - * its operatons on the mapped handle memory quickly and unmap + * its operations on the mapped handle memory quickly and unmap * as soon as possible. As the implementation may use per-cpu * data, multiple handles should not be mapped concurrently on * any cpu. 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 30c358b72025..19b563bc6c48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -61,7 +61,7 @@ #define ZSPAGE_MAGIC 0x58 /* - * This must be power of 2 and greater than of equal to sizeof(link_free). + * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values. @@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage, * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the - * size class which has chunk size big enough to hold the give size. + * size class which has chunk size big enough to hold the given size. */ static int get_size_class_index(int size) { @@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc - * @mm: maping mode to use + * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. 
When done with the object, it must be unmapped using @@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); @@ -2035,8 +2034,7 @@ unpin_objects: head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } } diff --git a/mm/zswap.c b/mm/zswap.c index 578d9f256920..20763267a219 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); - strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { |